Recipe 252237 revision 4 « ActiveState Code

#$Id: $

from sets import Set
import re

""" 
From the SWI-Prolog manual:
dwim_match(+Atom1, +Atom2)

       Succeeds if Atom1 matches Atom2 in `Do What I Mean' sense. Both
       Atom1 and Atom2 may also be integers or floats. The two atoms
       match if:

        o They are identical
        o They differ by one character (spy == spu)
        o One character is inserted/deleted (debug == deug)

        o Two characters are transposed (trace == tarce)

        o `Sub-words' are glued differently (existsfile == existsFile
        == exists_file)

        o Two adjacent sub words are transposed (existsFile == fileExists)

Thanks for M.L. Hetland for writing a Levenshtein Distance function in Py.
Thanks to Bijan Parsia for pointing me to SWI-Prolog's DWIM algo.
"""

def dwim_match(s1, s2, degree=1): #passing a degree arg sounds nice, but is
                                  #probably dumb... the rest of the code will
                                  #break if degree != 1... needs to be reworked
    #the easy, obvious case
    if s1 == s2:
        return True

    #covers these cases:
    # - one character of diff
    # - one character inserted/deleted
    if ld(s1, s2) == degree:
        return True

    #transposition is trickier since it's ld == 2; so, maybe:
    if ld(s1, s2) == 2:
        if len(s1) == len(s2):
            return True #this fails on "pat" and "atp"

    #the two subword cases: diff gluings; transp'd adjacents
    w1 = split_words(s1)
    w2 = split_words(s2)

    if w1 and w2: #diff gluings
        if len(w1) == len(w2):
            if Set(map(lambda s: s.lower(), w1)) == Set(map(lambda s: s.lower(), w2)):
                return True #this may cover both subword cases!?
                            #for now, let's say it does....
    #give up
    return False

def split_words(s, other=False):
    # we consider 4 word separator cases:
    #  "_", "-", "+", camelCase; short of running through a dictionary, I don't
    #  know how to do the no separator case: foobar...
    if "_" in s: sep = "_"
    if "-" in s: sep = "-"
    if "+" in s: sep = "+"
    if other and other in s:
        sep = other
    try:
        if sep:
            return s.split(sep)
    except UnboundLocalError:
        return case_splitter(s)

def case_splitter(s):
    pattern = re.compile(r"""([A-Z][a-z]*)""")
    def nullp(str):
        if str != "": return True
        else: return False
    return filter(nullp, pattern.split(s))

def ld(a, b): #stolen from m.l. hetland
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n,m)) space
        a,b = b,a
        n,m = m,n
    current = xrange(n+1)
    for i in xrange(1,m+1):
        previous, current = current, [i]+[0] * m
        for j in xrange(1, n+1):
            add, delete = previous[j] + 1, current[j-1] + 1
            change = previous[j-1]
            if a[j-1] != b[i-1]:
                change +=1
            current[j] = min(add, delete, change)
    return current[n]

if __name__ == "__main__":


    s1 = s2 = "foobar"

    print "Testing: %s, %s" % (s1, s2)
    print dwim_match(s1, s2)

    s1 = "spy"; s2 = "spu"
    print "Testing: %s, %s" % (s1, s2)
    print dwim_match(s1, s2)
    
    s1 = "debug"; s2 = "deug"
    print "Testing: %s, %s" % (s1, s2)
    print dwim_match(s1, s2)
    
    s1 = "file_exists"; s2 = "file-exists"

    print "Testing: %s, %s" % (s1, s2)
    print dwim_match(s1, s2)
    
    s1 = "file+exists"; s2 = "file-exists"

    print "Testing: %s, %s" % (s1, s2)
    print dwim_match(s1, s2)
    
    s1 = "Bartles"; s2 = "bartles"
    
    print "Testing: %s,  %s" % (s1, s2)
    print dwim_match(s1, s2)
    
    s1 = "fileExists"; s2 = "existsFile"

    print "Testing: %s, %s" % (s1, s2)
    print dwim_match(s1, s2)
    
    s1 = "bartles"; s2 = "james"

    print "Testing: %s, %s" % (s1, s2)
    print dwim_match(s1, s2)
Recipe 252237 revision 4

History

Accounts

Code Recipes

Feedback & Information

ActiveState