Welcome, guest | Sign In | My Account | Store | Cart
# WARNING: Earlier revisions did not correctly dam the greed of the star: they let
# "[^b]*b*" in xmatch("aaba", "a[^b]*b*aba") eat up the second and third characters,
# returning a mismatch. The greed of "?" also exists and was not dammed at all.

# You can edit these constants
# To avoid backslash inflation for example or to use {...} for groups

ANYCHAR = '.'        # wildcard: matches any single character
ESCAPE = '\\'        # escape prefix
REGIONA = '['        # character-class open
REGIONO = ']'        # character-class close
RANGE = '-'          # range separator inside a class
COMPLEMENT = '^'     # class negation marker
GROUPA = '('         # group open
GROUPO = ')'         # group close
ONEoZERO = '?'       # quantifier: zero or one
ONEoMORE = '+'       # quantifier: one or more
ZEROoMORE = '*'      # quantifier: zero or more
ALTERNATIVE = '|'    # alternation separator
EXTENSION = '(?'     # prefix of the extended group forms (?: (?= (?!

def match_region(ch, pattern):
    """Test ch against a character-class token such as "[abc]", "[a-z]" or "[^...]".

    pattern is the raw token including the brackets; a trailing quantifier
    (e.g. "[abc]*") may follow the closing bracket and must be ignored.
    Returns True when ch belongs to the class, with the sense inverted for
    a COMPLEMENT class.
    """
    if pattern[1] == COMPLEMENT:
        booly = False; booln = True; i = 2   # inverted answers for [^...]
    else:
        booly = True; booln = False; i = 1
    # Scan members up to (not including) the closing bracket.  The old loop
    # ran to the end of the token, so REGIONO itself and a trailing
    # quantifier character were wrongly accepted as class members.
    while i < len(pattern) and pattern[i] != REGIONO:
        if pattern[i] == ESCAPE:
            if pattern[i+1] == ch: return booly
            else: i += 2
        elif i < len(pattern)-2 and pattern[i+1] == RANGE:
            if pattern[i] <= ch <= pattern[i+2]: return booly
            else: i += 3   # skip the whole low-RANGE-high triple (was i+=2)
        elif pattern[i] == ch: return booly
        else: i += 1
    return booln

def match_simple_token(ch, token):
    """Decide whether the single character ch satisfies one non-group token."""
    head = token[0]
    if head == ESCAPE:
        follower = token[1]
        if follower == 's':                      # \s: any whitespace character
            return ch in ' \t\r\n\f\v'
        if follower == 'S':                      # \S: any non-whitespace character
            return ch not in ' \t\r\n\f\v'
        return ch == follower                    # escaped literal
    if head == REGIONA:
        return match_region(ch, token)           # [...] character class
    if head == ANYCHAR:
        return True                              # wildcard matches anything
    return ch == head                            # plain literal

TokenListCache = {}  # essential with nesting, otherwise only here for optimization
xGroups = []  # captured group texts of the most recent (nested) match, read by back references

def strip_groupattris(s):
    """Strip a group token down to its inner pattern.

    Removes the opening GROUPA (including an extension prefix such as
    "(?:", "(?=" or "(?!"), the closing GROUPO and, when present, the one
    trailing quantifier character that parse_pattern attaches.
    """
    for c in (':', '=', '!'):
        if s.startswith(EXTENSION + c):
            # use the constants' lengths instead of the hard-coded 3,
            # so editing EXTENSION (as the header invites) keeps working
            s = s[len(EXTENSION) + 1:]
            break
    else:
        s = s[len(GROUPA):]
    if not s.endswith(GROUPO):       # a quantifier follows the closing GROUPO
        return s[:-2]
    return s[:-1]

def parse_pattern(pattern, nested):
    """Split pattern into a token list and register it in TokenListCache.

    Tokens are:
    1. parenthesised sub-patterns (parsed recursively when nested is true)
    2. regions such as "[a-z]"
    3. ESCAPE together with the escaped character
    4. ANYCHAR periods
    5. plain characters
    each with a trailing quantifier attached when one follows; "x+" is
    rewritten as the two tokens "x", "x*".
    """
    tokens = []; i = 0
    while i < len(pattern):
        c = pattern[i]
        if c == REGIONA:
            k = pattern.find(REGIONO, i)
            if k == -1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
            while pattern[k-1] == ESCAPE:   # that closing bracket was escaped
                k = pattern.find(REGIONO, k+1)   # was a hard-coded ']'
                if k == -1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
            tokens.append(pattern[i:k+1])
            i = k+1
        elif c == ANYCHAR:
            tokens.append(ANYCHAR); i += 1
        elif c == ESCAPE:
            if i < len(pattern)-1: tokens.append(pattern[i:i+2]); i += 2
            else: raise ValueError('Trailing '+ESCAPE)
        elif nested and c == GROUPA:
            # copy the whole (possibly nested) group verbatim, minding escapes
            resu = GROUPA; k = i+1; lv = 1
            while lv > 0:
                if k >= len(pattern):   # report unmatched "(" instead of IndexError
                    raise ValueError('Unmatched '+GROUPA+' in '+pattern)
                cc = pattern[k]
                if cc == ESCAPE: resu += cc; resu += pattern[k+1]; k += 2; continue
                if cc == GROUPA: lv += 1
                elif cc == GROUPO: lv -= 1
                resu += cc; k += 1
            i = k; tokens.append(resu)
        else:
            tokens.append(c); i += 1
        # token done, attach a quantifier; ONEoMORE becomes token + token-with-ZEROoMORE
        if i < len(pattern):
            if pattern[i] in (ONEoZERO, ZEROoMORE): tokens[-1] += pattern[i]; i += 1
            elif pattern[i] == ONEoMORE: tokens.append(tokens[-1]+ZEROoMORE); i += 1
        # recursive parse for group tokens, cached under their stripped core
        if nested:
            ctk = tokens[-1]
            if ctk[0] == GROUPA:
                kore = strip_groupattris(ctk)
                if kore not in TokenListCache:
                    TokenListCache[kore] = parse_pattern(kore, True)
    # register once, after parsing -- the old code re-checked on every iteration
    if pattern not in TokenListCache:
        TokenListCache[pattern] = tokens
    return tokens

def listsplit(larg):
    """Split a token list on ALTERNATIVE tokens.

    Returns a list of alternative token lists (a single element when the
    pattern has no alternation).
    """
    resu = []; ix = 0
    while ALTERNATIVE in larg[ix:]:
        # search from ix: without the start argument the FIRST separator was
        # found again and again, looping forever on two or more alternatives
        ixx = larg.index(ALTERNATIVE, ix)
        resu.append(larg[ix:ixx])
        ix = ixx+1
    resu.append(larg[ix:])
    return resu

def dam_greed(sarg, cmpi, tokens, next_abs_token, nested):
    """Stop a greedy quantifier early if the rest of the pattern matches here.

    next_abs_token indexes the first token that the remainder of the pattern
    must match.  Returns the end index of a full match of tokens[next_abs_token:]
    starting at cmpi, or 0 when the remainder does not match at this position.
    """
    # Guard: with a short, fully quantified token list (e.g. just ["a*"]) the
    # sentinel value 1 points past the end -- there is nothing left to dam,
    # and tokens[next_abs_token] would raise IndexError.
    if next_abs_token >= len(tokens): return 0
    if nested and match_token(sarg, cmpi, tokens[next_abs_token]) or\
    not nested and match_simple_token(sarg[cmpi], tokens[next_abs_token]):
        return try_index(sarg, cmpi, tokens[next_abs_token:], nested)
    return 0
                                
def match_token(sarg, cmpi, token):
    """True when token (group or simple) can start a match at sarg[cmpi]."""
    if token[0] != GROUPA:
        return match_simple_token(sarg[cmpi], token)
    # group token: any of its alternatives matching is enough
    alternatives = listsplit(TokenListCache[strip_groupattris(token)])
    return any(try_index(sarg, cmpi, alt, True) for alt in alternatives)

def try_index(sarg, start, tokens, nested):
    """Try to match the token list against sarg beginning at start.

    Returns the index just past the matched part, or 0 on mismatch.
    (NOTE: a zero-length match at position 0 is indistinguishable from a
    failure -- kept as-is because all callers rely on truthiness.)
    """
    cmpi = start; tix = 0; tkL = len(tokens)
    # next_abs_token: the first token the remainder MUST match; dam_greed
    # uses it to stop a greedy quantifier as soon as the rest would fit.
    next_abs_token = 0
    while tokens[next_abs_token].endswith((ZEROoMORE, ONEoZERO)):
        next_abs_token += 1
        if next_abs_token == tkL: next_abs_token = 1; break   # sentinel: all optional
    while tix < tkL:
        if cmpi == len(sarg):
            # input exhausted: match iff every remaining token is optional.
            # Escaped quantifier tokens ("\?", "\*") are literals, not
            # optional -- the old check judged them by their last character.
            for s in tokens[tix:]:
                if s in (ESCAPE+ONEoZERO, ESCAPE+ZEROoMORE) or \
                   s[-1] not in (ONEoZERO, ZEROoMORE):
                    return 0
            return cmpi
        ctk = tokens[tix]
        if ctk in (ESCAPE+ZEROoMORE, ESCAPE+ONEoMORE, ESCAPE+ONEoZERO): qua = 'n'
        else: qua = ctk[-1]
        if qua in (ZEROoMORE, ONEoZERO):
            ctk = ctk[:-1]
            if tix > next_abs_token and next_abs_token != 1:
                next_abs_token = tix
                while tokens[next_abs_token].endswith((ZEROoMORE, ONEoZERO)):
                    next_abs_token += 1
                    if next_abs_token == tkL: next_abs_token = 1; break
        if nested:
            if ctk[0] == GROUPA:
                subtokens = listsplit(TokenListCache[strip_groupattris(ctk)])
                for subtk in subtokens:
                    subcmpi = try_index(sarg, cmpi, subtk, True)
                    if subcmpi: break
                else:
                    # no alternative of the group matched here
                    if qua in (ZEROoMORE, ONEoZERO) or ctk.startswith(EXTENSION+'!'):
                        tix += 1; continue
                    else: return 0
                if ctk.startswith(EXTENSION+'='): tix += 1; continue   # lookahead consumes nothing
                elif ctk.startswith(EXTENSION+'!'): return 0           # negative lookahead hit
                if qua in (ZEROoMORE, ONEoZERO):
                    n = dam_greed(sarg, cmpi, tokens, next_abs_token, nested)
                    if n: return n
                if qua == ZEROoMORE:
                    # keep repeating the group, damming the greed before each
                    # repetition.  The old loop broke after the SECOND
                    # successful repetition, limiting (x)* to two iterations.
                    while subcmpi:
                        n = dam_greed(sarg, subcmpi, tokens, next_abs_token, nested)
                        if n: return n
                        more = try_index(sarg, subcmpi, subtk, True)
                        if not more or more == subcmpi: break   # no progress possible
                        subcmpi = more
                if not ctk.startswith(EXTENSION+':'): xGroups.append(sarg[cmpi:subcmpi])
                cmpi = subcmpi; tix += 1; continue
# any group token done, back reference:
            if ctk[0] == ESCAPE and '0' <= ctk[1] <= '9':
                i = int(ctk[1]); captured = xGroups[i]; lg = len(captured)
                if sarg[cmpi:cmpi+lg] == captured:
                    if qua == ZEROoMORE:
                        while sarg[cmpi:cmpi+lg] == captured:
                            n = dam_greed(sarg, cmpi, tokens, next_abs_token, nested)
                            if n: return n
                            cmpi += lg
                        tix += 1; continue
                    elif qua == ONEoZERO:
                        n = dam_greed(sarg, cmpi, tokens, next_abs_token, nested)
                        if n: return n
                    cmpi += lg; tix += 1; continue
                elif qua in (ZEROoMORE, ONEoZERO):
                    # optional back reference absent: skip it.  The old code
                    # fell through into the simple-token section, matching the
                    # digit as a literal and advancing tix twice.
                    tix += 1; continue
                else: return 0
# from here on match_simple_token is enough, tix points to a non-group token
        if qua == ONEoZERO:
            n = dam_greed(sarg, cmpi, tokens, next_abs_token, nested)
            if n: return n
            if match_simple_token(sarg[cmpi], ctk): cmpi += 1
        elif qua == ZEROoMORE:
            # elif, not if: the old chain let a "?" token fall into the final
            # else and wrongly REQUIRE a second occurrence ("ab" failed "a?b")
            while cmpi < len(sarg) and match_simple_token(sarg[cmpi], ctk):
                n = dam_greed(sarg, cmpi, tokens, next_abs_token, nested)
                if n: return n
                cmpi += 1
        else:
            if not match_simple_token(sarg[cmpi], ctk): return 0
            cmpi += 1
        tix += 1
    return cmpi

def xsearch(sarg, pattern, nested=False, start=0):
    """Find the first match of pattern in sarg at or after start.

    Returns (begin, end) -- sarg[begin:end] is the matched part -- or the
    empty tuple when there is no match.
    """
    try: tokens = TokenListCache[pattern]
    except KeyError: tokens = parse_pattern(pattern, nested)   # was a bare except
    global xGroups
    argix = start
    while argix < len(sarg):
        if nested: xGroups = []   # fresh captures for every start position
        n = try_index(sarg, argix, tokens, nested)
        if n: return (argix, n)
        argix += 1
    return ()
    
def xfinditer(sarg, pattern, nested=False, start=0):
    """Yield (begin, end) pairs for successive non-overlapping matches."""
    try: tokens = TokenListCache[pattern]
    except KeyError: tokens = parse_pattern(pattern, nested)   # was a bare except
    global xGroups
    while start < len(sarg):
        if nested: xGroups = []
        n = try_index(sarg, start, tokens, nested)
        if n:
            yield (start, n)
            # advance past the match; step one character on a zero-width
            # match so the generator cannot loop forever on the same spot
            start = n if n > start else start+1
        else:
            start += 1
    # PEP 479: generators must simply return; the old "raise StopIteration()"
    # surfaces as RuntimeError on Python 3.7+

def xmatch(sarg, pattern, nested=False):
    """Check whether sarg as a whole matches the pattern."""
    try: tokens = TokenListCache[pattern]
    except KeyError: tokens = parse_pattern(pattern, nested)   # was a bare except
    global xGroups
    if nested: xGroups = []
    n = try_index(sarg, 0, tokens, nested)
    return n == len(sarg)

def xsplit(sarg, pattern, nested=False):
    """Split sarg on every match of pattern.

    Always returns a list of strings; the text after the last match (the
    whole of sarg when there is no match) is the final element.
    """
    resu = []; residue = 0
    xpair = xsearch(sarg, pattern, nested=nested)
    while xpair:
        # take the text since the PREVIOUS match -- the old code appended
        # sarg[:begin], duplicating everything from the start each time
        resu.append(sarg[residue:xpair[0]])
        residue = xpair[1]
        xpair = xsearch(sarg, pattern, nested, residue)
    # the trailing piece must be wrapped in a list: list + str raised TypeError
    return resu + [sarg[residue:]]
    
def xreplace(sarg, pattern, subst, nested=False):
    """Replace every match of pattern in sarg by subst.

    With nested patterns, \\0 .. \\9 inside subst are expanded from the
    captures left behind by the last search before substituting.
    """
    spans = []
    hit = xsearch(sarg, pattern, nested)
    while hit:
        spans.append(hit)
        hit = xsearch(sarg, pattern, nested, hit[1])
    if nested:
        for gi in range(10):
            try:
                subst = subst.replace(ESCAPE+str(gi), xGroups[gi])
            except IndexError:
                break   # no more captured groups
    out = sarg
    # substitute back to front so earlier spans keep their indices valid
    for begin, end in reversed(spans):
        out = out[:begin] + subst + out[end:]
    return out

def xfuncreplace(sarg, pattern, f, nested=False):
    """Replace every match of pattern in sarg by f(matched_text)."""
    spans = []
    hit = xsearch(sarg, pattern, nested)
    while hit:
        spans.append(hit)
        hit = xsearch(sarg, pattern, nested, hit[1])
    out = sarg
    # substitute back to front so earlier spans keep their indices valid
    for begin, end in reversed(spans):
        out = out[:begin] + f(out[begin:end]) + out[end:]
    return out

Diff to Previous Revision

--- revision 25 2010-06-10 21:51:07
+++ revision 26 2010-06-10 22:11:34
@@ -108,6 +108,12 @@
     resu.append(larg[ix:])
     return resu    
 
+def dam_greed(sarg, cmpi, tokens, next_abs_token, nested):
+    if nested and match_token(sarg, cmpi, tokens[next_abs_token]) or\
+    not nested and match_simple_token(sarg[cmpi], tokens[next_abs_token]):
+        return try_index(sarg, cmpi, tokens[next_abs_token:], nested)
+    return 0   
+                                
 def match_token(sarg, cmpi, token):
     if token[0] != GROUPA: return match_simple_token(sarg[cmpi], token)
     subtokens = listsplit(TokenListCache[strip_groupattris(token)])
@@ -115,21 +121,12 @@
         if try_index(sarg, cmpi, subtk, True): return True
     return False
 
-def dam_greed(sarg, cmpi, tokens, tix, nested):
-    tkL = len(tokens)
-    if not tix<tkL-1: return 0
-    i=1
-    while tokens[tix+i].endswith((ONEoZERO, ZEROoMORE)): 
-        i+=1
-        if i==tkL: i=1; break 
-        # we will simply test the next (quantified) token then
-    if nested and match_token(sarg, cmpi, tokens[tix+i]) or\
-    not nested and match_simple_token(sarg[cmpi], tokens[tix+i]):
-        return try_index(sarg, cmpi, tokens[tix+i:], nested)
-    return 0   
-
 def try_index(sarg, start, tokens, nested):
     cmpi=start; tix=0; tkL=len(tokens)
+    next_abs_token = 0
+    while tokens[next_abs_token].endswith((ZEROoMORE, ONEoZERO)):
+        next_abs_token += 1
+        if next_abs_token==tkL: next_abs_token=1; break    
     while tix<tkL:
         if cmpi==len(sarg):
             for s in tokens[tix:]:
@@ -138,7 +135,13 @@
         ctk = tokens[tix]
         if ctk in (ESCAPE+ZEROoMORE, ESCAPE+ONEoMORE, ESCAPE+ONEoZERO): qua='n'
         else: qua=ctk[-1]
-        if qua in (ZEROoMORE, ONEoZERO): ctk=ctk[:-1]
+        if qua in (ZEROoMORE, ONEoZERO):
+            ctk=ctk[:-1]
+            if tix>next_abs_token and next_abs_token !=1:
+                next_abs_token=tix
+                while tokens[next_abs_token].endswith((ZEROoMORE, ONEoZERO)):
+                    next_abs_token += 1
+                    if next_abs_token==tkL: next_abs_token=1; break
         if nested:
             if ctk[0] == GROUPA:
                 subtokens = listsplit(TokenListCache[strip_groupattris(ctk)])
@@ -152,11 +155,11 @@
                 if ctk.startswith(EXTENSION+'='): tix+=1; continue
                 elif ctk.startswith(EXTENSION+'!'): return 0
                 if qua in (ZEROoMORE, ONEoZERO):
-                    n = dam_greed(sarg, cmpi, tokens, tix, nested)
+                    n = dam_greed(sarg, cmpi, tokens, next_abs_token, nested)
                     if n: return n            
                 if qua == ZEROoMORE:
                     while subcmpi:
-                        n = dam_greed(sarg, subcmpi, tokens, tix, nested)
+                        n = dam_greed(sarg, subcmpi, tokens, next_abs_token, nested)
                         if n: return n       
                         subcmpi = try_index(sarg, subcmpi, subtk, True)
                         if subcmpi: break
@@ -168,24 +171,24 @@
                 if sarg[cmpi:cmpi+lg]==captured:
                     if qua==ZEROoMORE:
                         while sarg[cmpi:cmpi+lg]==captured: 
-                            n = dam_greed(sarg, cmpi, tokens, tix, nested)
+                            n = dam_greed(sarg, cmpi, tokens, next_abs_token, nested)
                             if n: return n
                             cmpi+=lg
                         tix+=1; continue                           
                     elif qua == ONEoZERO: 
-                        n = dam_greed(sarg, cmpi, tokens, tix, nested)
+                        n = dam_greed(sarg, cmpi, tokens, next_abs_token, nested)
                         if n: return n                          
                     cmpi+=lg; tix+=1; continue
                 elif qua in (ZEROoMORE, ONEoZERO): tix+=1
                 else: return 0
 # from here on match_simple_token is enough, tix points to a non-group token
         if qua==ONEoZERO:
-            n = dam_greed(sarg, cmpi, tokens, tix, nested)
+            n = dam_greed(sarg, cmpi, tokens, next_abs_token, nested)
             if n: return n  
             if match_simple_token(sarg[cmpi], ctk): cmpi+=1
         if qua==ZEROoMORE:
             while cmpi<len(sarg) and match_simple_token(sarg[cmpi], ctk): 
-                n = dam_greed(sarg, cmpi, tokens, tix, nested)
+                n = dam_greed(sarg, cmpi, tokens, next_abs_token, nested)
                 if n: return n  
                 cmpi+=1
         else:

History