# (navigation chrome from the scraped revision page removed)
# Metacharacters of the pattern dialect (a compact regex subset).
ANYCHAR = '.'; ESCAPE = '\\'                                 # wildcard; escape prefix
REGIONA = '['; REGIONO = ']'; RANGE = '-'; COMPLEMENT = '^'  # character-class (region) syntax
GROUPA = '(' ; GROUPO = ')'; ALTERNATIVE = '|'               # grouping and alternation
PERHAPS = '?' ; STAR = '*'; JUST_ONE_and_STAR = '+'          # quantifiers: 0/1, 0+, 1+
EXTENSION = '(?'; SKIPSTORE = '?:'                           # group-extension prefix; non-capturing marker

def match_region(ch, pattern):
    """ region: a list of comparation chars and ranges in [] or [^]

    pattern is the complete region token including the surrounding
    brackets, e.g. '[a-z0]' or '[^abc]'.  Returns True when ch is
    matched, with the sense inverted for a leading COMPLEMENT '^'.

    Bug fixed: the scan used to run up to and including the closing
    ']', so e.g. match_region(']', '[abc]') wrongly reported a match;
    the closing bracket is now excluded from comparison, and a RANGE
    can no longer use the closing bracket as its end character.
    """
    if pattern[1]==COMPLEMENT: booly=False; booln=True; i=2
    else: booly=True; booln=False; i=1
    last = len(pattern)-1            # index of the closing ']', never compared
    while i < last:
        if pattern[i]==ESCAPE:
            # escaped character: compare literally
            if pattern[i+1]==ch: return booly
            else: i+=2
        elif i < last-2 and pattern[i+1]==RANGE:
            # a range like 'a-z'; its end char must lie inside the brackets
            if pattern[i]<=ch<=pattern[i+2]: return booly
            else: i+=3               # skip the whole range
        elif pattern[i]==ch: return booly
        else: i+=1
    return booln

def match_simple_token(ch, token):
    """Decide whether the single character ch satisfies one simple token.

    A simple token is an escape pair ('\\s', '\\S' or a literal escape),
    a bracket region, the any-char period, or a plain literal character.
    """
    whitespace = (' ', '\t', '\r', '\n', '\f', '\v')
    head = token[0]
    if head == ESCAPE:
        tail = token[1]
        if tail == 's':
            return ch in whitespace
        if tail == 'S':
            return ch not in whitespace
        return ch == tail
    if head == REGIONA:
        return match_region(ch, token)
    if head == ANYCHAR:
        return True
    return ch == head

# Memo of parsed patterns: pattern/group string -> token list (for group
# tokens: a list of alternative token lists).
TokenListCache = {}  # essential with nesting, otherwise only here for optimization
# Captures of the current match attempt; reset by the x* entry points when nested=True.
xGroups = []

def strip_groupattris(s):
    """Strip the group punctuation from a group token.

    Removes the opening '(' - together with a '?:' / '?=' / '?!'
    extension marker when present - and a closing ')', e.g.
    '(?:ab)' -> 'ab', '(ab)' -> 'ab'.

    Bug fixed: the final test was inverted ("if not s.endswith(')')
    return s[:-2]") and chopped two characters off a string that did
    not end with ')'; now only a trailing ')' is removed.
    """
    for c in (':', '=', '!'):
        if s.startswith(EXTENSION+c): s=s[3:]; break
    else: s=s[1:]
    if s.endswith(GROUPO): return s[:-1]
    return s

def parse_pattern(pattern, nested):
    """ 
    tokens are: 
    1. patterns included in brackets (parsing is recursive)
    2. regions
    3. \\ together with the escaped character
    4. periods
    5. simple characters
    All together with eventually trailing quantifiers

    Returns a list of (token, quantifier) pairs with quantifier one of
    PERHAPS, STAR or None; 'x+' is emitted as ('x', None), ('x', STAR).
    Group tokens (only recognized when nested=True) are kept verbatim and
    their '|'-alternatives parsed recursively into TokenListCache.
    Results are memoized in TokenListCache keyed by the pattern string.
    """
    if pattern in TokenListCache.keys(): return TokenListCache[pattern]
    tokens=[]; i=0; pL = len(pattern)
    while i < pL: 
        c = pattern[i]
        if c==REGIONA:
            # a [...] region: find its closing ']', skipping escaped ones
            k = pattern.find(REGIONO, i)
            if k==-1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
            while pattern[k-1] == ESCAPE:
                k = pattern.find(REGIONO, k+1)
                if k==-1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
            tokens.append(pattern[i:k+1]); i=k+1
        elif c == ANYCHAR: tokens.append(ANYCHAR); i+=1
        elif c == ESCAPE:
            # keep the escape together with the escaped character as one token
            if i<pL-1: tokens.append(pattern[i:i+2]); i+=2
            else: raise ValueError('Trailing '+ESCAPE)
        elif nested and c==GROUPA:
            # collect a (...) group verbatim, tracking nesting level lv
            resu = GROUPA; k=i+1; lv=1
            while lv > 0:
                cc = pattern[k]
                if cc == ESCAPE: resu+=cc; resu+=pattern[k+1]; k+=2; continue
                if cc == GROUPA: lv+=1 
                elif cc == GROUPO: lv-=1
                resu+=cc; k+=1
            tokens.append(resu); i=k; kore=strip_groupattris(resu)
            if resu not in TokenListCache.keys():
                TokenListCache[resu] = []   
                # groups are parsed to lists of token lists, each an alternative from '|'
                if kore[0] != GROUPA:
                    for s in kore.split(ALTERNATIVE):
                        TokenListCache[resu].append(parse_pattern(s, True))
                else:
                    # group starts with a nested group: do not split on '|' here
                    TokenListCache[resu].append(parse_pattern(kore, True))
        else: tokens.append(c); i+=1
        # attach an eventual trailing quantifier to the token just scanned
        if i<pL:
            if pattern[i]==PERHAPS: tokens[-1]=(tokens[-1],PERHAPS); i+=1
            elif pattern[i]==STAR: tokens[-1]=(tokens[-1],STAR); i+=1
            elif pattern[i]==JUST_ONE_and_STAR: 
                # 'x+' is expanded to mandatory 'x' followed by 'x*'
                tokens.append((tokens[-1],STAR)); tokens[-2]=(tokens[-2],None); i+=1
            else: tokens[-1] = (tokens[-1],None)
        else: tokens[-1] = (tokens[-1],None) 
    TokenListCache[pattern]=tokens; return tokens

def probe_last_token(sarg, start, argtk, nested):
    """ 
    Optimization, not only by removed greed and look-ahead tests of try_index(),
    but also by several removed checks of tix in try_index()
    Can be read as an explanation of try_index() - it is an excerpt 

    argtk is a single (token, quantifier) pair - the last one of the
    pattern.  Returns the end index of the match in sarg, or 0 for a
    mismatch; with a quantifier, start itself can be returned (a match
    of zero occurrences).
    """
    compix=start; matchix=0; L=len(sarg)
    if compix==L:
        # nothing left of sarg: only an optional (quantified) token matches
        if not argtk[1]: return 0
        else: return L
    ctk, qua = argtk
    if nested and ctk[0] == GROUPA:
        # group token: try each '|'-alternative of the group
        for subtk in TokenListCache[ctk]:
            matchix = try_index(sarg, compix, subtk, True)
            if matchix: break
        if matchix:
            if ctk[1:3]!=SKIPSTORE: xGroups.append(sarg[compix:matchix])
            if qua==STAR:
                # being the last token, further repetitions can be taken greedily
                while matchix and matchix<L:  
                    for subtk in TokenListCache[ctk]:
                        evres = try_index(sarg, matchix, subtk, True)
                        if evres: matchix=evres; break
                    else: break
    elif nested and ctk[0]==ESCAPE and '0'<= ctk[1] <='9':
        # back reference \0..\9 to an earlier capture in xGroups
        i=int(ctk[1]); captured=xGroups[i]; lg=len(captured)
        if sarg[compix:compix+lg]==captured:
            matchix=compix+lg
            if qua==STAR: 
                while sarg[matchix:matchix+lg]==captured and matchix<L: matchix+=lg
    elif match_simple_token(sarg[compix], ctk):
        matchix=compix+1
        if qua==STAR:
            while match_simple_token(sarg[matchix], ctk) and matchix<L: matchix+=1
    # quantified tokens may match zero occurrences, hence at least compix
    if qua: return max(compix, matchix) 
    else: return matchix   

def try_index(sarg, start, tokens, nested):
    """Try to match the (token, quantifier) list against sarg at start.

    Returns the index just past the matched part, or 0 for a mismatch
    (0 doubles as False here, so a zero-length match starting at index
    0 is also reported as a mismatch).  Greed of STAR/PERHAPS is dammed
    by first probing whether the remaining tokens match without the
    quantified one.  With nested=True, group alternatives, '(?='/'(?!'
    extensions and back references are handled, appending captures to
    the module-level xGroups.
    """
    compix=start; tkL=len(tokens); L=len(sarg)
    for tix in range(tkL):
        if compix==L:
            # sarg exhausted: the remaining tokens must all be optional
            for pair in tokens[tix:]: 
                if not pair[1]: return 0
            else: return compix
        # the last token is handled by the simplified excerpt
        if tix==tkL-1: return probe_last_token(sarg, compix, tokens[-1], nested)
        ctk, qua = tokens[tix]
        if qua:
            # quantified token: first try skipping it (zero occurrences)
            n = try_index(sarg, compix, tokens[tix+1:], nested)
            if n: return n
        matchix=0
        if nested and ctk[0] == GROUPA:
            # group token: try each '|'-alternative
            for subtk in TokenListCache[ctk]:
                matchix = try_index(sarg, compix, subtk, True)
                if matchix: break
            if matchix:
                if ctk[1:3]!=SKIPSTORE: xGroups.append(sarg[compix:matchix])             
                # '(?=' look-ahead: matched but consumes nothing; '(?!' must not match
                if ctk.startswith(EXTENSION+'='): continue
                elif ctk.startswith(EXTENSION+'!'): return 0         
                if qua==STAR:
                    while matchix and matchix<L:
                        # stop repeating as soon as the rest matches from here
                        n = try_index(sarg, matchix, tokens[tix+1:], nested)
                        if n: return n
                        for subtk in TokenListCache[ctk]:
                            evres = try_index(sarg, matchix, subtk, True)
                            if evres: matchix=evres; break
                        else: return 0
            else:
                # group mismatch: fatal for '(?=', satisfied for '(?!'
                if ctk.startswith(EXTENSION+'='): return 0
                elif ctk.startswith(EXTENSION+'!'): continue         
        elif nested and ctk[0]==ESCAPE and '0'<= ctk[1] <='9':
            # back reference \0..\9 to a capture in xGroups
            i=int(ctk[1]); captured=xGroups[i]; lg=len(captured)
            if sarg[compix:compix+lg]==captured:
                matchix=compix+lg
                if qua==STAR:
                    while matchix<L:
                        n = try_index(sarg, matchix, tokens[tix+1:], True)
                        if n: return n
                        if sarg[matchix:matchix+lg]==captured: matchix+=lg
                        else: return 0
        elif match_simple_token(sarg[compix], ctk):
            matchix=compix+1
            if qua==STAR:
                while matchix<L:
                    # prefer letting the remaining tokens match (damming greed)
                    n = try_index(sarg, matchix, tokens[tix+1:], nested)
                    if n: return n
                    if match_simple_token(sarg[matchix], ctk): matchix+=1
                    else: return 0
        if not matchix: return 0  # with quantifiers the next token is already checked
        compix=matchix
    return compix

def xsearch(sarg, pattern, nested=False, start=0):
    """Find the first match of pattern in sarg at or after start.

    Returns a (begin, end) index pair on success, an empty tuple on
    failure.  With nested=True the capture list xGroups is reset for
    every attempted start position.
    """
    global xGroups
    tokens = parse_pattern(pattern, nested)
    for argi in range(start, len(sarg)):
        if nested:
            xGroups = []
        end = try_index(sarg, argi, tokens, nested)
        if end:
            return (argi, end)
    return ()
    
def xfinditer(sarg, pattern, nested=False, start=0):
    """Yield (begin, end) index pairs for successive matches of pattern.

    Bug fixed: the generator ended with ``raise StopIteration()``, which
    PEP 479 (Python 3.7+) converts into a RuntimeError escaping to the
    caller; a generator signals exhaustion by simply returning.
    """
    global xGroups
    tokens = parse_pattern(pattern, nested); n=0
    while start<len(sarg):
        if n: start=n          # resume right after the previous match
        if nested: xGroups=[]  # fresh captures per attempt
        n = try_index(sarg, start, tokens, nested)
        if n: yield (start, n)
        else: start+=1

def xmatch(sarg, pattern, nested=False):
    """ checks, whether sarg as the whole matches the pattern """
    global xGroups
    tokens = parse_pattern(pattern, nested)
    if nested:
        xGroups = []
    # a full match is one that consumes the complete argument
    return try_index(sarg, 0, tokens, nested) == len(sarg)

def xsplit(sarg, pattern, nested=False):
    """Split sarg on every match of pattern (like re.split without groups).

    Returns a list of the substrings between matches, including the
    leading and trailing pieces (which may be empty strings).

    Bugs fixed: each piece now starts at the end of the previous match
    (the old code always sliced ``sarg[:match_begin]`` from index 0),
    and the trailing piece is appended to the result list instead of
    being concatenated (``list + str`` raised TypeError).
    """
    resu = []; residue = 0
    xpair = xsearch(sarg, pattern, nested=nested)
    while xpair:
        resu.append(sarg[residue:xpair[0]])
        residue = xpair[1]
        xpair = xsearch(sarg, pattern, nested, residue)
    resu.append(sarg[residue:])
    return resu
    
def xreplace(sarg, pattern, subst, nested=False):
    """Replace every match of pattern in sarg by subst.

    With nested=True, back references \\0..\\9 in subst are expanded
    from the captures of the last match before substituting.
    """
    spans = []
    hit = xsearch(sarg, pattern, nested)
    while hit:
        spans.append(hit)
        hit = xsearch(sarg, pattern, nested, hit[1])
    if nested:
        for gi in range(10):
            try:
                subst = subst.replace(ESCAPE+str(gi), xGroups[gi])
            except IndexError:
                break
    # substitute right-to-left so earlier indices stay valid
    result = sarg
    for begin, end in reversed(spans):
        result = result[:begin] + subst + result[end:]
    return result

def xfuncreplace(sarg, pattern, f, nested=False):
    """Replace every match of pattern in sarg by f(matched_substring)."""
    spans = []
    hit = xsearch(sarg, pattern, nested)
    while hit:
        spans.append(hit)
        hit = xsearch(sarg, pattern, nested, hit[1])
    # substitute right-to-left so earlier indices stay valid
    out = sarg
    for begin, end in reversed(spans):
        out = out[:begin] + f(out[begin:end]) + out[end:]
    return out

Diff to Previous Revision

--- revision 35 2010-06-15 13:01:13
+++ revision 36 2010-06-16 13:20:54
@@ -1,7 +1,3 @@
-# WARNING: Former revisions were not correct in damming greed of the star, they didn't stop
-# "[^b]*b*" in xmatch("aaba", "a[^b]*b*aba") to eat up the second and third character 
-# returning mismatch. And also greed of ? is existing, which wasn't dammed at all.
-
 ANYCHAR = '.'; ESCAPE = '\\'
 REGIONA = '['; REGIONO = ']'; RANGE = '-'; COMPLEMENT = '^'
 GROUPA = '(' ; GROUPO = ')'; ALTERNATIVE = '|'
@@ -100,101 +96,91 @@
     """ 
     Optimization, not only by removed greed and look-ahead tests of try_index(),
     but also by several removed checks of tix in try_index()
-    Can be read as an explanation of try_index() - it is an excerpt
+    Can be read as an explanation of try_index() - it is an excerpt 
     """
-    cmpi=start; L=len(sarg)
-    if cmpi==L:
+    compix=start; matchix=0; L=len(sarg)
+    if compix==L:
         if not argtk[1]: return 0
         else: return L
     ctk, qua = argtk
-    if nested:
-        if ctk[0] == GROUPA:
-            for subtk in TokenListCache[ctk]:
-                subcmpi = try_index(sarg, cmpi, subtk, True)
-                if subcmpi: break
-            else: return 0
+    if nested and ctk[0] == GROUPA:
+        for subtk in TokenListCache[ctk]:
+            matchix = try_index(sarg, compix, subtk, True)
+            if matchix: break
+        if matchix:
+            if ctk[1:3]!=SKIPSTORE: xGroups.append(sarg[compix:matchix])
             if qua==STAR:
-                while subcmpi and subcmpi<L:  
+                while matchix and matchix<L:  
                     for subtk in TokenListCache[ctk]:
-                        evres = try_index(sarg, subcmpi, subtk, True)
-                        if evres: subcmpi=evres; break
+                        evres = try_index(sarg, matchix, subtk, True)
+                        if evres: matchix=evres; break
                     else: break
-            if ctk[1:3]!=SKIPSTORE: xGroups.append(sarg[cmpi:subcmpi])
-            return subcmpi         # any group token done, back reference:
-        elif ctk[0]==ESCAPE and '0'<= ctk[1] <='9':
-            i=int(ctk[1]); captured=xGroups[i]; lg=len(captured)
-            if sarg[cmpi:cmpi+lg]==captured:
-                cmpi+=lg
-                if qua==STAR: 
-                    while sarg[cmpi:cmpi+lg]==captured and cmpi<L: cmpi+=lg
-                return cmpi
-            elif not qua: return 0     # from here on tix points to a simple token:
-    if match_simple_token(sarg[cmpi], ctk):
-        cmpi+=1
+    elif nested and ctk[0]==ESCAPE and '0'<= ctk[1] <='9':
+        i=int(ctk[1]); captured=xGroups[i]; lg=len(captured)
+        if sarg[compix:compix+lg]==captured:
+            matchix=compix+lg
+            if qua==STAR: 
+                while sarg[matchix:matchix+lg]==captured and matchix<L: matchix+=lg
+    elif match_simple_token(sarg[compix], ctk):
+        matchix=compix+1
         if qua==STAR:
-            while match_simple_token(sarg[cmpi], ctk) and cmpi<L: cmpi+=1
-    elif not qua: return 0
-    return cmpi
+            while match_simple_token(sarg[matchix], ctk) and matchix<L: matchix+=1
+    if qua: return max(compix, matchix) 
+    else: return matchix   
 
 def try_index(sarg, start, tokens, nested):
-    cmpi=start; tkL=len(tokens); L=len(sarg)
+    compix=start; tkL=len(tokens); L=len(sarg)
     for tix in range(tkL):
-        if cmpi==L:
+        if compix==L:
             for pair in tokens[tix:]: 
                 if not pair[1]: return 0
-            else: return cmpi
-        if tix==tkL-1: return probe_last_token(sarg, cmpi, tokens[-1], nested)
+            else: return compix
+        if tix==tkL-1: return probe_last_token(sarg, compix, tokens[-1], nested)
         ctk, qua = tokens[tix]
         if qua:
-            n = try_index(sarg, cmpi, tokens[tix+1:], nested)
+            n = try_index(sarg, compix, tokens[tix+1:], nested)
             if n: return n
-            else: greedchecked = True
-            # nearly all STAR-loops return, some from recursion
-            # this is for an eventual first match of their token 
-            # and not matching tokens with PERHAPS
-        if nested:
-            if ctk[0] == GROUPA:
-                for subtk in TokenListCache[ctk]:
-                    subcmpi = try_index(sarg, cmpi, subtk, True)
-                    if subcmpi: break
-                else:
-                    if qua or ctk.startswith(EXTENSION+'!'): continue
-                    else: return 0
+        matchix=0
+        if nested and ctk[0] == GROUPA:
+            for subtk in TokenListCache[ctk]:
+                matchix = try_index(sarg, compix, subtk, True)
+                if matchix: break
+            if matchix:
+                if ctk[1:3]!=SKIPSTORE: xGroups.append(sarg[compix:matchix])             
                 if ctk.startswith(EXTENSION+'='): continue
                 elif ctk.startswith(EXTENSION+'!'): return 0         
                 if qua==STAR:
-                    while subcmpi<L:
-                        n = try_index(sarg, subcmpi, tokens[tix+1:], nested)
+                    while matchix and matchix<L:
+                        n = try_index(sarg, matchix, tokens[tix+1:], nested)
                         if n: return n
                         for subtk in TokenListCache[ctk]:
-                            evres = try_index(sarg, subcmpi, subtk, True)
-                            if evres: subcmpi=evres; break
-                        else: return 0   # with no match here the next token - 
-                                         # pointed to by n above - must have matched 
-                if ctk[1:3]!=SKIPSTORE: xGroups.append(sarg[cmpi:subcmpi])
-                cmpi=subcmpi; continue
-            elif ctk[0]==ESCAPE and '0'<= ctk[1] <='9':
-                i=int(ctk[1]); captured=xGroups[i]; lg=len(captured)
-                if sarg[cmpi:cmpi+lg]==captured:
-                    cmpi+=lg
-                    if qua==STAR:
-                        while cmpi<L:
-                            n = try_index(sarg, cmpi, tokens[tix+1:], True)
-                            if n: return n
-                            if sarg[cmpi:cmpi+lg]==captured: cmpi+=lg
-                            else: return 0
-                elif not qua: return 0
-                continue
-        if match_simple_token(sarg[cmpi], ctk):
-            cmpi+=1
+                            evres = try_index(sarg, matchix, subtk, True)
+                            if evres: matchix=evres; break
+                        else: return 0
+            else:
+                if ctk.startswith(EXTENSION+'='): return 0
+                elif ctk.startswith(EXTENSION+'!'): continue         
+        elif nested and ctk[0]==ESCAPE and '0'<= ctk[1] <='9':
+            i=int(ctk[1]); captured=xGroups[i]; lg=len(captured)
+            if sarg[compix:compix+lg]==captured:
+                matchix=compix+lg
+                if qua==STAR:
+                    while matchix<L:
+                        n = try_index(sarg, matchix, tokens[tix+1:], True)
+                        if n: return n
+                        if sarg[matchix:matchix+lg]==captured: matchix+=lg
+                        else: return 0
+        elif match_simple_token(sarg[compix], ctk):
+            matchix=compix+1
             if qua==STAR:
-                while cmpi<L:
-                    n = try_index(sarg, cmpi, tokens[tix+1:], nested)
+                while matchix<L:
+                    n = try_index(sarg, matchix, tokens[tix+1:], nested)
                     if n: return n
-                    if match_simple_token(sarg[cmpi], ctk): cmpi+=1
+                    if match_simple_token(sarg[matchix], ctk): matchix+=1
                     else: return 0
-        elif not qua or qua==PERHAPS and greedchecked: return 0
-    return cmpi
+        if not matchix: return 0  # with quantifiers the next token is already checked
+        compix=matchix
+    return compix
 
 def xsearch(sarg, pattern, nested=False, start=0):
     tokens = parse_pattern(pattern, nested); argi=start

History