Welcome, guest | Sign In | My Account | Store | Cart
def match_region(ch, pattern):
    """ region: a list of comparation chars and ranges in [] or [^]"""
    if pattern[1]=='^': booly = False; booln = True; i=2
    else: booly = True; booln = False; i=1
    while i < len(pattern):
        if pattern[i] == '\\' and pattern[i+1] in (']', '\\','-'):
            if ch== pattern[i+1]: return booly
            else: i+=2
        elif i<len(pattern)-2 and pattern[i+1]=='-':
            if pattern[i]<=ch<=pattern[i+2]: return booly
            else: i+= 2
        elif pattern[i]==ch: return booly
        else: i+=1
    return booln

def match_token(ch, token):
    c0 = token[0]
    if c0=='[': return match_region(ch,token)
    elif c0=='.': return True
    elif c0=='\\':
        c1=token[1]
        if c1=='s': return (ch in (' ','\t','\r','\n','\f','\v'))
        elif c1=='S': return (ch not in (' ','\t','\r','\n','\f','\v'))
        else: return (ch==c1)
    else: return (ch==c0)

TokenListCache = {}  # essential with nesting, otherwise only here for optimization
xGroups = []

def parse_pattern(pattern, nested):
    """ 
    tokens are: 
    1. patterns included in brackets (parsing is recursive)
    2. regions
    3. \\ together with the escaped character
    4. periods
    5. simple characters
    All with eventually trailing quantifiers
    """
    tokens = []; i=0
    while i < len(pattern):
        c = pattern[i]
        if c=='[':
            k = pattern.find(']', i)
            if k==-1: raise ValueError('Unmatched [ in '+pattern)
            while pattern[k-1] == '\\':
                k = pattern.find(']', k+1)
                if k==-1: raise ValueError('Unmatched [ in '+pattern)
            tokens.append(pattern[i:k+1])
            i=k+1
        elif c == '.':
            tokens.append('.'); i+=1
        elif c == '\\':
            if i<len(pattern)-1:
                tokens.append(pattern[i:i+2]); i+=2
            else:
                raise ValueError('Trailing backslash')
        elif nested and c=='(':
            resu = '('; k=i+1; lv=1; or_indicator=k
            while lv > 0:
                cc = pattern[k]
                if cc =='\\': resu+=cc; resu+=pattern[k+1]; k+=2; continue
                if cc == '(': lv+=1
                elif cc == ')': lv-=1
                resu+=cc; k+=1
            i=k
            if i<len(pattern) and pattern[i] in ('*', '+', '?'): 
                resu+=pattern[i]; i+=1
                if resu not in TokenListCache.keys():
                    TokenListCache[resu]=parse_pattern(resu[1:-2], True)
            elif resu not in TokenListCache.keys():
                TokenListCache[resu]=parse_pattern(resu[1:-1], True)  
            tokens.append(resu)
            continue
        else:
            tokens.append(c); i+=1
        if i<len(pattern) and pattern[i] in ('*', '+', '?'): 
            tokens[-1] += pattern[i]; i+=1
    TokenListCache[pattern]=tokens
    return tokens

def listsplit(larg, item):
    resu = []; ix = 0
    while ix<len(larg) and item in larg[ix:]: 
        ixx = larg.index(item); resu.append(larg[ix:ixx]); ix=ixx+1
    resu.append(larg[ix:])
    return resu    

def try_index(sarg, start, tokens, nested):
    compix = start; tix = 0; tkL = len(tokens)
    while tix<tkL:
        if compix == len(sarg):
            for s in tokens[tix:]:
                if s[-1] not in ('?','*'): return 0
            else: return compix
        ctk = tokens[tix]; qua=ctk[-1]
        if nested:
            if ctk[0] == '(':
                s = TokenListCache[ctk]
                if '|' in s: subtokens=listsplit(s, '|')
                else: subtokens=[s]
                for i in range(len(subtokens)):
                    subcompix = try_index(sarg, compix, subtokens[i], True)
                    if subcompix: break
                else:
                    if qua in ('?', '*'): tix+=1; continue
                    else: return 0                    
                if qua in ('*', '+'):
                    while subcompix:
                        for i in range(len(subtokens)):
                            subcompix = try_index(sarg, subcompix, subtokens[i], True)
                            if subcompix: break
                tix+=1
                xGroups.append(sarg[compix:subcompix])
                compix=subcompix
                continue
            if ctk[0]=='\\' and '0'<=ctk[1]<='9':
                i = int(ctk[1]); captured=xGroups[i]; lg = len(captured)
                if sarg[compix:compix+lg]==captured:
                    compix+=lg
                    if qua in ('+', '*'):
                        while sarg[compix:compix+lg]==captured: compix+=lg
                    tix+=1; continue
                elif qua in ('?','*'): tix+=1; continue
                else: return 0
        if qua=='?':
            if match_token(sarg[compix], ctk): compix+=1
            tix+=1; continue
        if qua=='+':
            if not match_token(sarg[compix], ctk): return 0
            compix+=1
        if qua in ('*','+'):
            if tix<tkL-1:
                while compix<len(sarg) and match_token(sarg[compix], ctk) and\
                            not match_token(sarg[compix], tokens[tix+1]): 
                    compix+=1
            else:
                while compix<len(sarg) and match_token(sarg[compix], ctk):
                    compix+=1
            tix+=1  
        else:
            if not match_token(sarg[compix], ctk): return 0
            compix+=1; tix+=1
    return compix

def xsearch(sarg, pattern, nested=False, start=0):
    try: tokens = TokenListCache[pattern]
    except: tokens = parse_pattern(pattern, nested)
    argix=start
    while argix<len(sarg):
        if nested: global xGroups; xGroups=[]
        n = try_index(sarg, argix, tokens, nested)
        if n: return (argix, n)
        argix += 1
    return ()
    
def xmatch(sarg, pattern, nested=False):
    """ checks, whether sarg as the whole matches the pattern """
    try: tokens = TokenListCache[pattern]
    except: tokens = parse_pattern(pattern, nested)
    n = try_index(sarg, 0, tokens, nested)
    if n==len(sarg): return True
    return False

def xsplit(sarg, pattern, nested=False):
    resu = []; xpair = xsearch(sarg, pattern, nested=nested); residue=0
    while xpair:
        resu.append(sarg[:xpair[0]])
        residue = xpair[1]
        xpair = xsearch(sarg, pattern, nested, xpair[1])
    return resu+sarg[residue:]
    
def xreplace(sarg, pattern, subst, nested=False):
    to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested)
    while xpair:
        to_replace.append(xpair); xpair=xsearch(sarg, pattern, nestedxpair[1], xpair[1])
    if nested:
        for i in range(10): subst=subst.replace('\\'+str(i), xGroups[i])
    for xpair in reversed(to_replace): s = s[:xpair[0]]+subst+s[xpair[1]:]        
    return s

def xfuncreplace(sarg, pattern, f, nested=False):
    to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested)
    while xpair:
        to_replace.append(xpair); xpair=xsearch(sarg, pattern, nested, xpair[1])
    for xpair in reversed(to_replace): 
        s = s[:xpair[0]]+f(s[xpair[0]:xpair[1]])+s[xpair[1]:]   
    return s

Diff to Previous Revision

--- revision 9 2010-06-03 22:48:06
+++ revision 10 2010-06-04 08:39:22
@@ -67,9 +67,9 @@
             if i<len(pattern) and pattern[i] in ('*', '+', '?'): 
                 resu+=pattern[i]; i+=1
                 if resu not in TokenListCache.keys():
-                    TokenListCache[resu]=parse_pattern(resu[1:-2], nested=True)
+                    TokenListCache[resu]=parse_pattern(resu[1:-2], True)
             elif resu not in TokenListCache.keys():
-                TokenListCache[resu]=parse_pattern(resu[1:-1], nested=True)  
+                TokenListCache[resu]=parse_pattern(resu[1:-1], True)  
             tokens.append(resu)
             continue
         else:
@@ -124,8 +124,8 @@
                 elif qua in ('?','*'): tix+=1; continue
                 else: return 0
         if qua=='?':
-            if match_token(sarg[compix], ctk): tix+=1; compix+=1
-            continue
+            if match_token(sarg[compix], ctk): compix+=1
+            tix+=1; continue
         if qua=='+':
             if not match_token(sarg[compix], ctk): return 0
             compix+=1
@@ -143,7 +143,7 @@
             compix+=1; tix+=1
     return compix
 
-def xsearch(sarg, pattern, start=0, nested=False):
+def xsearch(sarg, pattern, nested=False, start=0):
     try: tokens = TokenListCache[pattern]
     except: tokens = parse_pattern(pattern, nested)
     argix=start
@@ -167,22 +167,22 @@
     while xpair:
         resu.append(sarg[:xpair[0]])
         residue = xpair[1]
-        xpair = xsearch(sarg, pattern, xpair[1], nested)
+        xpair = xsearch(sarg, pattern, nested, xpair[1])
     return resu+sarg[residue:]
     
 def xreplace(sarg, pattern, subst, nested=False):
-    to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested=nested)
+    to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested)
     while xpair:
-        to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
+        to_replace.append(xpair); xpair=xsearch(sarg, pattern, nestedxpair[1], xpair[1])
     if nested:
         for i in range(10): subst=subst.replace('\\'+str(i), xGroups[i])
     for xpair in reversed(to_replace): s = s[:xpair[0]]+subst+s[xpair[1]:]        
     return s
 
 def xfuncreplace(sarg, pattern, f, nested=False):
-    to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested=nested)
+    to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested)
     while xpair:
-        to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
+        to_replace.append(xpair); xpair=xsearch(sarg, pattern, nested, xpair[1])
     for xpair in reversed(to_replace): 
         s = s[:xpair[0]]+f(s[xpair[0]:xpair[1]])+s[xpair[1]:]   
     return s

History