Welcome, guest | Sign In | My Account | Store | Cart
def match_region(ch, pattern):
    """ region: a list of comparation chars and ranges in [] or [^]"""
    if pattern[1]=='^': booly = False; booln = True; i=2
    else: booly = True; booln = False; i=1
    while i < len(pattern):
        if pattern[i] == '\\' and pattern[i+1] in (']', '\\','-'):
            if ch== pattern[i+1]: return booly
            else: i+=2
        elif i<len(pattern)-2 and pattern[i+1]=='-':
            if pattern[i]<=ch<=pattern[i+2]: return booly
            else: i+= 2
        elif pattern[i]==ch: return booly
        else: i+=1
    return booln

def match_token(ch, token):
    c0 = token[0]
    if c0=='[': return match_region(ch,token)
    elif c0=='.': return True
    elif c0=='\\':
        c1=token[1]
        if c1=='s': return (ch in (' ','\t','\r','\n','\f','\v'))
        elif c1=='S': return (ch not in (' ','\t','\r','\n','\f','\v'))
        else: return (ch==c1)
    else: return (ch==c0)

TokenListCache = {}  # essential with nesting, otherwise only here for optimization
xGroups = []

def parse_pattern(pattern, nested):
    """ 
    tokens are: 
    1. patterns included in brackets (parsing is recursive)
    2. regions
    3. \\ together with the escaped character
    4. periods
    5. simple characters
    All with eventually trailing quantifiers
    """
    tokens = []; i=0
    while i < len(pattern):
        c = pattern[i]
        if c=='[':
            k = pattern.find(']', i)
            if k==-1: raise ValueError('Unmatched [ in '+pattern)
            while pattern[k-1] == '\\':
                k = pattern.find(']', k+1)
                if k==-1: raise ValueError('Unmatched [ in '+pattern)
            tokens.append(pattern[i:k+1])
            i=k+1
        elif c == '.':
            tokens.append('.'); i+=1
        elif c == '\\':
            if i<len(pattern)-1:
                tokens.append(pattern[i:i+2]); i+=2
            else:
                raise ValueError('Bad successor of a backslash')
        elif nested and c=='(':
            resu = '('; k=i+1; lv=1 
            while lv > 0:
                cc = pattern[k]
                if cc =='\\': resu+=cc; resu+=pattern[k+1]; k+=2; continue
                if cc == '(': lv+=1
                elif cc == ')': lv-=1
                resu+=cc; k+=1
            i=k
            if i<len(pattern) and pattern[i] in ('*', '+', '?'): 
                resu+=pattern[i]; i+=1
                if resu not in TokenListCache.keys():
                    TokenListCache[resu]=parse_pattern(resu[1:-2], nested=True)
            elif resu not in TokenListCache.keys():
                TokenListCache[resu]=parse_pattern(resu[1:-1], nested=True)               
            tokens.append(resu)
            continue
        else:
            tokens.append(c); i+=1
        if i<len(pattern) and pattern[i] in ('*', '+', '?'): 
            tokens[-1] += pattern[i]; i+=1
    TokenListCache[pattern]=tokens
    return tokens

def try_index(sarg, start, tokens, nested):
    compix = start; tix = 0; tkL = len(tokens)
    while tix<tkL:
        if compix == len(sarg): return 0
        ctk = tokens[tix]; qua=ctk[-1]
        if nested :
            if ctk[0] == '(':
                subtokens = TokenListCache[ctk]
                subcompix = try_index(sarg, compix, subtokens, True)
                bak = subcompix
                if not subcompix:
                    if qua in ('?', '*'): tix+=1; continue
                    else: return 0
                elif qua in ('*', '+'):
                    while subcompix:
                        bak = subcompix
                        subcompix = try_index(sarg, subcompix, subtokens, True)
                tix+=1
                xGroups.append(sarg[compix:bak])
                compix=bak
                continue
            if ctk[0]=='\\' and '0'<=ctk[1]<='9':
                i = int(ctk[1]); captured=xGroups[i]; lg = len(captured)
                if sarg[compix:compix+lg]==captured:
                    compix+=lg
                    if qua in ('+', '*'):
                        while sarg[compix:compix+lg]==captured: compix+=lg
                    tix+=1; continue
                elif qua in ('?','*'): tix+=1; continue
                else: return 0
        if qua=='?':
            if match_token(sarg[compix], ctk): tix+=1; compix+=1
            continue
        if qua=='+':
            if not match_token(sarg[compix], ctk): return 0
            compix+=1
        if qua in ('*','+'):
            if tix<tkL-1:
                while compix<len(sarg) and match_token(sarg[compix], ctk) and\
                            not match_token(sarg[compix], tokens[tix+1]): 
                    compix+=1
            else:
                while compix<len(sarg) and match_token(sarg[compix], ctk):
                    compix+=1
            tix+=1  
        else:
            if not match_token(sarg[compix], ctk): return 0
            compix+=1; tix+=1
    return compix

def xsearch(sarg, pattern, start=0, nested=False):
    try: tokens = TokenListCache[pattern]
    except: tokens = parse_pattern(pattern, nested)
    argix=start
    while argix<len(sarg):
        if nested: global xGroups; xGroups=[]
        n = try_index(sarg, argix, tokens, nested)
        if n: return (argix, n)
        argix += 1
    return (0,0)

def xplit(sarg, pattern, nested=False):
    resu = []; xpair = xsearch(sarg, pattern, nested=nested); residue=0
    while (xpair[0]):
        resu.append(sarg[:xpair[0]])
        residue = xpair[1]
        xpair = xsearch(sarg, pattern, xpair[1], nested)
    return resu+sarg[residue:]
    
def xreplace(sarg, pattern, subst, nested=False):
    to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested=nested)
    while (xpair[0]):
        to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
    if nested:
        for i in range(10): subst=subst.replace('\\'+str(i), xGroups[i])
    for xpair in reversed(to_replace): s = s[:xpair[0]]+subst+s[xpair[1]:]        
    return s

def xfuncreplace(sarg, pattern, f, nested=False):
    to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested=nested)
    while (xpair[0]):
        to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
    for xpair in reversed(to_replace): 
        s[:xpair[0]]+f(s[xpair[0]:xpair[1]])+s[xpair[1]:]   
    return s

Diff to Previous Revision

--- revision 5 2010-06-02 09:59:56
+++ revision 6 2010-06-02 19:04:12
@@ -14,17 +14,18 @@
     return booln
 
 def match_token(ch, token):
-    if token.startswith('['): return match_region(ch,token)
-    elif token.startswith('\\'):
-        tkc=token[1]
-        if tkc=='s': return (ch in (' ','\t','\r','\n','\f','\v'))
-        elif tkc=='S': return (ch not in (' ','\t','\r','\n','\f','\v'))
-        else: return (ch==tkc)
-    elif token=='.': return True
-    else: return (ch==token[0])
+    c0 = token[0]
+    if c0=='[': return match_region(ch,token)
+    elif c0=='.': return True
+    elif c0=='\\':
+        c1=token[1]
+        if c1=='s': return (ch in (' ','\t','\r','\n','\f','\v'))
+        elif c1=='S': return (ch not in (' ','\t','\r','\n','\f','\v'))
+        else: return (ch==c1)
+    else: return (ch==c0)
 
 TokenListCache = {}  # essential with nesting, otherwise only here for optimization
-to_escape =  ('.','*','+','?','\\','[',']'); esc_brackets = to_escape + ('(', ')')
+xGroups = []
 
 def parse_pattern(pattern, nested):
     """ 
@@ -34,12 +35,9 @@
     3. \\ together with the escaped character
     4. periods
     5. simple characters
-    6. quantifiers, only following tokens 1-5
+    All with eventually trailing quantifiers
     """
-    if nested: ESC = esc_brackets
-    else: ESC = to_escape
-    tokens = []
-    i = 0
+    tokens = []; i=0
     while i < len(pattern):
         c = pattern[i]
         if c=='[':
@@ -53,7 +51,7 @@
         elif c == '.':
             tokens.append('.'); i+=1
         elif c == '\\':
-            if i<len(pattern)-1 and pattern[i+1] in  ESC:
+            if i<len(pattern)-1:
                 tokens.append(pattern[i:i+2]); i+=2
             else:
                 raise ValueError('Bad successor of a backslash')
@@ -65,16 +63,19 @@
                 if cc == '(': lv+=1
                 elif cc == ')': lv-=1
                 resu+=cc; k+=1
-            if resu not in TokenListCache.keys():
-                TokenListCache[resu]=parse_pattern(resu[1:-1], nested=True)
+            i=k
+            if i<len(pattern) and pattern[i] in ('*', '+', '?'): 
+                resu+=pattern[i]; i+=1
+                if resu not in TokenListCache.keys():
+                    TokenListCache[resu]=parse_pattern(resu[1:-2], nested=True)
+            elif resu not in TokenListCache.keys():
+                TokenListCache[resu]=parse_pattern(resu[1:-1], nested=True)               
             tokens.append(resu)
-            if k<len(pattern) and pattern[k] in ('*', '+', '?'):
-                tokens.append(pattern[k]); k+=1
-            i = k+1
+            continue
         else:
             tokens.append(c); i+=1
         if i<len(pattern) and pattern[i] in ('*', '+', '?'): 
-            tokens.append(pattern[i]); i+=1
+            tokens[-1] += pattern[i]; i+=1
     TokenListCache[pattern]=tokens
     return tokens
 
@@ -82,40 +83,50 @@
     compix = start; tix = 0; tkL = len(tokens)
     while tix<tkL:
         if compix == len(sarg): return 0
-        ctk = tokens[tix]
-        if nested and ctk[0] == '(':
-            subtokens = TokenListCache[ctk]
-            subcompix = try_index(sarg, compix, subtokens, True)
-            c = 'n'
-            if tix<tkL-1: c = tokens[tix+1]
-            if not subcompix:
-                if c in ('?', '*'): tix+=2; compix=subcompix; continue
+        ctk = tokens[tix]; qua=ctk[-1]
+        if nested :
+            if ctk[0] == '(':
+                subtokens = TokenListCache[ctk]
+                subcompix = try_index(sarg, compix, subtokens, True)
+                bak = subcompix
+                if not subcompix:
+                    if qua in ('?', '*'): tix+=1; continue
+                    else: return 0
+                elif qua in ('*', '+'):
+                    while subcompix:
+                        bak = subcompix
+                        subcompix = try_index(sarg, subcompix, subtokens, True)
+                tix+=1
+                xGroups.append(sarg[compix:bak])
+                compix=bak
+                continue
+            if ctk[0]=='\\' and '0'<=ctk[1]<='9':
+                i = int(ctk[1]); captured=xGroups[i]; lg = len(captured)
+                if sarg[compix:compix+lg]==captured:
+                    compix+=lg
+                    if qua in ('+', '*'):
+                        while sarg[compix:compix+lg]==captured: compix+=lg
+                    tix+=1; continue
+                elif qua in ('?','*'): tix+=1; continue
                 else: return 0
-            elif c in ('*', '+'):
-                while subcompix:
-                    compix = subcompix
-                    subcompix = try_index(sarg, compix, subtokens, nested)
-                tix+=2; continue
-        qua = 'n'
-        if tix<tkL-1: qua = tokens[tix+1]
         if qua=='?':
-            if match_token(sarg[compix], ctk): tix+=2; compix+=1
+            if match_token(sarg[compix], ctk): tix+=1; compix+=1
             continue
         if qua=='+':
             if not match_token(sarg[compix], ctk): return 0
             compix+=1
         if qua in ('*','+'):
-            if tix<tkL-2:
+            if tix<tkL-1:
                 while compix<len(sarg) and match_token(sarg[compix], ctk) and\
-                            not match_token(sarg[compix], tokens[tix+2]): 
+                            not match_token(sarg[compix], tokens[tix+1]): 
                     compix+=1
             else:
                 while compix<len(sarg) and match_token(sarg[compix], ctk):
                     compix+=1
-            tix+=2                    
+            tix+=1  
         else:
             if not match_token(sarg[compix], ctk): return 0
-            else: compix+=1; tix+=1
+            compix+=1; tix+=1
     return compix
 
 def xsearch(sarg, pattern, start=0, nested=False):
@@ -123,6 +134,7 @@
     except: tokens = parse_pattern(pattern, nested)
     argix=start
     while argix<len(sarg):
+        if nested: global xGroups; xGroups=[]
         n = try_index(sarg, argix, tokens, nested)
         if n: return (argix, n)
         argix += 1
@@ -136,18 +148,19 @@
         xpair = xsearch(sarg, pattern, xpair[1], nested)
     return resu+sarg[residue:]
     
-def xreplace(sarg, pattern, f, nested=False):
+def xreplace(sarg, pattern, subst, nested=False):
     to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested=nested)
     while (xpair[0]):
         to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
-    for xpair in reversed(to_replace):
-        s = s[:xpair[0]]+f+s[xpair[1]:]        
+    if nested:
+        for i in range(10): subst=subst.replace('\\'+str(i), xGroups[i])
+    for xpair in reversed(to_replace): s = s[:xpair[0]]+subst+s[xpair[1]:]        
     return s
 
 def xfuncreplace(sarg, pattern, f, nested=False):
     to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested=nested)
     while (xpair[0]):
         to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
-    for xpair in reversed(to_replace):
+    for xpair in reversed(to_replace): 
         s[:xpair[0]]+f(s[xpair[0]:xpair[1]])+s[xpair[1]:]   
     return s

History