Welcome, guest | Sign In | My Account | Store | Cart
# You can edit these constants, e.g. to avoid backslash inflation
# or to avoid introducing still more brackets when occasionally analysing LISP code.

ANYCHAR = '.'                                       # token matching any single character
ESCAPE = '\\'                                       # escape prefix for literal specials and \s/\S
REGIONA = '['; REGIONO = ']'; RANGE = '-'; COMPLEMENT = '^'   # character-region delimiters
GROUPA = '(' ; GROUPO = ')'                         # group delimiters (nested mode only)
ONEoZERO = '?' ; ONEoMORE = '+' ; ZEROoMORE = '*'   # trailing quantifiers
ALTERNATIVE = '|'                                   # alternation separator inside a group
EXTENSION = '(?'                                    # prefix of extension groups: (?: (?= (?!

def match_region(ch, pattern):
    """Test whether the single character *ch* is accepted by a region token.

    *pattern* is the complete region token including its brackets, e.g.
    ``[a-z0]`` or ``[^abc]``; members may be literals, ``X-Y`` ranges, or
    escaped ``]``/``\\``/``-`` characters.  Returns True on membership and
    False otherwise; for a COMPLEMENT region the results are inverted.

    Bug fixes versus the previous revision:
    * the closing REGIONO bracket is no longer compared as a literal member
      (``match_region(']', '[abc]')`` wrongly returned True);
    * a non-matching range now advances past all three of its characters,
      so the range-end character is not re-tested as a literal.
    """
    if pattern[1] == COMPLEMENT:
        booly = False; booln = True; i = 2   # membership means "no match"
    else:
        booly = True; booln = False; i = 1
    last = len(pattern) - 1                  # index of the closing REGIONO
    while i < last:
        if pattern[i] == ESCAPE and pattern[i+1] in (REGIONO, ESCAPE, RANGE):
            if ch == pattern[i+1]: return booly
            i += 2
        elif i + 2 < last and pattern[i+1] == RANGE:
            if pattern[i] <= ch <= pattern[i+2]: return booly
            i += 3
        elif pattern[i] == ch:
            return booly
        else:
            i += 1
    return booln

def match_simple_token(ch, token):
    """Decide whether the single character *ch* satisfies one atomic *token*.

    A token is a region ``[...]``, the any-char token, an escaped character
    (with ``\\s``/``\\S`` as whitespace classes), or a plain literal.
    """
    head = token[0]
    if head == REGIONA:
        return match_region(ch, token)
    if head == ANYCHAR:
        return True
    if head == ESCAPE:
        escaped = token[1]
        whitespace = (' ', '\t', '\r', '\n', '\f', '\v')
        if escaped == 's':
            return ch in whitespace
        if escaped == 'S':
            return ch not in whitespace
        return ch == escaped
    return ch == head

TokenListCache = {}  # essential with nesting, otherwise only here for optimization
# Captured group texts of the current (nested) match attempt; reset by
# xsearch/xfinditer before each try, appended by try_index, read by \0..\9.
xGroups = []

def parse_pattern(pattern, nested, by_recursion = False):
    """Split *pattern* into a list of string tokens.

    tokens are:
    1. patterns included in brackets (parsing is recursive)
    2. regions
    3. \\ together with the escaped character
    4. periods
    5. simple characters
    All with eventually trailing quantifiers

    Group tokens are parsed recursively and their interior token lists are
    cached in TokenListCache keyed by the group's source text; the full
    pattern's token list is cached too unless this call is a recursive one.
    Raises ValueError on unmatched REGIONA/GROUPA or a trailing ESCAPE.

    Fixes: the literal ``'.'`` and ``']'`` comparisons now honour the
    editable ANYCHAR/REGIONO constants, and an unmatched GROUPA raises
    ValueError instead of crashing with IndexError.
    """
    tokens = []; i = 0
    while i < len(pattern):
        c = pattern[i]
        if c == REGIONA:
            k = pattern.find(REGIONO, i)
            if k == -1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
            while pattern[k-1] == ESCAPE:
                # the REGIONO we found was escaped: keep searching
                k = pattern.find(REGIONO, k+1)
                if k == -1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
            tokens.append(pattern[i:k+1])
            i = k+1
        elif c == ANYCHAR:
            tokens.append(ANYCHAR); i += 1
        elif c == ESCAPE:
            if i < len(pattern)-1:
                tokens.append(pattern[i:i+2]); i += 2
            else:
                raise ValueError('Trailing '+ESCAPE)
        elif nested and c == GROUPA:
            # copy the whole (possibly nested) group verbatim into one token
            resu = GROUPA; k = i+1; lv = 1
            while lv > 0:
                if k >= len(pattern):    # previously an uncaught IndexError
                    raise ValueError('Unmatched '+GROUPA+' in '+pattern)
                cc = pattern[k]
                if cc == ESCAPE: resu += cc; resu += pattern[k+1]; k += 2; continue
                if cc == GROUPA: lv += 1
                elif cc == GROUPO: lv -= 1
                resu += cc; k += 1
            i = k; corro = 0; corra = 0
            if i < len(pattern) and pattern[i] in (ONEoZERO, ONEoMORE, ZEROoMORE):
                resu += pattern[i]; i += 1; corro = 1   # quantifier is part of the token
            for s in (':', '!', '='):                   # extension groups (?: (?! (?=
                if resu.startswith(EXTENSION+s): corra = 1+len(s)
            if resu not in TokenListCache:
                # cache the group interior (brackets/extension/quantifier stripped)
                TokenListCache[resu] = parse_pattern(resu[1+corra:-(1+corro)], True, True)
            tokens.append(resu)
            continue
        else:
            tokens.append(c); i += 1
        if i < len(pattern) and pattern[i] in (ONEoZERO, ONEoMORE, ZEROoMORE):
            tokens[-1] += pattern[i]; i += 1
    if pattern not in TokenListCache and not by_recursion:
        TokenListCache[pattern] = tokens
    return tokens

def listsplit(larg, item):
    """Split the sequence *larg* at every occurrence of *item*.

    Returns the list of (possibly empty) sub-sequences between separators,
    analogous to str.split with an explicit separator.

    Bug fix: the separator was previously located with ``larg.index(item)``,
    which always searches from position 0; with two or more separators the
    loop therefore re-found the first one forever and never terminated.
    """
    resu = []; ix = 0
    while ix < len(larg) and item in larg[ix:]:
        ixx = larg.index(item, ix)       # search from ix, not from the start
        resu.append(larg[ix:ixx]); ix = ixx+1
    resu.append(larg[ix:])
    return resu

def match_token(sarg, compix, token):
    """Return True if *token* matches at position *compix* of *sarg*.

    Simple tokens are tested directly against the single character; a group
    token is tried alternative by alternative through try_index.

    NOTE(review): try_index signals failure with 0, so a zero-width group
    match at position 0 is indistinguishable from failure here — confirm
    callers never depend on that case.
    """
    if token[0] != GROUPA:               # was the literal '(' — honour the constant
        return match_simple_token(sarg[compix], token)
    for alternative in listsplit(TokenListCache[token], ALTERNATIVE):
        if try_index(sarg, compix, alternative, True):
            return True
    return False
    
def try_index(sarg, start, tokens, nested):
    """Try to match the token list against sarg beginning at index *start*.

    Returns the index just past the matched span, or 0 on failure.
    NOTE(review): a successful zero-width match starting at position 0 also
    yields 0 and is therefore indistinguishable from failure to callers.
    """
    compix = start; tix = 0; tkL = len(tokens)
    while tix<tkL:
        if compix == len(sarg):
            # input exhausted: succeed only if every remaining token is optional
            for s in tokens[tix:]:
                if s[-1] not in (ONEoZERO, ZEROoMORE): return 0
            else: return compix
        ctk = tokens[tix]; qua=ctk[-1]   # current token and its last char (the quantifier, if any)
        if nested:
            if ctk[0] == GROUPA:
                # group token: try each '|'-alternative of its cached token list
                subtokens = listsplit(TokenListCache[ctk], ALTERNATIVE)
                for i in range(len(subtokens)):
                    subcompix = try_index(sarg, compix, subtokens[i], True)
                    if subcompix: break
                else:
                    # no alternative matched: acceptable for ?/* quantifiers
                    # and for a (successful) negative lookahead
                    if qua in (ONEoZERO, ZEROoMORE) or ctk.startswith(EXTENSION+'!'): 
                        tix+=1; continue
                    else: return 0
                # lookaheads consume no input: (?=...) just moves on,
                # a matching (?!...) means overall failure
                if ctk.startswith(EXTENSION+'='): tix+=1; continue
                elif ctk.startswith(EXTENSION+'!'): return 0   
                if qua in (ZEROoMORE, ONEoMORE):
                    # greedily repeat the group until the next token matches.
                    # NOTE(review): tokens[tix+1] raises IndexError when a
                    # quantified group is the last token, and the
                    # match_token(..., compix, ...) condition never advances
                    # compix inside this loop — confirm intended behaviour.
                    while subcompix and not match_token(sarg, compix, tokens[tix+1]):
                        for i in range(len(subtokens)):
                            subcompix = try_index(sarg, subcompix, subtokens[i], True)
                            if subcompix: break
                if not ctk.startswith( EXTENSION+':' ): 
                    xGroups.append(sarg[compix:subcompix])   # capture the matched group text
                compix=subcompix; tix+=1; continue
            if ctk[0]==ESCAPE and '0'<= ctk[1] <='9':
                # back-reference \0..\9: the literal text captured earlier must recur
                i = int(ctk[1]); captured=xGroups[i]; lg = len(captured)
                if sarg[compix:compix+lg]==captured:
                    compix+=lg
                    if qua in (ZEROoMORE, ONEoMORE):
                        # greedy repetition of the captured text
                        while sarg[compix:compix+lg]==captured: compix+=lg
                    tix+=1; continue
                elif qua in (ONEoZERO, ZEROoMORE): tix+=1; continue
                else: return 0
        if qua==ONEoZERO:
            # optional single token: consume one char if it matches, never fail
            if match_simple_token(sarg[compix], ctk): compix+=1
            tix+=1; continue
        if qua==ONEoMORE:
            # '+': the first occurrence is mandatory ...
            if not match_simple_token(sarg[compix], ctk): return 0
            compix+=1
        if qua in (ZEROoMORE, ONEoMORE):
            # ... the rest is consumed greedily, but stop once the NEXT token
            # matches at the current position (lazy hand-off, no backtracking)
            if tix<tkL-1:
                while compix<len(sarg) and match_simple_token(sarg[compix], ctk) and\
                            not match_token(sarg, compix, tokens[tix+1]): 
                    compix+=1
            else:
                while compix<len(sarg) and match_simple_token(sarg[compix], ctk):
                    compix+=1
            tix+=1  
        else:
            # plain unquantified token: must match exactly one character
            if not match_simple_token(sarg[compix], ctk): return 0
            compix+=1; tix+=1
    return compix

def xsearch(sarg, pattern, nested=False, start=0):
    """Search for the first match of *pattern* in *sarg* at or after *start*.

    Returns (begin, end) with absolute indices on success, or () on failure.
    NOTE(review): because try_index signals failure with 0, a zero-width
    match at position 0 is reported as failure.
    """
    global xGroups
    try:
        tokens = TokenListCache[pattern]
    except KeyError:                 # was a bare except: only a cache miss is expected
        tokens = parse_pattern(pattern, nested)
    argix = start
    while argix < len(sarg):
        if nested: xGroups = []      # forget captures of previous attempts
        n = try_index(sarg, argix, tokens, nested)
        if n: return (argix, n)
        argix += 1
    return ()
    
def xfinditer(sarg, pattern, nested=False, start=-1):
    """Yield (begin, end) for every matching start position after *start*.

    Fixes: removed a leftover debug print of TokenListCache; narrowed the
    bare except to KeyError; and the generator now simply returns when the
    input is exhausted — raising StopIteration inside a generator has been
    a RuntimeError since PEP 479 (Python 3.7).
    """
    global xGroups
    try:
        tokens = TokenListCache[pattern]
    except KeyError:                 # only a cache miss is expected
        tokens = parse_pattern(pattern, nested)
    while start < len(sarg):
        start += 1
        if nested: xGroups = []      # forget captures of previous attempts
        n = try_index(sarg, start, tokens, nested)
        if n: yield (start, n)

def xmatch(sarg, pattern, nested=False):
    """ checks, whether sarg as the whole matches the pattern """
    try:
        tokens = TokenListCache[pattern]
    except KeyError:                 # was a bare except: only a cache miss is expected
        tokens = parse_pattern(pattern, nested)
    # the whole string matches iff try_index consumes every character
    return try_index(sarg, 0, tokens, nested) == len(sarg)

def xsplit(sarg, pattern, nested=False):
    """Split *sarg* on every match of *pattern*; return the list of pieces.

    Bug fixes:
    * each piece now starts at the end of the previous match — the old code
      sliced ``sarg[:begin]`` from the start of the string every time;
    * the trailing residue is appended as one list element — the old
      ``resu + sarg[residue:]`` was list + str and raised TypeError.
    """
    resu = []; residue = 0
    xpair = xsearch(sarg, pattern, nested=nested)
    while xpair:
        resu.append(sarg[residue:xpair[0]])
        residue = xpair[1]
        xpair = xsearch(sarg, pattern, nested, xpair[1])
    resu.append(sarg[residue:])
    return resu
    
def xreplace(sarg, pattern, subst, nested=False):
    """Replace every match of *pattern* in *sarg* by *subst*.

    With nesting, back-references ``\\0``..``\\9`` in *subst* are expanded
    from the groups captured by the LAST match.  Bug fix: only as many
    back-references as groups were actually captured are expanded — the old
    unconditional ``range(10)`` raised IndexError with fewer than 10 groups.
    """
    to_replace = []; s = sarg
    xpair = xsearch(sarg, pattern, nested)
    while xpair:
        to_replace.append(xpair)
        xpair = xsearch(sarg, pattern, nested, xpair[1])
    if nested:
        for i in range(min(10, len(xGroups))):   # only \0..\9 are addressable
            subst = subst.replace(ESCAPE+str(i), xGroups[i])
    # replace back-to-front so earlier spans keep their absolute indices
    for xpair in reversed(to_replace):
        s = s[:xpair[0]] + subst + s[xpair[1]:]
    return s

def xfuncreplace(sarg, pattern, f, nested=False):
    """Replace every match of *pattern* in *sarg* by ``f(matched_text)``.

    All match spans are collected first, then substituted back-to-front so
    that earlier spans keep their absolute indices while later ones change.
    """
    spans = []
    hit = xsearch(sarg, pattern, nested)
    while hit:
        spans.append(hit)
        hit = xsearch(sarg, pattern, nested, hit[1])
    result = sarg
    for begin, end in reversed(spans):
        result = result[:begin] + f(result[begin:end]) + result[end:]
    return result

Diff to Previous Revision

--- revision 12 2010-06-04 12:58:51
+++ revision 13 2010-06-05 00:31:10
@@ -1,23 +1,34 @@
+# You can edit these constants. You might want to avoid backslash inflation 
+# or to avoid introduction of still more brackets when analysing LISP-code incidentally
+
+ANYCHAR = '.'
+ESCAPE = '\\'
+REGIONA = '['; REGIONO = ']'; RANGE = '-'; COMPLEMENT = '^'
+GROUPA = '(' ; GROUPO = ')'
+ONEoZERO = '?' ; ONEoMORE = '+' ; ZEROoMORE = '*'
+ALTERNATIVE = '|'
+EXTENSION = '(?'
+
 def match_region(ch, pattern):
     """ region: a list of comparation chars and ranges in [] or [^]"""
-    if pattern[1]=='^': booly = False; booln = True; i=2
+    if pattern[1]==COMPLEMENT: booly = False; booln = True; i=2
     else: booly = True; booln = False; i=1
     while i < len(pattern):
-        if pattern[i] == '\\' and pattern[i+1] in (']', '\\','-'):
+        if pattern[i] == ESCAPE and pattern[i+1] in (REGIONO, ESCAPE, RANGE):
             if ch== pattern[i+1]: return booly
             else: i+=2
-        elif i<len(pattern)-2 and pattern[i+1]=='-':
+        elif i<len(pattern)-2 and pattern[i+1]==RANGE:
             if pattern[i]<=ch<=pattern[i+2]: return booly
             else: i+= 2
         elif pattern[i]==ch: return booly
         else: i+=1
     return booln
 
-def match_token(ch, token):
+def match_simple_token(ch, token):
     c0 = token[0]
-    if c0=='[': return match_region(ch,token)
-    elif c0=='.': return True
-    elif c0=='\\':
+    if c0==REGIONA: return match_region(ch,token)
+    elif c0==ANYCHAR: return True
+    elif c0==ESCAPE:
         c1=token[1]
         if c1=='s': return (ch in (' ','\t','\r','\n','\f','\v'))
         elif c1=='S': return (ch not in (' ','\t','\r','\n','\f','\v'))
@@ -27,7 +38,7 @@
 TokenListCache = {}  # essential with nesting, otherwise only here for optimization
 xGroups = []
 
-def parse_pattern(pattern, nested):
+def parse_pattern(pattern, nested, by_recursion = False):
     """ 
     tokens are: 
     1. patterns included in brackets (parsing is recursive)
@@ -40,42 +51,44 @@
     tokens = []; i=0
     while i < len(pattern):
         c = pattern[i]
-        if c=='[':
-            k = pattern.find(']', i)
-            if k==-1: raise ValueError('Unmatched [ in '+pattern)
-            while pattern[k-1] == '\\':
+        if c==REGIONA:
+            k = pattern.find(REGIONO, i)
+            if k==-1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
+            while pattern[k-1] == ESCAPE:
                 k = pattern.find(']', k+1)
-                if k==-1: raise ValueError('Unmatched [ in '+pattern)
+                if k==-1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
             tokens.append(pattern[i:k+1])
             i=k+1
         elif c == '.':
-            tokens.append('.'); i+=1
-        elif c == '\\':
+            tokens.append(ANYCHAR); i+=1
+        elif c == ESCAPE:
             if i<len(pattern)-1:
                 tokens.append(pattern[i:i+2]); i+=2
             else:
-                raise ValueError('Trailing backslash')
-        elif nested and c=='(':
-            resu = '('; k=i+1; lv=1
+                raise ValueError('Trailing '+ESCAPE)
+        elif nested and c==GROUPA:
+            resu = GROUPA; k=i+1; lv=1
             while lv > 0:
                 cc = pattern[k]
-                if cc =='\\': resu+=cc; resu+=pattern[k+1]; k+=2; continue
-                if cc == '(': lv+=1
-                elif cc == ')': lv-=1
+                if cc == ESCAPE: resu+=cc; resu+=pattern[k+1]; k+=2; continue
+                if cc == GROUPA: lv+=1
+                elif cc == GROUPO: lv-=1
                 resu+=cc; k+=1
             i=k; corro=0; corra=0
-            if i<len(pattern) and pattern[i] in ('*', '+', '?'): 
-                resu+=pattern[i]; i+=1; corro=-1
-            if resu.startswith('(?:'): corra=2
+            if i<len(pattern) and pattern[i] in (ONEoZERO, ONEoMORE, ZEROoMORE): 
+                resu+=pattern[i]; i+=1; corro=1
+            for s in (':','!','='): 
+                if resu.startswith(EXTENSION+s): corra = 1+len(s)
             if resu not in TokenListCache.keys():
-                TokenListCache[resu]=parse_pattern(resu[1+corra:-1+corro], True)  
+                TokenListCache[resu]=parse_pattern(resu[1+corra:-(1+corro)], True, True)
             tokens.append(resu)
             continue
         else:
             tokens.append(c); i+=1
-        if i<len(pattern) and pattern[i] in ('*', '+', '?'): 
+        if i<len(pattern) and pattern[i] in (ONEoZERO, ONEoMORE, ZEROoMORE): 
             tokens[-1] += pattern[i]; i+=1
-    TokenListCache[pattern]=tokens
+    if pattern not in TokenListCache.keys() and not by_recursion: 
+        TokenListCache[pattern]=tokens
     return tokens
 
 def listsplit(larg, item):
@@ -85,56 +98,68 @@
     resu.append(larg[ix:])
     return resu    
 
+def match_token(sarg, compix, token):
+    if not token[0]=='(': return match_simple_token(sarg[compix], token)
+    subtokens = listsplit(TokenListCache[token], ALTERNATIVE)
+    for i in range(len(subtokens)):
+        subcompix = try_index(sarg, compix, subtokens[i], True)
+        if subcompix: return True
+    return False 
+    
 def try_index(sarg, start, tokens, nested):
     compix = start; tix = 0; tkL = len(tokens)
     while tix<tkL:
         if compix == len(sarg):
             for s in tokens[tix:]:
-                if s[-1] not in ('?','*'): return 0
+                if s[-1] not in (ONEoZERO, ZEROoMORE): return 0
             else: return compix
         ctk = tokens[tix]; qua=ctk[-1]
         if nested:
-            if ctk[0] == '(':
-                subtokens = listsplit(TokenListCache[ctk], '|')
+            if ctk[0] == GROUPA:
+                subtokens = listsplit(TokenListCache[ctk], ALTERNATIVE)
                 for i in range(len(subtokens)):
                     subcompix = try_index(sarg, compix, subtokens[i], True)
                     if subcompix: break
                 else:
-                    if qua in ('?', '*'): tix+=1; continue
-                    else: return 0                    
-                if qua in ('*', '+'):
-                    while subcompix:
+                    if qua in (ONEoZERO, ZEROoMORE) or ctk.startswith(EXTENSION+'!'): 
+                        tix+=1; continue
+                    else: return 0
+                if ctk.startswith(EXTENSION+'='): tix+=1; continue
+                elif ctk.startswith(EXTENSION+'!'): return 0   
+                if qua in (ZEROoMORE, ONEoMORE):
+                    while subcompix and not match_token(sarg, compix, tokens[tix+1]):
                         for i in range(len(subtokens)):
                             subcompix = try_index(sarg, subcompix, subtokens[i], True)
                             if subcompix: break
-                if not ctk.startswith('(?:'): xGroups.append(sarg[compix:subcompix])
+                if not ctk.startswith( EXTENSION+':' ): 
+                    xGroups.append(sarg[compix:subcompix])
                 compix=subcompix; tix+=1; continue
-            if ctk[0]=='\\' and '0'<= ctk[1] <='9':
+            if ctk[0]==ESCAPE and '0'<= ctk[1] <='9':
                 i = int(ctk[1]); captured=xGroups[i]; lg = len(captured)
                 if sarg[compix:compix+lg]==captured:
                     compix+=lg
-                    if qua in ('+', '*'):
+                    if qua in (ZEROoMORE, ONEoMORE):
                         while sarg[compix:compix+lg]==captured: compix+=lg
                     tix+=1; continue
-                elif qua in ('?','*'): tix+=1; continue
+                elif qua in (ONEoZERO, ZEROoMORE): tix+=1; continue
                 else: return 0
-        if qua=='?':
-            if match_token(sarg[compix], ctk): compix+=1
+        if qua==ONEoZERO:
+            if match_simple_token(sarg[compix], ctk): compix+=1
             tix+=1; continue
-        if qua=='+':
-            if not match_token(sarg[compix], ctk): return 0
+        if qua==ONEoMORE:
+            if not match_simple_token(sarg[compix], ctk): return 0
             compix+=1
-        if qua in ('*','+'):
+        if qua in (ZEROoMORE, ONEoMORE):
             if tix<tkL-1:
-                while compix<len(sarg) and match_token(sarg[compix], ctk) and\
-                            not match_token(sarg[compix], tokens[tix+1]): 
+                while compix<len(sarg) and match_simple_token(sarg[compix], ctk) and\
+                            not match_token(sarg, compix, tokens[tix+1]): 
                     compix+=1
             else:
-                while compix<len(sarg) and match_token(sarg[compix], ctk):
+                while compix<len(sarg) and match_simple_token(sarg[compix], ctk):
                     compix+=1
             tix+=1  
         else:
-            if not match_token(sarg[compix], ctk): return 0
+            if not match_simple_token(sarg[compix], ctk): return 0
             compix+=1; tix+=1
     return compix
 
@@ -149,6 +174,17 @@
         argix += 1
     return ()
     
+def xfinditer(sarg, pattern, nested=False, start=-1):
+    try: tokens = TokenListCache[pattern]
+    except: tokens = parse_pattern(pattern, nested)
+    print(TokenListCache)
+    while start<len(sarg):
+        start += 1
+        if nested: global xGroups; xGroups=[]
+        n = try_index(sarg, start, tokens, nested)
+        if n: yield (start, n)
+    raise StopIteration()
+
 def xmatch(sarg, pattern, nested=False):
     """ checks, whether sarg as the whole matches the pattern """
     try: tokens = TokenListCache[pattern]
@@ -170,7 +206,7 @@
     while xpair:
         to_replace.append(xpair); xpair=xsearch(sarg, pattern, nested, xpair[1])
     if nested:
-        for i in range(10): subst=subst.replace('\\'+str(i), xGroups[i])
+        for i in range(10): subst=subst.replace(ESCAPE+str(i), xGroups[i])
     for xpair in reversed(to_replace): s = s[:xpair[0]]+subst+s[xpair[1]:]        
     return s
 

History