Welcome, guest | Sign In | My Account | Store | Cart
def match_region(ch, pattern):
    """ region: a list of comparation chars and ranges in [] or [^]"""
    if pattern[1]=='^': booly = False; booln = True; i=2
    else: booly = True; booln = False; i=1
    while i < len(pattern):
        if pattern[i] == '\\' and pattern[i+1] in (']', '\\','-'):
            if ch== pattern[i+1]: return booly
            else: i+=2
        elif i<len(pattern)-2 and pattern[i+1]=='-':
            if pattern[i]<=ch<=pattern[i+2]: return booly
            else: i+= 2
        elif pattern[i]==ch: return booly
        else: i+=1
    return booln

def match_token(ch, token):
    c0 = token[0]
    if c0=='[': return match_region(ch,token)
    elif c0=='.': return True
    elif c0=='\\':
        c1=token[1]
        if c1=='s': return (ch in (' ','\t','\r','\n','\f','\v'))
        elif c1=='S': return (ch not in (' ','\t','\r','\n','\f','\v'))
        else: return (ch==c1)
    else: return (ch==c0)

TokenListCache = {}  # essential with nesting, otherwise only here for optimization
xGroups = []

def parse_pattern(pattern, nested):
    """ 
    tokens are: 
    1. patterns included in brackets (parsing is recursive)
    2. regions
    3. \\ together with the escaped character
    4. periods
    5. simple characters
    All with eventually trailing quantifiers
    """
    tokens = []; i=0
    while i < len(pattern):
        c = pattern[i]
        if c=='[':
            k = pattern.find(']', i)
            if k==-1: raise ValueError('Unmatched [ in '+pattern)
            while pattern[k-1] == '\\':
                k = pattern.find(']', k+1)
                if k==-1: raise ValueError('Unmatched [ in '+pattern)
            tokens.append(pattern[i:k+1])
            i=k+1
        elif c == '.':
            tokens.append('.'); i+=1
        elif c == '\\':
            if i<len(pattern)-1:
                tokens.append(pattern[i:i+2]); i+=2
            else:
                raise ValueError('Bad successor of a backslash')
        elif nested and c=='(':
            resu = '('; k=i+1; lv=1 
            while lv > 0:
                cc = pattern[k]
                if cc =='\\': resu+=cc; resu+=pattern[k+1]; k+=2; continue
                if cc == '(': lv+=1
                elif cc == ')': lv-=1
                resu+=cc; k+=1
            i=k
            if i<len(pattern) and pattern[i] in ('*', '+', '?'): 
                resu+=pattern[i]; i+=1
                if resu not in TokenListCache.keys():
                    TokenListCache[resu]=parse_pattern(resu[1:-2], nested=True)
            elif resu not in TokenListCache.keys():
                TokenListCache[resu]=parse_pattern(resu[1:-1], nested=True)  
            tokens.append(resu)
            continue
        else:
            tokens.append(c); i+=1
        if i<len(pattern) and pattern[i] in ('*', '+', '?'): 
            tokens[-1] += pattern[i]; i+=1
    TokenListCache[pattern]=tokens
    return tokens

def try_index(sarg, start, tokens, nested):
    compix = start; tix = 0; tkL = len(tokens)
    while tix<tkL:
        if compix == len(sarg): return 0
        ctk = tokens[tix]; qua=ctk[-1]
        if nested :
            if ctk[0] == '(':
                subtokens = TokenListCache[ctk]
                subcompix = try_index(sarg, compix, subtokens, True)
                bak = subcompix
                if not subcompix:
                    if qua in ('?', '*'): tix+=1; continue
                    else: return 0
                elif qua in ('*', '+'):
                    while subcompix:
                        bak = subcompix
                        subcompix = try_index(sarg, subcompix, subtokens, True)
                tix+=1
                xGroups.append(sarg[compix:bak])
                compix=bak
                continue
            if ctk[0]=='\\' and '0'<=ctk[1]<='9':
                i = int(ctk[1]); captured=xGroups[i]; lg = len(captured)
                if sarg[compix:compix+lg]==captured:
                    compix+=lg
                    if qua in ('+', '*'):
                        while sarg[compix:compix+lg]==captured: compix+=lg
                    tix+=1; continue
                elif qua in ('?','*'): tix+=1; continue
                else: return 0
        if qua=='?':
            if match_token(sarg[compix], ctk): tix+=1; compix+=1
            continue
        if qua=='+':
            if not match_token(sarg[compix], ctk): return 0
            compix+=1
        if qua in ('*','+'):
            if tix<tkL-1:
                while compix<len(sarg) and match_token(sarg[compix], ctk) and\
                            not match_token(sarg[compix], tokens[tix+1]): 
                    compix+=1
            else:
                while compix<len(sarg) and match_token(sarg[compix], ctk):
                    compix+=1
            tix+=1  
        else:
            if not match_token(sarg[compix], ctk): return 0
            compix+=1; tix+=1
    return compix

def xsearch(sarg, pattern, start=0, nested=False):
    try: tokens = TokenListCache[pattern]
    except: tokens = parse_pattern(pattern, nested)
    argix=start
    while argix<len(sarg):
        if nested: global xGroups; xGroups=[]
        n = try_index(sarg, argix, tokens, nested)
        if n: return (argix, n)
        argix += 1
    return (0,0)

def xplit(sarg, pattern, nested=False):
    resu = []; xpair = xsearch(sarg, pattern, nested=nested); residue=0
    while (xpair[0]):
        resu.append(sarg[:xpair[0]])
        residue = xpair[1]
        xpair = xsearch(sarg, pattern, xpair[1], nested)
    return resu+sarg[residue:]
    
def xreplace(sarg, pattern, subst, nested=False):
    to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested=nested)
    while (xpair[0]):
        to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
    if nested:
        for i in range(10): subst=subst.replace('\\'+str(i), xGroups[i])
    for xpair in reversed(to_replace): s = s[:xpair[0]]+subst+s[xpair[1]:]        
    return s

def xfuncreplace(sarg, pattern, f, nested=False):
    to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested=nested)
    while (xpair[0]):
        to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
    for xpair in reversed(to_replace): 
        s = s[:xpair[0]]+f(s[xpair[0]:xpair[1]])+s[xpair[1]:]   
    return s

Diff to Previous Revision

--- revision 6 2010-06-02 19:04:12
+++ revision 7 2010-06-02 19:21:33
@@ -69,7 +69,7 @@
                 if resu not in TokenListCache.keys():
                     TokenListCache[resu]=parse_pattern(resu[1:-2], nested=True)
             elif resu not in TokenListCache.keys():
-                TokenListCache[resu]=parse_pattern(resu[1:-1], nested=True)               
+                TokenListCache[resu]=parse_pattern(resu[1:-1], nested=True)  
             tokens.append(resu)
             continue
         else:
@@ -162,5 +162,5 @@
     while (xpair[0]):
         to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
     for xpair in reversed(to_replace): 
-        s[:xpair[0]]+f(s[xpair[0]:xpair[1]])+s[xpair[1]:]   
+        s = s[:xpair[0]]+f(s[xpair[0]:xpair[1]])+s[xpair[1]:]   
     return s

History