Welcome, guest | Sign In | My Account | Store | Cart
def match_region(ch, pattern):
    """ region: a list of comparation chars and ranges in [] or [^]"""
    if pattern[1]=='^': booly = False; booln = True; i=2
    else: booly = True; booln = False; i=1
    while i < len(pattern):
        if pattern[i] == '\\' and pattern[i+1] in (']', '\\','-'):
            if ch== pattern[i+1]: return booly
            else: i+=2
        elif i<len(pattern)-2 and pattern[i+1]=='-':
            if pattern[i]<=ch<=pattern[i+2]: return booly
            else: i+= 2
        elif pattern[i]==ch: return booly
        else: i+=1
    return booln

def match_token(ch, token):
    if token.startswith('['): return match_region(ch,token)
    elif token.startswith('\\'):
        tkc=token[1]
        if tkc=='s': return (ch in (' ','\t','\r','\n','\f','\v'))
        elif tkc=='S': return (ch not in (' ','\t','\r','\n','\f','\v'))
        else: return (ch==tkc)
    elif token=='.': return True
    else: return (ch==token[0])

TokenListCache = {}  # essential with nesting, otherwise only here for optimization
to_escape =  ('.','*','+','?','\\','[',']'); esc_brackets = to_escape + ('(', ')')

def parse_pattern(pattern, nested):
    """ 
    tokens are: 
    1. patterns included in brackets (parsing is recursive)
    2. regions
    3. \\ together with the escaped character
    4. periods
    5. simple characters
    6. quantifiers, only following tokens 1-5
    """
    if nested: ESC = esc_brackets
    else: ESC = to_escape
    tokens = []
    i = 0
    while i < len(pattern):
        c = pattern[i]
        if c=='[':
            k = pattern.find(']', i)
            if k==-1: raise ValueError('Unmatched [ in '+pattern)
            while pattern[k-1] == '\\':
                k = pattern.find(']', k+1)
                if k==-1: raise ValueError('Unmatched [ in '+pattern)
            tokens.append(pattern[i:k+1])
            i=k+1
        elif c == '.':
            tokens.append('.'); i+=1
        elif c == '\\':
            if i<len(pattern)-1 and pattern[i+1] in  ESC:
                tokens.append(pattern[i:i+2]); i+=2
            else:
                raise ValueError('Bad successor of a backslash')
        elif nested and c=='(':
            resu = '('; k=i+1; lv=1 
            while lv > 0:
                cc = pattern[k]
                if cc =='\\': resu+=cc; resu+=pattern[k+1]; k+=2; continue
                if cc == '(': lv+=1
                elif cc == ')': lv-=1
                resu+=cc; k+=1
            if resu not in TokenListCache.keys():
                TokenListCache[resu]=parse_pattern(resu[1:-1], nested=True)
            tokens.append(resu)
            if k<len(pattern) and pattern[k] in ('*', '+', '?'):
                tokens.append(pattern[k]); k+=1
            i = k+1
        else:
            tokens.append(c); i+=1
        if i<len(pattern) and pattern[i] in ('*', '+', '?'): 
            tokens.append(pattern[i]); i+=1
    TokenListCache[pattern]=tokens
    return tokens

def try_index(sarg, start, tokens, nested):
    compix = start; tix = 0; tkL = len(tokens)
    while tix<tkL:
        if compix == len(sarg): return 0
        ctk = tokens[tix]
        if nested and ctk[0] == '(':
            subtokens = TokenListCache[ctk]
            subcompix = try_index(sarg, compix, subtokens, True)
            c = 'n'
            if tix<tkL-1: c = tokens[tix+1]
            if not subcompix:
                if c in ('?', '*'): tix+=2; compix=subcompix; continue
                else: return 0
            elif c in ('*', '+'):
                while subcompix:
                    compix = subcompix
                    subcompix = try_index(sarg, compix, subtokens, nested)
                tix+=2; continue
        qua = 'n'
        if tix<tkL-1: qua = tokens[tix+1]
        if qua=='?':
            if match_token(sarg[compix], ctk): tix+=2; compix+=1
            continue
        if qua=='+':
            if not match_token(sarg[compix], ctk): return 0
            compix+=1
        if qua in ('*','+'):
            if tix<tkL-2:
                while compix<len(sarg) and match_token(sarg[compix], ctk) and\
                            not match_token(sarg[compix], tokens[tix+2]): 
                    compix+=1
            else:
                while compix<len(sarg) and match_token(sarg[compix], ctk):
                    compix+=1
            tix+=2                    
        else:
            if not match_token(sarg[compix], ctk): return 0
            else: compix+=1; tix+=1
    return compix

def xsearch(sarg, pattern, start=0, nested=False):
    try: tokens = TokenListCache[pattern]
    except: tokens = parse_pattern(pattern, nested)
    argix=start
    while argix<len(sarg):
        n = try_index(sarg, argix, tokens, nested)
        if n: return (argix, n)
        argix += 1
    return (0,0)

def xplit(sarg, pattern, nested=False):
    resu = []; xpair = xsearch(sarg, pattern, nested=nested); residue=0
    while (xpair[0]):
        resu.append(sarg[:xpair[0]])
        residue = xpair[1]
        xpair = xsearch(sarg, pattern, xpair[1], nested)
    return resu+sarg[residue:]
    
def xreplace(sarg, pattern, f, nested=False):
    to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested=nested)
    while (xpair[0]):
        to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
    for xpair in reversed(to_replace):
        s = s[:xpair[0]]+f+s[xpair[1]:]        
    return s

def xfuncreplace(sarg, pattern, f, nested=False):
    to_replace=[];  s=sarg; xpair=xsearch(sarg, pattern, nested=nested)
    while (xpair[0]):
        to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
    for xpair in reversed(to_replace):
        s[:xpair[0]]+f(s[xpair[0]:xpair[1]])+s[xpair[1]:]   
    return s

Diff to Previous Revision

--- revision 4 2010-06-02 09:50:24
+++ revision 5 2010-06-02 09:59:56
@@ -16,10 +16,10 @@
 def match_token(ch, token):
     if token.startswith('['): return match_region(ch,token)
     elif token.startswith('\\'):
-        tkc==token[1]
+        tkc=token[1]
         if tkc=='s': return (ch in (' ','\t','\r','\n','\f','\v'))
         elif tkc=='S': return (ch not in (' ','\t','\r','\n','\f','\v'))
-        else return (ch==tkc)
+        else: return (ch==tkc)
     elif token=='.': return True
     else: return (ch==token[0])
 
@@ -149,5 +149,5 @@
     while (xpair[0]):
         to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
     for xpair in reversed(to_replace):
-        s[:xpair[0]]+f(s[xpair[0]:xpair[1])+s[xpair[1]:]   
+        s[:xpair[0]]+f(s[xpair[0]:xpair[1]])+s[xpair[1]:]   
     return s

History