Welcome, guest | Sign In | My Account | Store | Cart
def match_region(ch, pattern):
    """ region: a list of comparation chars and ranges in [] or [^]"""
    booly = True; booln = False; i=1
    if pattern[1]=='^': booly = False; booln = True; i=2
    while i < len(pattern):
        if pattern[i] == '\\' and pattern[i+1] in (']', '\\','-'):
            if ch== pattern[i+1]: return booly
            else: i+=2
        elif i<len(pattern)-2 and pattern[i+1]=='-':
            if pattern[i]<=ch<=pattern[i+2]: return booly
            else: i+= 2
        elif pattern[i]==ch: return booly
        else: i+=1
    return booln

def match_token(ch, token):
    if token.startswith('['): return match_region(ch,token)
    elif token.startswith('\\'): return (ch==token[1])
    elif token=='.': return True
    else: return (ch==token[0])

TokenListCache = {}  # essential with nesting, otherwise only here for optimization
to_escape =  ('.','*','+','?','\\','[',']'); esc_brackets = to_escape + ('(', ')')

def parse_pattern(pattern, nested):
    """ 
    tokens are: 
    1. patterns included in brackets (parsing is recursive)
    2. regions
    3. \\ together with the escaped character
    4. periods
    5. simple characters
    6. quantifiers, only following tokens 1-5
    """
    if nested: ESC = esc_brackets
    else: ESC = to_escape
    tokens = []
    i = 0
    while i < len(pattern):
        c = pattern[i]
        if c=='[':
            k = pattern.find(']', i)
            if k==-1: raise ValueError('Unmatched [ in '+pattern)
            while pattern[k-1] == '\\':
                k = pattern.find(']', k+1)
                if k==-1: raise ValueError('Unmatched [ in '+pattern)
            tokens.append(pattern[i:k+1])
            i=k+1
        elif c == '.':
            tokens.append('.'); i+=1
        elif c == '\\':
            if i<len(pattern)-1 and pattern[i+1] in  ESC:
                tokens.append(pattern[i:i+2]); i+=2
            else:
                raise ValueError('Bad successor of a backslash')
        elif nested and c=='(':
            resu = '('; k=i+1; lv=1 
            while lv > 0:
                cc = pattern[k]
                if cc =='\\': resu+=cc; resu+=pattern[k+1]; k+=2; continue
                if cc == '(': lv+=1
                elif cc == ')': lv-=1
                resu+=cc; k+=1
            if resu not in TokenListCache.keys():
                TokenListCache[resu]=parse_pattern(resu[1:-1], nested=True)
            tokens.append(resu)
            if k<len(pattern) and pattern[k] in ('*', '+', '?'):
                tokens.append(pattern[k]); k+=1
            i = k+1
        else:
            tokens.append(c); i+=1
        if i<len(pattern) and pattern[i] in ('*', '+', '?'): 
            tokens.append(pattern[i]); i+=1
    TokenListCache[pattern]=tokens
    return tokens

def try_index(sarg, start, tokens, nested):
    compix = start; tix = 0; tkL = len(tokens)
    while tix<tkL:
        if compix == len(sarg): return 0
        ctk = tokens[tix]
        if nested and ctk[0] == '(':
            subtokens = TokenListCache[ctk]
            subcompix = try_index(sarg, compix, subtokens, nested)
            c = 'n'
            if tix<tkL-1: c = tokens[tix+1]
            if not subcompix:
                if c in ('?', '*'): tix+=2; compix=subcompix; continue
                else: return 0
            elif c in ('*', '+'):
                while subcompix:
                    compix = subcompix
                    subcompix = try_index(sarg, compix, subtokens, nested)
                tix+=2; continue
        qua = 'n'
        if tix<tkL-1: qua = tokens[tix+1]
        if qua=='?':
            if match_token(sarg[compix], ctk): tix+=2; compix+=1
            continue
        if qua=='+':
            if not match_token(sarg[compix], ctk): return 0
            compix+=1
        if qua in ('*','+'):
            if tix<tkL-2:
                while compix<len(sarg) and match_token(sarg[compix], ctk) and\
                            not match_token(sarg[compix], tokens[tix+2]): 
                    compix+=1
            else:
                while compix<len(sarg) and match_token(sarg[compix], ctk):
                    compix+=1
            tix+=2                    
        else:
            if not match_token(sarg[compix], ctk): return 0
            else: compix+=1; tix+=1
    return compix

def xsearch(sarg, pattern, start=0, nested=False):
    try: tokens = TokenListCache[pattern]
    except: tokens = parse_pattern(pattern, nested)
    argix=start
    while argix<len(sarg):
        n = try_index(sarg, argix, tokens, nested)
        if n: return (argix, n)
        argix += 1
    return (0,0)

Diff to Previous Revision

--- revision 2 2010-06-01 18:59:54
+++ revision 3 2010-06-01 20:32:42
@@ -1,5 +1,5 @@
 def match_region(ch, pattern):
-    """ region: a list of comparation chars and ranges in [] or [^] """
+    """ region: a list of comparation chars and ranges in [] or [^]"""
     booly = True; booln = False; i=1
     if pattern[1]=='^': booly = False; booln = True; i=2
     while i < len(pattern):
@@ -74,40 +74,14 @@
     TokenListCache[pattern]=tokens
     return tokens
 
-def try_sindex(sarg, start, tokens):  # s for "simple"
-    compix = start; tix = 0; tkL = len(tokens)
-    while tix<tkL:
-        if compix == len(sarg): return 0
-        qua = 'n'
-        if tix<tkL-1: qua = tokens[tix+1]       
-        if qua=='?':
-            if match_token(sarg[compix], tokens[tix]): tix+=2; compix+=1
-            continue
-        if qua=='+':
-            if not match_token(sarg[compix], tokens[tix]): return 0
-            compix+=1
-        if qua in ('*','+'):
-            if tix<tkL-2:
-                while compix<len(sarg) and match_token(sarg[compix], tokens[tix]) and\
-                            not match_token(sarg[compix], tokens[tix+2]): 
-                    compix+=1
-            else:
-                while compix<len(sarg) and match_token(sarg[compix], tokens[tix]):
-                    compix+=1
-            tix+=2                    
-        else:
-            if not match_token(sarg[compix], tokens[tix]): return 0
-            else: compix+=1; tix+=1
-    return compix
-
-def try_nindex(sarg, start, tokens):
+def try_index(sarg, start, tokens, nested):
     compix = start; tix = 0; tkL = len(tokens)
     while tix<tkL:
         if compix == len(sarg): return 0
         ctk = tokens[tix]
-        if ctk[0] == '(':
+        if nested and ctk[0] == '(':
             subtokens = TokenListCache[ctk]
-            subcompix = try_nindex(sarg, compix, subtokens)
+            subcompix = try_index(sarg, compix, subtokens, nested)
             c = 'n'
             if tix<tkL-1: c = tokens[tix+1]
             if not subcompix:
@@ -116,7 +90,7 @@
             elif c in ('*', '+'):
                 while subcompix:
                     compix = subcompix
-                    subcompix = try_nindex(sarg, compix, subtokens)
+                    subcompix = try_index(sarg, compix, subtokens, nested)
                 tix+=2; continue
         qua = 'n'
         if tix<tkL-1: qua = tokens[tix+1]
@@ -144,10 +118,8 @@
     try: tokens = TokenListCache[pattern]
     except: tokens = parse_pattern(pattern, nested)
     argix=start
-    if not nested: try_index=try_sindex 
-    else: try_index=try_nindex
     while argix<len(sarg):
-        n = try_index(sarg, argix, tokens)
+        n = try_index(sarg, argix, tokens, nested)
         if n: return (argix, n)
         argix += 1
     return (0,0)

History