def match_region(ch, pattern):
    """Match a single character against a bracket-expression token.

    pattern is the raw region token including the enclosing brackets,
    e.g. '[abc]', '[a-z0-9]' or '[^abc]'.  Inside the brackets it may
    hold literal comparison chars, ranges such as 'a-z', and the
    characters ']' '\\' '-' escaped with a backslash.  Returns True when
    ch belongs to the region (inverted for a negated '[^...]' region).
    """
    booly = True; booln = False; i = 1
    if pattern[1] == '^': booly = False; booln = True; i = 2
    # BUG FIX: stop before the closing ']' so it is never treated as a
    # literal comparison char (previously match_region(']', '[abc]')
    # returned True, and ']' against '[^abc]' returned False).
    last = len(pattern) - 1
    while i < last:
        if pattern[i] == '\\' and pattern[i+1] in (']', '\\', '-'):
            # escaped literal
            if ch == pattern[i+1]: return booly
            else: i += 2
        elif i < len(pattern)-2 and pattern[i+1] == '-':
            # character range start-end
            if pattern[i] <= ch <= pattern[i+2]: return booly
            else: i += 3  # skip the whole range (was i += 2)
        elif pattern[i] == ch: return booly
        else: i += 1
    return booln
def match_token(ch, token):
    """Return True when the single character ch satisfies token.

    A token is a region '[...]', an escape '\\x', the wildcard '.', or a
    plain literal character (only its first char is compared).
    """
    lead = token[0]
    if lead == '[':
        return match_region(ch, token)
    if lead == '\\':
        return ch == token[1]
    if token == '.':
        return True
    return ch == lead
# Maps a pattern string (or a parenthesised sub-pattern token such as
# '(ab)') to its parsed token list; try_index looks group tokens up here.
TokenListCache = {} # essential with nesting, otherwise only here for optimization
# Characters that must be backslash-escaped in a pattern; in a nested
# (grouped) pattern the parentheses must be escaped as well.
to_escape = ('.','*','+','?','\\','[',']'); esc_brackets = to_escape + ('(', ')')
def parse_pattern(pattern, nested):
    """Split pattern into a flat token list (cached in TokenListCache).

    tokens are:
    1. patterns included in brackets (parsing is recursive)
    2. regions
    3. \\ together with the escaped character
    4. periods
    5. simple characters
    6. quantifiers, only following tokens 1-5

    Raises ValueError for an unmatched '[' or '(' and for a backslash
    followed by a character that needs no escaping.
    """
    if nested: ESC = esc_brackets
    else: ESC = to_escape
    tokens = []
    i = 0
    while i < len(pattern):
        c = pattern[i]
        if c == '[':
            k = pattern.find(']', i)
            if k == -1: raise ValueError('Unmatched [ in '+pattern)
            # a ']' preceded by a backslash is escaped: keep scanning
            while pattern[k-1] == '\\':
                k = pattern.find(']', k+1)
                if k == -1: raise ValueError('Unmatched [ in '+pattern)
            tokens.append(pattern[i:k+1])
            i = k+1
        elif c == '.':
            tokens.append('.'); i += 1
        elif c == '\\':
            if i < len(pattern)-1 and pattern[i+1] in ESC:
                tokens.append(pattern[i:i+2]); i += 2
            else:
                raise ValueError('Bad successor of a backslash')
        elif nested and c == '(':
            # collect the whole parenthesised group, honouring escapes
            # and nested parentheses; k ends one past the closing ')'
            resu = '('; k = i+1; lv = 1
            while lv > 0:
                if k >= len(pattern):
                    # was an IndexError before; report it like '[' does
                    raise ValueError('Unmatched ( in '+pattern)
                cc = pattern[k]
                if cc == '\\': resu += cc; resu += pattern[k+1]; k += 2; continue
                if cc == '(': lv += 1
                elif cc == ')': lv -= 1
                resu += cc; k += 1
            # parse the group body once; try_index fetches it from here
            if resu not in TokenListCache:
                TokenListCache[resu] = parse_pattern(resu[1:-1], nested=True)
            tokens.append(resu)
            if k < len(pattern) and pattern[k] in ('*', '+', '?'):
                tokens.append(pattern[k]); k += 1
            # BUG FIX: was `i = k+1`, which silently dropped the
            # character immediately following every group
            i = k
            continue  # group quantifier already consumed above
        else:
            tokens.append(c); i += 1
        if i < len(pattern) and pattern[i] in ('*', '+', '?'):
            tokens.append(pattern[i]); i += 1
    TokenListCache[pattern] = tokens
    return tokens
def try_index(sarg, start, tokens, nested):
    """Try to match the whole token list against sarg from index start.

    Returns the index one past the last matched character, or 0 for no
    match.  NOTE: 0 doubles as the failure sentinel, so a match ending
    at index 0 is indistinguishable from a failure -- a quirk preserved
    from the original design that callers (xsearch) rely on.
    """
    compix = start; tix = 0; tkL = len(tokens)
    while tix < tkL:
        ctk = tokens[tix]
        if compix == len(sarg):
            # BUG FIX: exhausting the input used to fail outright even
            # when every remaining token was optional (e.g. pattern
            # 'ab*' failed to match 'a').  Skip '?'/'*' tokens instead.
            qua = tokens[tix+1] if tix < tkL-1 else 'n'
            if qua in ('?', '*'):
                tix += 2
                continue
            return 0
        if nested and ctk[0] == '(':
            subtokens = TokenListCache[ctk]
            subcompix = try_index(sarg, compix, subtokens, nested)
            c = 'n'
            if tix < tkL-1: c = tokens[tix+1]
            if not subcompix:
                # group failed: acceptable only when it is optional
                # (BUG FIX: compix was reset to 0 -- the failed
                # subcompix -- rewinding the match to string start)
                if c in ('?', '*'):
                    tix += 2
                    continue
                return 0
            if c in ('*', '+'):
                # greedy: re-apply the group while it keeps matching
                while subcompix:
                    compix = subcompix
                    subcompix = try_index(sarg, compix, subtokens, nested)
                tix += 2
            elif c == '?':
                compix = subcompix
                tix += 2
            else:
                # BUG FIX: a matched group with no quantifier used to
                # leave compix unadvanced and jump tix by 2, skipping
                # the token that follows the group
                compix = subcompix
                tix += 1
            continue
        qua = 'n'
        if tix < tkL-1: qua = tokens[tix+1]
        if qua == '?':
            if match_token(sarg[compix], ctk):
                compix += 1
            # BUG FIX: always step past the optional token; failing to
            # advance tix on a mismatch spun forever in this loop
            tix += 2
            continue
        if qua == '+':
            if not match_token(sarg[compix], ctk): return 0
            compix += 1
        if qua in ('*', '+'):
            if tix < tkL-2:
                # consume greedily but stop as soon as the following
                # token would match (cheap one-token lookahead)
                while compix < len(sarg) and match_token(sarg[compix], ctk) and \
                      not match_token(sarg[compix], tokens[tix+2]):
                    compix += 1
            else:
                while compix < len(sarg) and match_token(sarg[compix], ctk):
                    compix += 1
            tix += 2
        else:
            if not match_token(sarg[compix], ctk): return 0
            compix += 1; tix += 1
    return compix
def xsearch(sarg, pattern, start=0, nested=False):
    """Search sarg for pattern, beginning at index start.

    Returns (begin, end) of the first match, where end is one past the
    last matched character, or (0, 0) when nothing matches.  Pass
    nested=True to enable parenthesised sub-patterns.

    NOTE(review): TokenListCache is keyed on the pattern text alone, so
    a pattern first parsed under one `nested` flag is reused verbatim
    for the other -- confirm callers never mix the two flags for the
    same pattern string.
    """
    try:
        tokens = TokenListCache[pattern]
    except KeyError:  # was a bare except:; only a cache miss is expected
        tokens = parse_pattern(pattern, nested)
    argix = start
    while argix < len(sarg):
        n = try_index(sarg, argix, tokens, nested)
        if n: return (argix, n)
        argix += 1
    return (0, 0)
Diff to Previous Revision
--- revision 2 2010-06-01 18:59:54
+++ revision 3 2010-06-01 20:32:42
@@ -1,5 +1,5 @@
def match_region(ch, pattern):
- """ region: a list of comparation chars and ranges in [] or [^] """
+ """ region: a list of comparation chars and ranges in [] or [^]"""
booly = True; booln = False; i=1
if pattern[1]=='^': booly = False; booln = True; i=2
while i < len(pattern):
@@ -74,40 +74,14 @@
TokenListCache[pattern]=tokens
return tokens
-def try_sindex(sarg, start, tokens): # s for "simple"
- compix = start; tix = 0; tkL = len(tokens)
- while tix<tkL:
- if compix == len(sarg): return 0
- qua = 'n'
- if tix<tkL-1: qua = tokens[tix+1]
- if qua=='?':
- if match_token(sarg[compix], tokens[tix]): tix+=2; compix+=1
- continue
- if qua=='+':
- if not match_token(sarg[compix], tokens[tix]): return 0
- compix+=1
- if qua in ('*','+'):
- if tix<tkL-2:
- while compix<len(sarg) and match_token(sarg[compix], tokens[tix]) and\
- not match_token(sarg[compix], tokens[tix+2]):
- compix+=1
- else:
- while compix<len(sarg) and match_token(sarg[compix], tokens[tix]):
- compix+=1
- tix+=2
- else:
- if not match_token(sarg[compix], tokens[tix]): return 0
- else: compix+=1; tix+=1
- return compix
-
-def try_nindex(sarg, start, tokens):
+def try_index(sarg, start, tokens, nested):
compix = start; tix = 0; tkL = len(tokens)
while tix<tkL:
if compix == len(sarg): return 0
ctk = tokens[tix]
- if ctk[0] == '(':
+ if nested and ctk[0] == '(':
subtokens = TokenListCache[ctk]
- subcompix = try_nindex(sarg, compix, subtokens)
+ subcompix = try_index(sarg, compix, subtokens, nested)
c = 'n'
if tix<tkL-1: c = tokens[tix+1]
if not subcompix:
@@ -116,7 +90,7 @@
elif c in ('*', '+'):
while subcompix:
compix = subcompix
- subcompix = try_nindex(sarg, compix, subtokens)
+ subcompix = try_index(sarg, compix, subtokens, nested)
tix+=2; continue
qua = 'n'
if tix<tkL-1: qua = tokens[tix+1]
@@ -144,10 +118,8 @@
try: tokens = TokenListCache[pattern]
except: tokens = parse_pattern(pattern, nested)
argix=start
- if not nested: try_index=try_sindex
- else: try_index=try_nindex
while argix<len(sarg):
- n = try_index(sarg, argix, tokens)
+ n = try_index(sarg, argix, tokens, nested)
if n: return (argix, n)
argix += 1
return (0,0)