# WARNING: Former revisions were not correct in damming greed of the star, they didn't stop
# "[^b]*[b]*" in xmatch("aaba", "a[^b]*b*aba") to eat up the second and third character
# returning mismatch. And also greed of ? is existing, which wasn't dammed at all.
# You can edit these constants
# To avoid backslash inflation for example or to use {...} for groups
# Metacharacters of the pattern syntax.  Editing these constants changes the
# surface syntax everywhere (e.g. '{'/'}' instead of '('/')' for groups).
ANYCHAR = '.'   # matches any single character
ESCAPE = '\\'   # escape prefix; also introduces \s, \S and back references \0..\9
REGIONA = '['; REGIONO = ']'; RANGE = '-'; COMPLEMENT = '^'   # character-class syntax
GROUPA = '(' ; GROUPO = ')'   # group delimiters (nested mode only)
ONEoZERO = '?' ; ONEoMORE = '+' ; ZEROoMORE = '*'   # quantifiers
ALTERNATIVE = '|'   # alternation separator inside groups
EXTENSION = '(?'    # prefix of the extended groups (?: (?= (?!
def match_region(ch, pattern):
    """Match one character ch against a character-class token.

    pattern is the complete class including delimiters, e.g. "[a-z]" or
    "[^0-9]".  Returns True/False; a leading COMPLEMENT swaps the truth
    values via booly/booln.
    """
    # booly: result on a hit inside the class; booln: result when the scan
    # falls through without a hit.  i starts past '[' (and past '^').
    if pattern[1]==COMPLEMENT: booly=False; booln=True; i=2
    else: booly=True; booln=False; i=1
    while i < len(pattern):
        if pattern[i]==ESCAPE:
            # escaped literal member, e.g. [\]]
            if pattern[i+1]==ch: return booly
            else: i+=2
        elif i<len(pattern)-2 and pattern[i+1]==RANGE:
            # inclusive range start-end, e.g. a-z
            if pattern[i]<=ch<=pattern[i+2]: return booly
            # NOTE(review): i+=2 lands on the range's END character, which is
            # then re-tested as a plain member (harmless for ordinary classes,
            # since the end char is inside the range anyway) and can start a
            # new range in pathological patterns like "[a-c-e]" — confirm
            # whether i+=3 was intended.
            else: i+=2
        elif pattern[i]==ch: return booly
        else: i+=1
    return booln
def match_simple_token(ch, token):
    """Decide whether the single character ch matches a non-group token.

    token is a character class, an escape pair (\\x, \\s, \\S), the ANYCHAR
    wildcard, or a plain literal character.
    """
    lead = token[0]
    if lead == ESCAPE:
        esc = token[1]
        whitespace = (' ', '\t', '\r', '\n', '\f', '\v')
        if esc == 's':
            return ch in whitespace
        if esc == 'S':
            return ch not in whitespace
        return ch == esc            # escaped literal
    if lead == REGIONA:
        return match_region(ch, token)
    # wildcard matches anything; otherwise require the exact literal
    return lead == ANYCHAR or ch == lead
# Cache mapping a (sub)pattern string to its parsed token list.  The matcher
# looks nested group bodies up here by key, so with nesting the cache is part
# of the mechanism, not just a speed-up.
TokenListCache = {} # essential with nesting, otherwise only here for optimization
# Captured substrings of the current (nested) match attempt, in completion
# order; read by back references \0..\9 and by xreplace.
xGroups = []
def strip_groupattris(s):
    """Return the bare body of a group token.

    Strips the leading '(' — or a three-character extension prefix
    '(?:', '(?=', '(?!' — and the trailing ')'.  When the token carries a
    trailing quantifier (e.g. "(ab)*"), the last TWO characters are dropped.
    """
    for attri in (':', '=', '!'):
        if s.startswith(EXTENSION + attri):
            s = s[3:]           # drop the whole "(?x" prefix
            break
    else:
        s = s[1:]               # plain group: drop only '('
    if s.endswith(')'):
        return s[:-1]
    return s[:-2]               # ')' followed by a quantifier character
def parse_pattern(pattern, nested):
    """Tokenize pattern into a list of token strings.

    tokens are:
    1. patterns included in brackets (parsing is recursive)
    2. regions
    3. \\ together with the escaped character
    4. periods
    5. simple characters
    All together with eventually trailing quantifiers

    A trailing '?' or '*' is appended to its token; 'x+' is expanded into the
    two tokens 'x' and 'x*'.  In nested mode every group token's body is
    recursively parsed and stored in TokenListCache, keyed by the stripped
    body; the full pattern's token list is registered there too.
    """
    tokens=[]; i=0
    while i < len(pattern):
        c = pattern[i]
        if c==REGIONA:
            # character class: find the first unescaped closing bracket
            k = pattern.find(REGIONO, i)
            if k==-1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
            while pattern[k-1] == ESCAPE:
                # NOTE(review): a literal backslash right before ']' (i.e.
                # "\\\\]") is wrongly read as an escaped ']' here — confirm.
                k = pattern.find(']', k+1)
                if k==-1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
            tokens.append(pattern[i:k+1])
            i=k+1
        elif c == ANYCHAR:
            tokens.append(ANYCHAR); i+=1
        elif c == ESCAPE:
            # keep the backslash together with the character it escapes
            if i<len(pattern)-1: tokens.append(pattern[i:i+2]); i+=2
            else: raise ValueError('Trailing '+ESCAPE)
        elif nested and c==GROUPA:
            # copy the whole (possibly nested) group verbatim as one token,
            # tracking the nesting level lv and skipping escaped parentheses
            resu = GROUPA; k=i+1; lv=1
            while lv > 0:
                cc = pattern[k]
                if cc == ESCAPE: resu+=cc; resu+=pattern[k+1]; k+=2; continue
                if cc == GROUPA: lv+=1
                elif cc == GROUPO: lv-=1
                resu+=cc; k+=1
            i=k; tokens.append(resu)
        else:
            tokens.append(c); i+=1
        # tokens done, quantifiers:
        if i<len(pattern):
            if pattern[i] in (ONEoZERO, ZEROoMORE): tokens[-1] += pattern[i]; i+=1
            elif pattern[i] == ONEoMORE: tokens.append(tokens[-1]+ZEROoMORE); i+=1
        # quantifiers done, recursive call for groups:
        if nested:
            ctk = tokens[-1]
            if ctk[0]==GROUPA:
                kore = strip_groupattris(ctk)
                if kore not in TokenListCache.keys():
                    TokenListCache[kore]=parse_pattern(kore, True)
    # registration in the TokenListCache:
    if pattern not in TokenListCache.keys(): TokenListCache[pattern]=tokens
    return tokens
def listsplit(larg, sep=None):
    """Split the token list larg on separator tokens into a list of sub-lists.

    sep defaults to ALTERNATIVE ('|'), preserving the original call signature;
    passing it explicitly also makes the function testable in isolation.
    A list without separators yields [larg]; separators at the edges produce
    empty sub-lists, matching str.split-like semantics.

    Bug fixed: the original called larg.index(ALTERNATIVE) WITHOUT a start
    index, so with two or more separators the same first occurrence was found
    again (ixx < ix), appending bogus empty slices and looping forever.
    """
    if sep is None:
        sep = ALTERNATIVE
    resu = []; ix = 0
    while sep in larg[ix:]:
        ixx = larg.index(sep, ix)   # search must start at ix, not at 0
        resu.append(larg[ix:ixx])
        ix = ixx + 1
    resu.append(larg[ix:])
    return resu
def dam_greed(sarg, cmpi, tokens, tix, nested):
    """Curb the greed of the quantified token at index tix.

    Called while that token is still consuming characters.  Looks ahead to
    the next token that could decide the match and, if it matches at
    position cmpi, tries to complete the whole remaining pattern from there.
    Returns the end index of a successful overall match, or 0.
    """
    tkL = len(tokens)
    if not tix < tkL - 1:
        return 0                    # last token: nothing follows to protect
    i = 1
    # Skip follow-up tokens that are themselves optional ('?'/'*'): they can
    # match empty, so the decisive token is the next non-optional one.
    # NOTE(review): endswith() also fires on escaped literals like "\\*",
    # which are quantifier *characters*, not quantifiers — confirm intent.
    while tokens[tix+i].endswith((ONEoZERO, ZEROoMORE)):
        i += 1
        if tix + i >= tkL:          # bug fix: was "i==tkL", overrunning the
            i = 1                   # list whenever tix > 0
            break
    if cmpi >= len(sarg):
        return 0                    # bug fix: sarg[cmpi] raised IndexError here
    if nested and match_token(sarg, cmpi, tokens[tix+i]) or \
       not nested and match_simple_token(sarg[cmpi], tokens[tix+i]):
        # the decisive token matches here: attempt the full remainder
        return try_index(sarg, cmpi, tokens[tix+i:], nested)
    return 0
def match_token(sarg, cmpi, token):
    """True if token matches sarg at position cmpi.

    Group tokens are resolved through TokenListCache and succeed when any of
    their top-level alternatives matches; everything else is delegated to
    match_simple_token.
    """
    if token[0] != GROUPA:
        return match_simple_token(sarg[cmpi], token)
    alternatives = listsplit(TokenListCache[strip_groupattris(token)])
    return any(try_index(sarg, cmpi, alt, True) for alt in alternatives)
def try_index(sarg, start, tokens, nested):
    """Match the token list against sarg beginning at position start.

    Returns the index just past the last matched character on success, or 0
    on failure.  (A zero-length match at position 0 is therefore
    indistinguishable from failure — a known limitation of this convention.)

    Fixes relative to the previous revision:
    - the '?'/'*'/plain handling of simple tokens is a proper if/elif/else
      chain (the flat version fell through and re-matched — or crashed on —
      patterns as simple as "a?");
    - the back-reference damming calls used `subcmpi`, which is unbound
      unless a group token was processed earlier in the same call;
    - a plain back reference now advances cmpi past the captured text;
    - an optional back reference ('\\1?'/'\\1*') no longer fails hard when
      the captured text is absent;
    - an empty capture can no longer spin the '\\N*' loop forever.
    """
    cmpi = start; tix = 0; tkL = len(tokens)
    while tix < tkL:
        if cmpi == len(sarg):
            # Input exhausted: succeed only if every remaining token is optional.
            # NOTE(review): s[-1] also accepts escaped literals like "\\*" as
            # optional here — same quirk as in dam_greed.
            for s in tokens[tix:]:
                if s[-1] not in (ONEoZERO, ZEROoMORE):
                    return 0
            return cmpi
        ctk = tokens[tix]
        # Escaped quantifier characters (\*, \+, \?) are literals, not quantifiers.
        if ctk in (ESCAPE+ZEROoMORE, ESCAPE+ONEoMORE, ESCAPE+ONEoZERO):
            qua = 'n'
        else:
            qua = ctk[-1]
        if qua in (ZEROoMORE, ONEoZERO):
            ctk = ctk[:-1]          # strip the quantifier off the token
        if nested:
            if ctk[0] == GROUPA:
                # Try each top-level alternative of the group at cmpi.
                subtokens = listsplit(TokenListCache[strip_groupattris(ctk)])
                for subtk in subtokens:
                    subcmpi = try_index(sarg, cmpi, subtk, True)
                    if subcmpi:
                        break
                else:
                    # No alternative matched: optional groups and negative
                    # lookaheads are simply skipped, anything else fails.
                    if qua in (ONEoZERO, ZEROoMORE) or ctk.startswith(EXTENSION+'!'):
                        tix += 1
                        continue
                    return 0
                # Lookaheads match without consuming input.
                if ctk.startswith(EXTENSION+'='):
                    tix += 1
                    continue
                if ctk.startswith(EXTENSION+'!'):
                    return 0        # negative lookahead DID match
                if qua == ONEoZERO:
                    n = dam_greed(sarg, subcmpi, tokens, tix, nested)
                    if n:
                        return n
                if qua == ZEROoMORE:
                    # NOTE(review): this repeats only the alternative that
                    # matched first (subtk) and extends at most once before
                    # breaking — deep repetition of groups is not fully
                    # supported; confirm against the intended semantics.
                    while subcmpi:
                        n = dam_greed(sarg, subcmpi, tokens, tix, nested)
                        if n:
                            return n
                        subcmpi = try_index(sarg, subcmpi, subtk, True)
                        if subcmpi:
                            break
                if not ctk.startswith(EXTENSION+':'):
                    xGroups.append(sarg[cmpi:subcmpi])   # record the capture
                cmpi = subcmpi
                tix += 1
                continue
            # Back reference \0..\9 to an earlier capture.
            if ctk[0] == ESCAPE and '0' <= ctk[1] <= '9':
                i = int(ctk[1]); captured = xGroups[i]; lg = len(captured)
                if sarg[cmpi:cmpi+lg] != captured:
                    if qua in (ONEoZERO, ZEROoMORE):
                        tix += 1    # optional back reference may match empty
                        continue
                    return 0
                if lg == 0:         # empty capture: consume nothing, move on
                    tix += 1        # (guards against an endless '*' loop)
                    continue
                if qua == ZEROoMORE:
                    while sarg[cmpi:cmpi+lg] == captured:
                        n = dam_greed(sarg, cmpi, tokens, tix, nested)
                        if n:
                            return n
                        cmpi += lg
                elif qua == ONEoZERO:
                    n = dam_greed(sarg, cmpi, tokens, tix, nested)
                    if n:
                        return n
                    cmpi += lg
                else:
                    cmpi += lg      # plain back reference consumes once
                tix += 1
                continue
        # From here on a simple (non-group) token; dam the greed of '?'/'*'
        # by checking whether the rest of the pattern could match instead.
        if qua == ONEoZERO:
            n = dam_greed(sarg, cmpi, tokens, tix, nested)
            if n:
                return n
            if match_simple_token(sarg[cmpi], ctk):
                cmpi += 1
            tix += 1
        elif qua == ZEROoMORE:
            while cmpi < len(sarg) and match_simple_token(sarg[cmpi], ctk):
                n = dam_greed(sarg, cmpi, tokens, tix, nested)
                if n:
                    return n
                cmpi += 1
            tix += 1
        else:
            if not match_simple_token(sarg[cmpi], ctk):
                return 0
            cmpi += 1
            tix += 1
    return cmpi
def xsearch(sarg, pattern, nested=False, start=0):
    """Find the first match of pattern in sarg at or after position start.

    Returns a (begin, end) pair, or the empty tuple () when there is no
    match.  With nested=True the capture list xGroups is reset before each
    attempt, so after a hit it holds that match's captures.
    """
    global xGroups
    try:
        tokens = TokenListCache[pattern]
    except KeyError:                # was a bare except: only a cache miss is expected
        tokens = parse_pattern(pattern, nested)
    argix = start
    while argix < len(sarg):
        if nested:
            xGroups = []            # fresh captures for every start position
        n = try_index(sarg, argix, tokens, nested)
        if n:
            return (argix, n)
        argix += 1
    return ()
def xfinditer(sarg, pattern, nested=False, start=0):
    """Yield (begin, end) pairs of successive non-overlapping matches.

    Each search resumes at the end of the previous match; positions that do
    not match are skipped one by one.
    NOTE(review): a zero-length match (n == start) would make no progress
    and repeat forever — same behavior as the previous revision; confirm
    whether zero-length matches can occur with the supported patterns.
    """
    global xGroups
    try:
        tokens = TokenListCache[pattern]
    except KeyError:                # was a bare except
        tokens = parse_pattern(pattern, nested)
    n = 0
    while start < len(sarg):
        if n:
            start = n               # resume after the previous match
        if nested:
            xGroups = []
        n = try_index(sarg, start, tokens, nested)
        if n:
            yield (start, n)
        else:
            start += 1
    # Bug fix: "raise StopIteration()" inside a generator is converted to a
    # RuntimeError since PEP 479 (Python 3.7+); a plain return ends iteration.
    return
def xmatch(sarg, pattern, nested=False):
    """Return True when the WHOLE of sarg matches the pattern.

    A match is accepted only if try_index consumes sarg completely; an empty
    sarg matches patterns whose tokens are all optional.
    """
    global xGroups
    try:
        tokens = TokenListCache[pattern]
    except KeyError:                # was a bare except: only a cache miss is expected
        tokens = parse_pattern(pattern, nested)
    if nested:
        xGroups = []                # fresh capture list for this attempt
    return try_index(sarg, 0, tokens, nested) == len(sarg)
def xsplit(sarg, pattern, nested=False):
    """Split sarg on every match of pattern; return the list of pieces.

    re.split-like: the pieces between (and around) the matches, including a
    possibly empty trailing piece.

    Bugs fixed:
    - each piece was sliced sarg[:match_begin], i.e. always from position 0,
      so later pieces wrongly repeated all earlier text;
    - the final statement returned `resu + sarg[residue:]` — a list plus a
      string, which raised TypeError on every call.
    """
    resu = []; residue = 0
    xpair = xsearch(sarg, pattern, nested=nested)
    while xpair:
        resu.append(sarg[residue:xpair[0]])   # text since the previous match
        residue = xpair[1]
        xpair = xsearch(sarg, pattern, nested, residue)
    resu.append(sarg[residue:])               # trailing残 piece after the last match
    return resu
def xreplace(sarg, pattern, subst, nested=False):
    """Return sarg with every non-overlapping match of pattern replaced by subst.

    With nested=True, occurrences of \\0..\\9 in subst are first replaced by
    the corresponding captured groups.
    """
    to_replace=[]; s=sarg; xpair=xsearch(sarg, pattern, nested)
    while xpair:
        to_replace.append(xpair); xpair=xsearch(sarg, pattern, nested, xpair[1])
    if nested:
        # NOTE(review): xGroups holds only the captures of the LAST successful
        # match attempt, and the same expanded subst is applied to every match
        # — confirm that per-match group substitution is not intended.
        for i in range(10):
            try: subst=subst.replace(ESCAPE+str(i), xGroups[i])
            except IndexError: break
    # replace back-to-front so earlier spans keep their original indices
    for xpair in reversed(to_replace): s = s[:xpair[0]]+subst+s[xpair[1]:]
    return s
def xfuncreplace(sarg, pattern, f, nested=False):
    """Replace every match of pattern in sarg by f(matched_text).

    The callable f receives each matched substring and returns its
    replacement.  Matches are collected first, then substituted from the
    back of the string so earlier spans keep their indices.
    """
    spans = []
    span = xsearch(sarg, pattern, nested)
    while span:
        spans.append(span)
        span = xsearch(sarg, pattern, nested, span[1])
    out = sarg
    for begin, end in reversed(spans):
        out = out[:begin] + f(out[begin:end]) + out[end:]
    return out
# --- Appended revision-history text (scraped from the recipe page, not
# --- Python) — kept verbatim but commented out so the file stays importable:
# Diff to Previous Revision
# --- revision 23 2010-06-10 20:45:00
# +++ revision 24 2010-06-10 21:21:08
# @@ -1,10 +1,9 @@
# -# You can edit these constants
# -# To avoid backslash inflation for example or to use {...} for groups
# -
#  # WARNING: Former revisions were not correct in damming greed of the star, they didn't stop
#  # "[^b]*[b]*" in xmatch("aaba", "a[^b]*b*aba") to eat up the second and third character
#  # returning mismatch. And also greed of ? is existing, which wasn't dammed at all.
# +# You can edit these constants
# +# To avoid backslash inflation for example or to use {...} for groups
#  ANYCHAR = '.'
#  ESCAPE = '\\'
# @@ -139,8 +138,7 @@
#  ctk = tokens[tix]
#  if ctk in (ESCAPE+ZEROoMORE, ESCAPE+ONEoMORE, ESCAPE+ONEoZERO): qua='n'
#  else: qua=ctk[-1]
# - quantified = (qua in (ZEROoMORE, ONEoZERO))
# - if quantified: ctk=ctk[:-1]
# + if qua in (ZEROoMORE, ONEoZERO): ctk=ctk[:-1]
#  if nested:
#  if ctk[0] == GROUPA:
#  subtokens = listsplit(TokenListCache[strip_groupattris(ctk)])