# WARNING: Former revisions were not correct in damming greed of the star, they didn't stop
# "[^b]*b*" in xmatch("aaba", "a[^b]*b*aba") to eat up the second and third character
# returning mismatch. And also greed of ? is existing, which wasn't dammed at all.
ANYCHAR = '.'; ESCAPE = '\\'
REGIONA = '['; REGIONO = ']'; RANGE = '-'; COMPLEMENT = '^'
GROUPA = '(' ; GROUPO = ')'; ALTERNATIVE = '|'
PERHAPS = '?' ; STAR = '*'; JUST_ONE_and_STAR = '+'
EXTENSION = '(?'
def match_region(ch, pattern):
    """ region: a list of comparation chars and ranges in [] or [^]

    pattern includes the surrounding brackets.  Returns True when ch is
    listed (or falls inside an a-b range) in a plain region, False when
    it is not; the answers are inverted for a complemented [^...] region.

    Fixes: the scan loop formerly ran up to and including the closing
    ']' so e.g. match_region(']', '[abc]') wrongly reported a hit, and a
    failed range advanced only 2 positions, re-testing the range's end
    char standalone.
    """
    if pattern[1]==COMPLEMENT: booly=False; booln=True; i=2
    else: booly=True; booln=False; i=1
    # stop before the closing REGIONO so ']' itself is never a member
    while i < len(pattern)-1:
        if pattern[i]==ESCAPE:
            # escaped member: compare the char behind the backslash
            if pattern[i+1]==ch: return booly
            else: i+=2
        elif i<len(pattern)-2 and pattern[i+1]==RANGE:
            if pattern[i]<=ch<=pattern[i+2]: return booly
            else: i+=3  # skip start, '-' and end of the failed range
        elif pattern[i]==ch: return booly
        else: i+=1
    return booln
def match_simple_token(ch, token):
    """Match one char against a simple token: an escape pair (\\s, \\S
    or a literal escape), a [..] region, the '.' wildcard, or a literal
    character."""
    c0=token[0]
    if c0==ESCAPE:
        c1=token[1]
        if c1=='s': return (ch in (' ','\t','\r','\n','\f','\v'))
        elif c1=='S': return (ch not in (' ','\t','\r','\n','\f','\v'))
        else: return (ch==c1)
    elif c0==REGIONA: return match_region(ch,token)
    elif c0==ANYCHAR: return True
    else: return (ch==c0)
# Module-level state shared by the parser and the matcher.
TokenListCache = {} # essential with nesting, otherwise only here for optimization
xGroups = []  # captured group substrings of the current match (back references \0..\9)
def strip_groupattris(s):
    """Strip the opening '(' / '(?:' / '(?=' / '(?!' and the closing ')'
    from a group token, returning the bare inner pattern text."""
    prefix_len = 1
    for attr in (':', '=', '!'):
        if s.startswith(EXTENSION + attr):
            prefix_len = 3
            break
    s = s[prefix_len:]
    # NOTE(review): group tokens built by parse_pattern always end with
    # ')', so the s[:-2] branch looks unreachable — preserved as-is.
    if s.endswith(')'):
        return s[:-1]
    return s[:-2]
def parse_pattern(pattern, nested):
    """
    Parse pattern into a list of (token, quantifier) pairs.

    tokens are:
    1. patterns included in brackets (parsing is recursive)
    2. regions
    3. \\ together with the escaped character
    4. periods
    5. simple characters
    All together with eventually trailing quantifiers

    The quantifier half of each pair is PERHAPS ('?'), STAR ('*') or
    None; 'X+' is rewritten as 'X' followed by 'X*'.  Results are
    memoized in TokenListCache; each group token is additionally cached
    under its full '(...)' text as a list of alternative token lists.
    """
    if pattern in TokenListCache.keys(): return TokenListCache[pattern]
    tokens=[]; i=0; pL = len(pattern)
    while i < pL:
        c = pattern[i]
        if c==REGIONA:
            # region token: scan for the first unescaped closing bracket
            k = pattern.find(REGIONO, i)
            if k==-1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
            while pattern[k-1] == ESCAPE:
                k = pattern.find(REGIONO, k+1)
                if k==-1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
            tokens.append(pattern[i:k+1]); i=k+1
        elif c == ANYCHAR: tokens.append(ANYCHAR); i+=1
        elif c == ESCAPE:
            # keep the backslash together with the escaped character
            if i<pL-1: tokens.append(pattern[i:i+2]); i+=2
            else: raise ValueError('Trailing '+ESCAPE)
        elif nested and c==GROUPA:
            # collect the whole (possibly nested) group, tracking depth lv
            resu = GROUPA; k=i+1; lv=1
            while lv > 0:
                cc = pattern[k]
                if cc == ESCAPE: resu+=cc; resu+=pattern[k+1]; k+=2; continue
                if cc == GROUPA: lv+=1
                elif cc == GROUPO: lv-=1
                resu+=cc; k+=1
            tokens.append(resu); i=k; kore=strip_groupattris(resu)
            if resu not in TokenListCache.keys():
                TokenListCache[resu] = []
                # groups are parsed to lists of token lists, each an alternative from '|'
                if kore[0] != GROUPA:
                    for s in kore.split(ALTERNATIVE):
                        TokenListCache[resu].append(parse_pattern(s, True))
                else:
                    # inner text starts with another group: one single alternative
                    TokenListCache[resu].append(parse_pattern(kore, True))
        else: tokens.append(c); i+=1
        # attach a trailing quantifier (if any) to the token just appended
        if i<pL:
            if pattern[i]==PERHAPS: tokens[-1]=(tokens[-1],PERHAPS); i+=1
            elif pattern[i]==STAR: tokens[-1]=(tokens[-1],STAR); i+=1
            elif pattern[i]==JUST_ONE_and_STAR:
                # X+ is expanded to the mandatory X followed by X*
                tokens.append((tokens[-1],STAR)); tokens[-2]=(tokens[-2],None); i+=1
            else: tokens[-1] = (tokens[-1],None)
        else: tokens[-1] = (tokens[-1],None)
    TokenListCache[pattern]=tokens; return tokens
def try_index(sarg, start, tokens, nested):
    """Match the (token, quantifier) list against sarg from index start.

    Returns the index just past the matched stretch, or 0 on mismatch.
    NOTE(review): 0 doubles as "failure", so a zero-length match that is
    anchored at index 0 cannot be told apart from a mismatch.
    Greed of '?' and '*' is dammed by first trying the remaining tokens
    before consuming more input (see the header comment of this file).

    Fix: the simple-token '*' loop evaluated sarg[cmpi] BEFORE checking
    cmpi<L, raising IndexError whenever a starred token matched up to
    the end of sarg (e.g. xmatch("ab", "ab*")); the bounds check now
    comes first, as it did in the previous revision.
    """
    cmpi=start; tkL=len(tokens); L=len(sarg)
    for tix in range(tkL):
        if cmpi==L:
            # input exhausted: succeed only if every remaining token is optional
            for pair in tokens[tix:]:
                if not pair[1]: return 0
            else: return cmpi
        ctk, qua = tokens[tix]
        if qua and tix<tkL-1: # dam greed first, same code as below in *-loops again
            n = try_index(sarg, cmpi, tokens[tix+1:], nested)
            if n: return n
        if nested:
            if ctk[0] == GROUPA:
                # try each '|'-alternative of the group at the current position
                for subtk in TokenListCache[ctk]:
                    subcmpi = try_index(sarg, cmpi, subtk, True)
                    if subcmpi: break
                else:
                    # no alternative matched: fine for optional tokens and (?!...)
                    if qua or ctk.startswith(EXTENSION+'!'): continue
                    else: return 0
                if ctk.startswith(EXTENSION+'='): continue   # look-ahead consumes nothing
                elif ctk.startswith(EXTENSION+'!'): return 0 # negative look-ahead matched
                if qua==STAR:
                    while subcmpi and subcmpi<L:
                        if tix<tkL-1:
                            n = try_index(sarg, subcmpi, tokens[tix+1:], nested)
                            if n: return n
                        for subtk in TokenListCache[ctk]:
                            subcmpi = try_index(sarg, subcmpi, subtk, True)
                            if subcmpi: break
                        else: break
                # only non-'(?:' groups are captured for back references
                if not ctk.startswith(EXTENSION+':'): xGroups.append(sarg[cmpi:subcmpi])
                cmpi=subcmpi; continue # any group token done, back reference:
            elif ctk[0]==ESCAPE and '0'<= ctk[1] <='9':
                i=int(ctk[1]); captured=xGroups[i]; lg=len(captured)
                if sarg[cmpi:cmpi+lg]==captured:
                    cmpi+=lg
                    if qua==STAR:
                        while sarg[cmpi:cmpi+lg]==captured and cmpi<L:
                            if tix<tkL-1:
                                n = try_index(sarg, cmpi, tokens[tix+1:], True)
                                if n: return n
                            cmpi+=lg
                elif not qua: return 0
                continue # from here on tix points to a simple token:
        if match_simple_token(sarg[cmpi], ctk):
            cmpi+=1
            if qua==STAR:
                # bounds check BEFORE indexing sarg (bug fix, see docstring)
                while cmpi<L and match_simple_token(sarg[cmpi], ctk):
                    if tix<tkL-1:
                        n = try_index(sarg, cmpi, tokens[tix+1:], nested)
                        if n: return n
                    cmpi+=1
        elif not qua: return 0
    return cmpi
def xsearch(sarg, pattern, nested=False, start=0):
    """Find the first match of pattern in sarg at or after index start.

    Returns a (begin, end) index pair, or the empty tuple when the
    pattern occurs nowhere.
    """
    global xGroups
    tokens = parse_pattern(pattern, nested)
    for argi in range(start, len(sarg)):
        if nested:
            xGroups = []  # fresh capture buffer for every attempt
        n = try_index(sarg, argi, tokens, nested)
        if n:
            return (argi, n)
    return ()
def xfinditer(sarg, pattern, nested=False, start=0):
    """Yield (begin, end) pairs of successive non-overlapping matches of
    pattern in sarg, starting the search at index start.

    Fix: the generator formerly ended with ``raise StopIteration()``,
    which has been a RuntimeError inside generators since PEP 479
    (Python 3.7); it now simply falls off the end to finish.
    """
    global xGroups
    tokens = parse_pattern(pattern, nested); n = 0
    while start < len(sarg):
        if n: start = n          # resume right after the previous match
        if nested: xGroups = []  # fresh capture buffer for every attempt
        n = try_index(sarg, start, tokens, nested)
        if n: yield (start, n)
        else: start += 1
def xmatch(sarg, pattern, nested=False):
    """ checks, whether sarg as the whole matches the pattern """
    global xGroups
    tokens = parse_pattern(pattern, nested)
    if nested:
        xGroups = []  # fresh capture buffer for this match
    return try_index(sarg, 0, tokens, nested) == len(sarg)
def xsplit(sarg, pattern, nested=False):
    """Split sarg on every match of pattern and return the list of the
    pieces between matches (a one-element list holding sarg itself when
    there is no match).

    Fixes: each piece now starts after the previous match instead of at
    the beginning of sarg (the old code appended sarg[:begin], so pieces
    accumulated the whole prefix), and the trailing remainder is
    appended as a list element — the old ``resu+sarg[residue:]`` was
    list+str and raised TypeError on every call.
    """
    resu = []; residue = 0
    xpair = xsearch(sarg, pattern, nested=nested)
    while xpair:
        resu.append(sarg[residue:xpair[0]])
        residue = xpair[1]
        xpair = xsearch(sarg, pattern, nested, xpair[1])
    resu.append(sarg[residue:])
    return resu
def xreplace(sarg, pattern, subst, nested=False):
    """Replace every match of pattern in sarg with subst.

    With nested=True, back references \\0..\\9 in subst are expanded
    from the captures left in xGroups by the last search before the
    substitutions are applied.
    """
    spans = []
    hit = xsearch(sarg, pattern, nested)
    while hit:
        spans.append(hit)
        hit = xsearch(sarg, pattern, nested, hit[1])
    if nested:
        for i in range(10):
            try:
                subst = subst.replace(ESCAPE + str(i), xGroups[i])
            except IndexError:
                break
    result = sarg
    # replace back-to-front so earlier span indices stay valid
    for begin, end in reversed(spans):
        result = result[:begin] + subst + result[end:]
    return result
def xfuncreplace(sarg, pattern, f, nested=False):
    """Replace every match of pattern in sarg with f(matched_text)."""
    spans = []
    hit = xsearch(sarg, pattern, nested)
    while hit:
        spans.append(hit)
        hit = xsearch(sarg, pattern, nested, hit[1])
    result = sarg
    # replace back-to-front so earlier span indices stay valid
    for begin, end in reversed(spans):
        result = result[:begin] + f(result[begin:end]) + result[end:]
    return result
Diff to Previous Revision
--- revision 33 2010-06-12 00:17:40
+++ revision 34 2010-06-12 17:41:37
@@ -2,12 +2,10 @@
# "[^b]*b*" in xmatch("aaba", "a[^b]*b*aba") to eat up the second and third character
# returning mismatch. And also greed of ? is existing, which wasn't dammed at all.
-ANYCHAR = '.'
-ESCAPE = '\\'
+ANYCHAR = '.'; ESCAPE = '\\'
REGIONA = '['; REGIONO = ']'; RANGE = '-'; COMPLEMENT = '^'
-GROUPA = '(' ; GROUPO = ')'
-ONEoZERO = '?' ; ONEoMORE = '+' ; ZEROoMORE = '*'
-ALTERNATIVE = '|'
+GROUPA = '(' ; GROUPO = ')'; ALTERNATIVE = '|'
+PERHAPS = '?' ; STAR = '*'; JUST_ONE_and_STAR = '+'
EXTENSION = '(?'
def match_region(ch, pattern):
@@ -90,39 +88,37 @@
TokenListCache[resu].append(parse_pattern(kore, True))
else: tokens.append(c); i+=1
if i<pL:
- if pattern[i] in (ONEoZERO, ZEROoMORE): tokens[-1] += pattern[i]; i+=1
- elif pattern[i] == ONEoMORE: tokens.append(tokens[-1]+ZEROoMORE); i+=1
+ if pattern[i]==PERHAPS: tokens[-1]=(tokens[-1],PERHAPS); i+=1
+ elif pattern[i]==STAR: tokens[-1]=(tokens[-1],STAR); i+=1
+ elif pattern[i]==JUST_ONE_and_STAR:
+ tokens.append((tokens[-1],STAR)); tokens[-2]=(tokens[-2],None); i+=1
+ else: tokens[-1] = (tokens[-1],None)
+ else: tokens[-1] = (tokens[-1],None)
TokenListCache[pattern]=tokens; return tokens
def try_index(sarg, start, tokens, nested):
- cmpi=start; tkL=len(tokens)
+ cmpi=start; tkL=len(tokens); L=len(sarg)
for tix in range(tkL):
- if cmpi==len(sarg):
- for s in tokens[tix:]:
- if s[-1] not in (ONEoZERO, ZEROoMORE): return 0
+ if cmpi==L:
+ for pair in tokens[tix:]:
+ if not pair[1]: return 0
else: return cmpi
- ctk = tokens[tix]
- if ctk in (ESCAPE+ZEROoMORE, ESCAPE+ONEoMORE, ESCAPE+ONEoZERO): qua='n'
- else: qua=ctk[-1]
- if qua in (ZEROoMORE, ONEoZERO):
- # dam greed first, same code as below in *-loops again
- if tix<tkL-1:
- n = try_index(sarg, cmpi, tokens[tix+1:], nested)
- if n: return n
- ctk=ctk[:-1]
+ ctk, qua = tokens[tix]
+ if qua and tix<tkL-1: # dam greed first, same code as below in *-loops again
+ n = try_index(sarg, cmpi, tokens[tix+1:], nested)
+ if n: return n
if nested:
if ctk[0] == GROUPA:
for subtk in TokenListCache[ctk]:
subcmpi = try_index(sarg, cmpi, subtk, True)
if subcmpi: break
else:
- if qua in (ZEROoMORE, ONEoZERO) or ctk.startswith(EXTENSION+'!'):
- continue
+ if qua or ctk.startswith(EXTENSION+'!'): continue
else: return 0
if ctk.startswith(EXTENSION+'='): continue
elif ctk.startswith(EXTENSION+'!'): return 0
- if qua == ZEROoMORE:
- while subcmpi:
+ if qua==STAR:
+ while subcmpi and subcmpi<L:
if tix<tkL-1:
n = try_index(sarg, subcmpi, tokens[tix+1:], nested)
if n: return n
@@ -131,30 +127,28 @@
if subcmpi: break
else: break
if not ctk.startswith(EXTENSION+':'): xGroups.append(sarg[cmpi:subcmpi])
- cmpi=subcmpi; continue
-# any group token done, back reference:
+ cmpi=subcmpi; continue # any group token done, back reference:
elif ctk[0]==ESCAPE and '0'<= ctk[1] <='9':
i=int(ctk[1]); captured=xGroups[i]; lg=len(captured)
if sarg[cmpi:cmpi+lg]==captured:
cmpi+=lg
- if qua==ZEROoMORE:
- while sarg[cmpi:cmpi+lg]==captured:
+ if qua==STAR:
+ while sarg[cmpi:cmpi+lg]==captured and cmpi<L:
if tix<tkL-1:
- n = try_index(sarg, cmpi, tokens[tix+1:], nested)
+ n = try_index(sarg, cmpi, tokens[tix+1:], True)
if n: return n
cmpi+=lg
- elif qua not in (ZEROoMORE, ONEoZERO): return 0
- continue
-# from here on tix points to a simple token:
- if match_simple_token(sarg[cmpi], ctk):
+ elif not qua: return 0
+ continue # from here on tix points to a simple token:
+ if match_simple_token(sarg[cmpi], ctk):
cmpi+=1
- if qua==ZEROoMORE:
- while cmpi<len(sarg) and match_simple_token(sarg[cmpi], ctk):
+ if qua==STAR:
+ while match_simple_token(sarg[cmpi], ctk) and cmpi<L:
if tix<tkL-1:
n = try_index(sarg, cmpi, tokens[tix+1:], nested)
if n: return n
cmpi+=1
- elif qua not in (ONEoZERO, ZEROoMORE): return 0
+ elif not qua: return 0
return cmpi
def xsearch(sarg, pattern, nested=False, start=0):