# You can edit these constants. You might want to avoid backslash inflation
# or to avoid introduction of still more brackets when analysing LISP-code incidentally
ANYCHAR = '.'    # wildcard token: matches any single character
ESCAPE = '\\'    # escape prefix; also introduces \s, \S and \0..\9 back-references
REGIONA = '['; REGIONO = ']'; RANGE = '-'; COMPLEMENT = '^'    # character-class ("region") syntax
GROUPA = '(' ; GROUPO = ')'    # group delimiters (only honoured when nested=True)
ONEoZERO = '?' ; ONEoMORE = '+' ; ZEROoMORE = '*'    # quantifiers
ALTERNATIVE = '|'    # alternation separator inside groups
EXTENSION = '(?'     # prefix of group extensions: (?: (?= (?!
def match_region(ch, pattern):
    """Decide whether the single character *ch* belongs to a region token.

    region: a list of comparation chars and ranges in [] or [^].
    *pattern* is the complete region token, possibly still carrying a
    trailing quantifier attached by the parser (e.g. '[a-z]*'), which
    must be ignored here.  For '[^...]' the result is complemented.
    """
    if pattern[1]==COMPLEMENT: booly = False; booln = True; i=2
    else: booly = True; booln = False; i=1
    # Scan only the interior of the region: stop before the closing
    # bracket and skip a trailing quantifier, so that neither REGIONO
    # nor '?'/'+'/'*' is ever compared as a member character.
    # (Previously the scan ran to the very end, so e.g. ']' wrongly
    # matched the region '[a-z]'.)
    end = len(pattern)-1
    if pattern[end] in (ONEoZERO, ONEoMORE, ZEROoMORE): end -= 1
    while i < end:
        if pattern[i] == ESCAPE and pattern[i+1] in (REGIONO, ESCAPE, RANGE):
            if ch== pattern[i+1]: return booly
            else: i+=2
        elif i < end-2 and pattern[i+1]==RANGE:
            if pattern[i]<=ch<=pattern[i+2]: return booly
            else: i+=3   # skip the whole start-dash-end triple
        elif pattern[i]==ch: return booly
        else: i+=1
    return booln
def match_simple_token(ch, token):
    """Report whether the single character *ch* satisfies one non-group token.

    The token may be a region ('[...]'), the wildcard, an escape pair
    (literal escape, or the \\s / \\S whitespace classes) or a plain
    literal character.  Only the token's leading characters are
    inspected, so an attached trailing quantifier is ignored here.
    """
    head = token[0]
    if head == REGIONA:
        return match_region(ch, token)
    if head == ANYCHAR:
        return True
    if head == ESCAPE:
        follower = token[1]
        if follower == 's':
            return ch in (' ', '\t', '\r', '\n', '\f', '\v')
        if follower == 'S':
            return ch not in (' ', '\t', '\r', '\n', '\f', '\v')
        return ch == follower
    return ch == head
# Cache of parsed token lists, keyed by the pattern (or group) source text.
# try_index looks up nested group token lists here, so with nesting the
# cache is required for correctness, not just speed.
TokenListCache = {} # essential with nesting, otherwise only here for optimization
# Substrings captured by groups during the most recent nested match attempt;
# reset by xsearch/xfinditer before each attempt and read by back-references.
xGroups = []
def parse_pattern(pattern, nested, by_recursion = False):
    """Tokenize *pattern* into a list of string tokens.

    tokens are:
    1. patterns included in brackets (parsing is recursive)
    2. regions
    3. \\ together with the escaped character
    4. periods
    5. simple characters
    All with eventually trailing quantifiers

    Group tokens are recursively parsed and stored in TokenListCache
    (keyed by their source text) for later lookup by try_index.
    Raises ValueError on an unmatched region/group opener or a
    trailing escape.
    """
    tokens = []; i=0
    while i < len(pattern):
        c = pattern[i]
        if c==REGIONA:
            k = pattern.find(REGIONO, i)
            if k==-1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
            # an escaped closer does not terminate the region
            # (use REGIONO, not a hardcoded ']', so the constants stay editable)
            while pattern[k-1] == ESCAPE:
                k = pattern.find(REGIONO, k+1)
                if k==-1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
            tokens.append(pattern[i:k+1])
            i=k+1
        elif c == ANYCHAR:
            tokens.append(ANYCHAR); i+=1
        elif c == ESCAPE:
            if i<len(pattern)-1:
                tokens.append(pattern[i:i+2]); i+=2
            else:
                raise ValueError('Trailing '+ESCAPE)
        elif nested and c==GROUPA:
            # collect the whole (possibly nested) group as one token
            resu = GROUPA; k=i+1; lv=1
            while lv > 0:
                if k >= len(pattern):
                    # previously this ran off the end with an IndexError
                    raise ValueError('Unmatched '+GROUPA+' in '+pattern)
                cc = pattern[k]
                if cc == ESCAPE: resu+=cc; resu+=pattern[k+1]; k+=2; continue
                if cc == GROUPA: lv+=1
                elif cc == GROUPO: lv-=1
                resu+=cc; k+=1
            i=k; corro=0; corra=0
            # corro: strip a trailing quantifier, corra: strip an
            # extension marker, before recursing on the group interior
            if i<len(pattern) and pattern[i] in (ONEoZERO, ONEoMORE, ZEROoMORE):
                resu+=pattern[i]; i+=1; corro=1
            for s in (':','!','='):
                if resu.startswith(EXTENSION+s): corra = 1+len(s)
            if resu not in TokenListCache:
                TokenListCache[resu]=parse_pattern(resu[1+corra:-(1+corro)], True, True)
            tokens.append(resu)
            continue
        else:
            tokens.append(c); i+=1
        # attach a directly following quantifier to the token just emitted
        if i<len(pattern) and pattern[i] in (ONEoZERO, ONEoMORE, ZEROoMORE):
            tokens[-1] += pattern[i]; i+=1
    if pattern not in TokenListCache and not by_recursion:
        TokenListCache[pattern]=tokens
    return tokens
def listsplit(larg, item):
    """Split the list *larg* on every occurrence of *item*.

    Works like str.split: returns one more sub-list than there are
    separators; the separators themselves are dropped.
    """
    resu = []; ix = 0
    while ix < len(larg) and item in larg[ix:]:
        # search from ix onwards -- the original larg.index(item) always
        # found the FIRST separator, so a second one caused an endless loop
        ixx = larg.index(item, ix)
        resu.append(larg[ix:ixx])
        ix = ixx + 1
    resu.append(larg[ix:])
    return resu
def match_token(sarg, compix, token):
    """Test whether *token* can match at sarg[compix] (one-step lookahead).

    Simple tokens compare a single character; group tokens try each
    '|' alternative of their cached sub-token list via try_index.
    """
    # compare against GROUPA, not a hardcoded '(' -- the file promises
    # that the syntax constants are editable
    if token[0] != GROUPA:
        return match_simple_token(sarg[compix], token)
    for alternative in listsplit(TokenListCache[token], ALTERNATIVE):
        if try_index(sarg, compix, alternative, True):
            return True
    return False
def try_index(sarg, start, tokens, nested):
    """Try to match the whole token list against sarg beginning at *start*.

    Returns the index just past the matched text, or 0 on failure.
    NOTE(review): a zero-length match at position 0 is indistinguishable
    from failure -- callers treat a falsy result as "no match".
    """
    compix = start; tix = 0; tkL = len(tokens)
    while tix<tkL:
        # Input exhausted: success only if every remaining token is optional.
        if compix == len(sarg):
            for s in tokens[tix:]:
                if s[-1] not in (ONEoZERO, ZEROoMORE): return 0
            else: return compix
        # current token and its last character (a possible quantifier)
        ctk = tokens[tix]; qua=ctk[-1]
        if nested:
            if ctk[0] == GROUPA:
                # Try each '|' alternative of the pre-parsed group.
                subtokens = listsplit(TokenListCache[ctk], ALTERNATIVE)
                for i in range(len(subtokens)):
                    subcompix = try_index(sarg, compix, subtokens[i], True)
                    if subcompix: break
                else:
                    # no alternative matched: tolerable for optional groups
                    # and for negative lookahead, fatal otherwise
                    if qua in (ONEoZERO, ZEROoMORE) or ctk.startswith(EXTENSION+'!'):
                        tix+=1; continue
                    else: return 0
                # lookahead groups consume no input
                if ctk.startswith(EXTENSION+'='): tix+=1; continue
                elif ctk.startswith(EXTENSION+'!'): return 0
                if qua in (ZEROoMORE, ONEoMORE):
                    # Greedy repetition of the group, stopping when the next
                    # token would match.
                    # NOTE(review): the lookahead tests tokens[tix+1] at
                    # compix (the group's start), not at subcompix, and a
                    # trailing repeated group would raise IndexError on
                    # tokens[tix+1] -- confirm intended behaviour.
                    while subcompix and not match_token(sarg, compix, tokens[tix+1]):
                        for i in range(len(subtokens)):
                            subcompix = try_index(sarg, subcompix, subtokens[i], True)
                            if subcompix: break
                if not ctk.startswith( EXTENSION+':' ):
                    # capturing group: record the matched slice globally
                    xGroups.append(sarg[compix:subcompix])
                compix=subcompix; tix+=1; continue
            if ctk[0]==ESCAPE and '0'<= ctk[1] <='9':
                # back-reference \0..\9 to an earlier captured group
                i = int(ctk[1]); captured=xGroups[i]; lg = len(captured)
                if sarg[compix:compix+lg]==captured:
                    compix+=lg
                    if qua in (ZEROoMORE, ONEoMORE):
                        # repeated back-reference: consume greedily
                        while sarg[compix:compix+lg]==captured: compix+=lg
                    tix+=1; continue
                elif qua in (ONEoZERO, ZEROoMORE): tix+=1; continue
                else: return 0
        # --- simple tokens, with and without quantifier ---
        if qua==ONEoZERO:
            # optional: consume at most one character
            if match_simple_token(sarg[compix], ctk): compix+=1
            tix+=1; continue
        if qua==ONEoMORE:
            # '+': the first occurrence is mandatory, the rest greedy below
            if not match_simple_token(sarg[compix], ctk): return 0
            compix+=1
        if qua in (ZEROoMORE, ONEoMORE):
            if tix<tkL-1:
                # greedy, but yield as soon as the next token could match
                while compix<len(sarg) and match_simple_token(sarg[compix], ctk) and\
                      not match_token(sarg, compix, tokens[tix+1]):
                    compix+=1
            else:
                # last token: consume as much as possible
                while compix<len(sarg) and match_simple_token(sarg[compix], ctk):
                    compix+=1
            tix+=1
        else:
            # unquantified token: exactly one character must match
            if not match_simple_token(sarg[compix], ctk): return 0
            compix+=1; tix+=1
    return compix
def xsearch(sarg, pattern, nested=False, start=0):
    """Search sarg (from index *start*) for the first match of *pattern*.

    Returns a (begin, end) index pair, or () when nothing matches.
    With nested=True group syntax is honoured and captures are stored
    in the module-global xGroups list.
    """
    # catch only the expected cache miss; the original bare 'except:'
    # would also have hidden genuine errors from parse_pattern callers
    try:
        tokens = TokenListCache[pattern]
    except KeyError:
        tokens = parse_pattern(pattern, nested)
    argix = start
    while argix < len(sarg):
        if nested:
            # fresh capture list for every attempted start position
            global xGroups; xGroups = []
        n = try_index(sarg, argix, tokens, nested)
        if n: return (argix, n)
        argix += 1
    return ()
def xfinditer(sarg, pattern, nested=False, start=-1):
    """Yield (begin, end) pairs for successive matches of *pattern* in sarg.

    Generator counterpart of xsearch; start positions are scanned one
    by one beginning at start+1.
    """
    try:
        tokens = TokenListCache[pattern]
    except KeyError:
        tokens = parse_pattern(pattern, nested)
    # (leftover debug print of TokenListCache removed)
    while start < len(sarg):
        start += 1
        if nested:
            global xGroups; xGroups = []
        n = try_index(sarg, start, tokens, nested)
        if n: yield (start, n)
    # PEP 479: 'raise StopIteration()' inside a generator body becomes a
    # RuntimeError on Python 3.7+; falling off the end stops the iteration.
    return
def xmatch(sarg, pattern, nested=False):
    """ checks, whether sarg as the whole matches the pattern """
    # catch only the expected cache miss, not arbitrary errors
    try:
        tokens = TokenListCache[pattern]
    except KeyError:
        tokens = parse_pattern(pattern, nested)
    # full match means try_index consumed the entire string
    return try_index(sarg, 0, tokens, nested) == len(sarg)
def xsplit(sarg, pattern, nested=False):
    """Split sarg on every match of *pattern*; returns a list of pieces.

    The matched separators are dropped, like re.split without groups.
    """
    resu = []; residue = 0
    xpair = xsearch(sarg, pattern, nested=nested)
    while xpair:
        # slice from the end of the previous match, not from the string
        # start -- the original sarg[:xpair[0]] accumulated prefixes
        resu.append(sarg[residue:xpair[0]])
        residue = xpair[1]
        xpair = xsearch(sarg, pattern, nested, xpair[1])
    # append the tail as one more list element; the original returned
    # 'resu + sarg[residue:]' which is list+str and raised TypeError
    resu.append(sarg[residue:])
    return resu
def xreplace(sarg, pattern, subst, nested=False):
    """Replace every match of *pattern* in sarg by *subst*.

    With nested=True, back-references \\0..\\9 in *subst* are expanded
    with the captures of the last match found (every occurrence gets
    the same expansion, as before).
    """
    to_replace = []; s = sarg
    xpair = xsearch(sarg, pattern, nested)
    while xpair:
        to_replace.append(xpair)
        xpair = xsearch(sarg, pattern, nested, xpair[1])
    if nested:
        # only expand references that actually have a captured group;
        # the original unconditionally indexed xGroups[0..9] and raised
        # IndexError whenever fewer than ten groups were captured
        for i in range(min(10, len(xGroups))):
            subst = subst.replace(ESCAPE+str(i), xGroups[i])
    # substitute back-to-front so earlier span indices stay valid
    for xpair in reversed(to_replace):
        s = s[:xpair[0]] + subst + s[xpair[1]:]
    return s
def xfuncreplace(sarg, pattern, f, nested=False):
    """Replace each match of *pattern* in sarg by f(matched_text)."""
    spans = []
    hit = xsearch(sarg, pattern, nested)
    while hit:
        spans.append(hit)
        hit = xsearch(sarg, pattern, nested, hit[1])
    result = sarg
    # substitute from the end so earlier spans keep their offsets
    for begin, end in reversed(spans):
        result = result[:begin] + f(result[begin:end]) + result[end:]
    return result
Diff to Previous Revision
--- revision 12 2010-06-04 12:58:51
+++ revision 13 2010-06-05 00:31:10
@@ -1,23 +1,34 @@
+# You can edit these constants. You might want to avoid backslash inflation
+# or to avoid introduction of still more brackets when analysing LISP-code incidentally
+
+ANYCHAR = '.'
+ESCAPE = '\\'
+REGIONA = '['; REGIONO = ']'; RANGE = '-'; COMPLEMENT = '^'
+GROUPA = '(' ; GROUPO = ')'
+ONEoZERO = '?' ; ONEoMORE = '+' ; ZEROoMORE = '*'
+ALTERNATIVE = '|'
+EXTENSION = '(?'
+
def match_region(ch, pattern):
""" region: a list of comparation chars and ranges in [] or [^]"""
- if pattern[1]=='^': booly = False; booln = True; i=2
+ if pattern[1]==COMPLEMENT: booly = False; booln = True; i=2
else: booly = True; booln = False; i=1
while i < len(pattern):
- if pattern[i] == '\\' and pattern[i+1] in (']', '\\','-'):
+ if pattern[i] == ESCAPE and pattern[i+1] in (REGIONO, ESCAPE, RANGE):
if ch== pattern[i+1]: return booly
else: i+=2
- elif i<len(pattern)-2 and pattern[i+1]=='-':
+ elif i<len(pattern)-2 and pattern[i+1]==RANGE:
if pattern[i]<=ch<=pattern[i+2]: return booly
else: i+= 2
elif pattern[i]==ch: return booly
else: i+=1
return booln
-def match_token(ch, token):
+def match_simple_token(ch, token):
c0 = token[0]
- if c0=='[': return match_region(ch,token)
- elif c0=='.': return True
- elif c0=='\\':
+ if c0==REGIONA: return match_region(ch,token)
+ elif c0==ANYCHAR: return True
+ elif c0==ESCAPE:
c1=token[1]
if c1=='s': return (ch in (' ','\t','\r','\n','\f','\v'))
elif c1=='S': return (ch not in (' ','\t','\r','\n','\f','\v'))
@@ -27,7 +38,7 @@
TokenListCache = {} # essential with nesting, otherwise only here for optimization
xGroups = []
-def parse_pattern(pattern, nested):
+def parse_pattern(pattern, nested, by_recursion = False):
"""
tokens are:
1. patterns included in brackets (parsing is recursive)
@@ -40,42 +51,44 @@
tokens = []; i=0
while i < len(pattern):
c = pattern[i]
- if c=='[':
- k = pattern.find(']', i)
- if k==-1: raise ValueError('Unmatched [ in '+pattern)
- while pattern[k-1] == '\\':
+ if c==REGIONA:
+ k = pattern.find(REGIONO, i)
+ if k==-1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
+ while pattern[k-1] == ESCAPE:
k = pattern.find(']', k+1)
- if k==-1: raise ValueError('Unmatched [ in '+pattern)
+ if k==-1: raise ValueError('Unmatched '+REGIONA+' in '+pattern)
tokens.append(pattern[i:k+1])
i=k+1
elif c == '.':
- tokens.append('.'); i+=1
- elif c == '\\':
+ tokens.append(ANYCHAR); i+=1
+ elif c == ESCAPE:
if i<len(pattern)-1:
tokens.append(pattern[i:i+2]); i+=2
else:
- raise ValueError('Trailing backslash')
- elif nested and c=='(':
- resu = '('; k=i+1; lv=1
+ raise ValueError('Trailing '+ESCAPE)
+ elif nested and c==GROUPA:
+ resu = GROUPA; k=i+1; lv=1
while lv > 0:
cc = pattern[k]
- if cc =='\\': resu+=cc; resu+=pattern[k+1]; k+=2; continue
- if cc == '(': lv+=1
- elif cc == ')': lv-=1
+ if cc == ESCAPE: resu+=cc; resu+=pattern[k+1]; k+=2; continue
+ if cc == GROUPA: lv+=1
+ elif cc == GROUPO: lv-=1
resu+=cc; k+=1
i=k; corro=0; corra=0
- if i<len(pattern) and pattern[i] in ('*', '+', '?'):
- resu+=pattern[i]; i+=1; corro=-1
- if resu.startswith('(?:'): corra=2
+ if i<len(pattern) and pattern[i] in (ONEoZERO, ONEoMORE, ZEROoMORE):
+ resu+=pattern[i]; i+=1; corro=1
+ for s in (':','!','='):
+ if resu.startswith(EXTENSION+s): corra = 1+len(s)
if resu not in TokenListCache.keys():
- TokenListCache[resu]=parse_pattern(resu[1+corra:-1+corro], True)
+ TokenListCache[resu]=parse_pattern(resu[1+corra:-(1+corro)], True, True)
tokens.append(resu)
continue
else:
tokens.append(c); i+=1
- if i<len(pattern) and pattern[i] in ('*', '+', '?'):
+ if i<len(pattern) and pattern[i] in (ONEoZERO, ONEoMORE, ZEROoMORE):
tokens[-1] += pattern[i]; i+=1
- TokenListCache[pattern]=tokens
+ if pattern not in TokenListCache.keys() and not by_recursion:
+ TokenListCache[pattern]=tokens
return tokens
def listsplit(larg, item):
@@ -85,56 +98,68 @@
resu.append(larg[ix:])
return resu
+def match_token(sarg, compix, token):
+ if not token[0]=='(': return match_simple_token(sarg[compix], token)
+ subtokens = listsplit(TokenListCache[token], ALTERNATIVE)
+ for i in range(len(subtokens)):
+ subcompix = try_index(sarg, compix, subtokens[i], True)
+ if subcompix: return True
+ return False
+
def try_index(sarg, start, tokens, nested):
compix = start; tix = 0; tkL = len(tokens)
while tix<tkL:
if compix == len(sarg):
for s in tokens[tix:]:
- if s[-1] not in ('?','*'): return 0
+ if s[-1] not in (ONEoZERO, ZEROoMORE): return 0
else: return compix
ctk = tokens[tix]; qua=ctk[-1]
if nested:
- if ctk[0] == '(':
- subtokens = listsplit(TokenListCache[ctk], '|')
+ if ctk[0] == GROUPA:
+ subtokens = listsplit(TokenListCache[ctk], ALTERNATIVE)
for i in range(len(subtokens)):
subcompix = try_index(sarg, compix, subtokens[i], True)
if subcompix: break
else:
- if qua in ('?', '*'): tix+=1; continue
- else: return 0
- if qua in ('*', '+'):
- while subcompix:
+ if qua in (ONEoZERO, ZEROoMORE) or ctk.startswith(EXTENSION+'!'):
+ tix+=1; continue
+ else: return 0
+ if ctk.startswith(EXTENSION+'='): tix+=1; continue
+ elif ctk.startswith(EXTENSION+'!'): return 0
+ if qua in (ZEROoMORE, ONEoMORE):
+ while subcompix and not match_token(sarg, compix, tokens[tix+1]):
for i in range(len(subtokens)):
subcompix = try_index(sarg, subcompix, subtokens[i], True)
if subcompix: break
- if not ctk.startswith('(?:'): xGroups.append(sarg[compix:subcompix])
+ if not ctk.startswith( EXTENSION+':' ):
+ xGroups.append(sarg[compix:subcompix])
compix=subcompix; tix+=1; continue
- if ctk[0]=='\\' and '0'<= ctk[1] <='9':
+ if ctk[0]==ESCAPE and '0'<= ctk[1] <='9':
i = int(ctk[1]); captured=xGroups[i]; lg = len(captured)
if sarg[compix:compix+lg]==captured:
compix+=lg
- if qua in ('+', '*'):
+ if qua in (ZEROoMORE, ONEoMORE):
while sarg[compix:compix+lg]==captured: compix+=lg
tix+=1; continue
- elif qua in ('?','*'): tix+=1; continue
+ elif qua in (ONEoZERO, ZEROoMORE): tix+=1; continue
else: return 0
- if qua=='?':
- if match_token(sarg[compix], ctk): compix+=1
+ if qua==ONEoZERO:
+ if match_simple_token(sarg[compix], ctk): compix+=1
tix+=1; continue
- if qua=='+':
- if not match_token(sarg[compix], ctk): return 0
+ if qua==ONEoMORE:
+ if not match_simple_token(sarg[compix], ctk): return 0
compix+=1
- if qua in ('*','+'):
+ if qua in (ZEROoMORE, ONEoMORE):
if tix<tkL-1:
- while compix<len(sarg) and match_token(sarg[compix], ctk) and\
- not match_token(sarg[compix], tokens[tix+1]):
+ while compix<len(sarg) and match_simple_token(sarg[compix], ctk) and\
+ not match_token(sarg, compix, tokens[tix+1]):
compix+=1
else:
- while compix<len(sarg) and match_token(sarg[compix], ctk):
+ while compix<len(sarg) and match_simple_token(sarg[compix], ctk):
compix+=1
tix+=1
else:
- if not match_token(sarg[compix], ctk): return 0
+ if not match_simple_token(sarg[compix], ctk): return 0
compix+=1; tix+=1
return compix
@@ -149,6 +174,17 @@
argix += 1
return ()
+def xfinditer(sarg, pattern, nested=False, start=-1):
+ try: tokens = TokenListCache[pattern]
+ except: tokens = parse_pattern(pattern, nested)
+ print(TokenListCache)
+ while start<len(sarg):
+ start += 1
+ if nested: global xGroups; xGroups=[]
+ n = try_index(sarg, start, tokens, nested)
+ if n: yield (start, n)
+ raise StopIteration()
+
def xmatch(sarg, pattern, nested=False):
""" checks, whether sarg as the whole matches the pattern """
try: tokens = TokenListCache[pattern]
@@ -170,7 +206,7 @@
while xpair:
to_replace.append(xpair); xpair=xsearch(sarg, pattern, nested, xpair[1])
if nested:
- for i in range(10): subst=subst.replace('\\'+str(i), xGroups[i])
+ for i in range(10): subst=subst.replace(ESCAPE+str(i), xGroups[i])
for xpair in reversed(to_replace): s = s[:xpair[0]]+subst+s[xpair[1]:]
return s