def match_region(ch, pattern):
    """Test the single character ch against a class token like [abc], [a-z] or [^...].

    pattern is the raw token including the surrounding brackets and a possibly
    trailing quantifier (which is ignored because scanning stops at the closing
    ']').  Inside the class, '\\' escapes ']', '\\' and '-', and 'a-z' denotes
    an inclusive range; a '-' before the closing ']' is a literal.  A leading
    '^' negates the result.
    """
    if pattern[1] == '^':
        booly = False; booln = True; i = 2   # negated class
    else:
        booly = True; booln = False; i = 1
    while i < len(pattern):
        # An unescaped ']' closes the class: nothing matched so far.  (The
        # original compared it -- and any trailing quantifier -- as literal
        # members, so '[abc]' matched ']' and '[abc]*' matched '*'.)
        if pattern[i] == ']': return booln
        if pattern[i] == '\\' and pattern[i+1] in (']', '\\', '-'):
            if ch == pattern[i+1]: return booly
            else: i += 2
        elif i < len(pattern)-2 and pattern[i+1] == '-' and pattern[i+2] != ']':
            if pattern[i] <= ch <= pattern[i+2]: return booly
            else: i += 3      # skip the whole start-'-'-end triple (was i+=2)
        elif pattern[i] == ch: return booly
        else: i += 1
    return booln
def match_token(ch, token):
    """Return True when the single character ch satisfies token.

    token is one of: a character class '[...]' (delegated to match_region),
    a period (matches anything), an escape pair '\\x' ('\\s' = whitespace,
    '\\S' = non-whitespace, any other '\\x' a literal x), or a plain literal
    character.  Only token[0] / token[1] are inspected, so a trailing
    quantifier on the token is ignored here.
    """
    head = token[0]
    if head == '[':
        return match_region(ch, token)
    if head == '.':
        return True
    if head != '\\':
        return ch == head
    escaped = token[1]
    whitespace = (' ', '\t', '\r', '\n', '\f', '\v')
    if escaped == 's':
        return ch in whitespace
    if escaped == 'S':
        return ch not in whitespace
    return ch == escaped
TokenListCache = {} # essential with nesting, otherwise only here for optimization
# Capture groups collected by try_index for the current match attempt;
# reset by xsearch at every new start position when nested=True.
xGroups = []
def parse_pattern(pattern, nested):
    """Split pattern into a list of token strings (memoized in TokenListCache).

    tokens are:
    1. patterns included in brackets (parsing is recursive, nested only)
    2. regions
    3. \\ together with the escaped character
    4. periods
    5. simple characters
    All with eventually trailing quantifiers

    Raises ValueError for an unmatched '[' or '(' or a trailing backslash.
    """
    tokens = []; i = 0
    while i < len(pattern):
        c = pattern[i]
        if c == '[':
            # Scan for the closing ']', skipping escaped ones.
            k = pattern.find(']', i)
            if k == -1: raise ValueError('Unmatched [ in '+pattern)
            while pattern[k-1] == '\\':
                k = pattern.find(']', k+1)
                if k == -1: raise ValueError('Unmatched [ in '+pattern)
            tokens.append(pattern[i:k+1])
            i = k+1
        elif c == '.':
            tokens.append('.'); i += 1
        elif c == '\\':
            if i < len(pattern)-1:
                tokens.append(pattern[i:i+2]); i += 2
            else:
                raise ValueError('Bad successor of a backslash')
        elif nested and c == '(':
            # Collect a whole (possibly nested) group as a single token.
            resu = '('; k = i+1; lv = 1
            while lv > 0:
                if k >= len(pattern):
                    # Was an uncaught IndexError in the original.
                    raise ValueError('Unmatched ( in '+pattern)
                cc = pattern[k]
                if cc == '\\': resu += cc; resu += pattern[k+1]; k += 2; continue
                if cc == '(': lv += 1
                elif cc == ')': lv -= 1
                resu += cc; k += 1
            i = k
            if i < len(pattern) and pattern[i] in ('*', '+', '?'):
                # Quantified group: the quantifier stays on the token, the
                # cached sub-parse strips '(' and ')<quant>'.
                resu += pattern[i]; i += 1
                if resu not in TokenListCache:
                    TokenListCache[resu] = parse_pattern(resu[1:-2], nested=True)
            elif resu not in TokenListCache:
                TokenListCache[resu] = parse_pattern(resu[1:-1], nested=True)
            tokens.append(resu)
            continue
        else:
            tokens.append(c); i += 1
        # Attach a trailing quantifier to the token just emitted.
        if i < len(pattern) and pattern[i] in ('*', '+', '?'):
            tokens[-1] += pattern[i]; i += 1
    TokenListCache[pattern] = tokens
    return tokens
def try_index(sarg, start, tokens, nested):
    """Try to match the token list against sarg beginning at start.

    Returns the index just past the match, or 0 on failure.  (A zero-length
    match at index 0 is therefore indistinguishable from failure -- a
    limitation of the original design, kept for compatibility.)
    Matching is greedy without backtracking: a '*'/'+' run stops as soon as
    the next token would match (when there is a next token).  With
    nested=True, '(...)' tokens recurse and append their captured text to
    the global xGroups, and '\\0'..'\\9' tokens are backreferences into it.
    """
    compix = start; tix = 0; tkL = len(tokens)
    while tix < tkL:
        if compix == len(sarg):
            # Input exhausted: succeed only if every remaining token is
            # optional ('?' or '*').  The original returned 0 here
            # unconditionally, so e.g. 'ab*' failed to match 'a'.
            if all(t[-1] in ('?', '*') for t in tokens[tix:]): return compix
            return 0
        ctk = tokens[tix]; qua = ctk[-1]   # qua: quantifier, or just the last char
        if nested:
            if ctk[0] == '(':
                # Group token: match its sub-token list recursively.
                subtokens = TokenListCache[ctk]
                subcompix = try_index(sarg, compix, subtokens, True)
                bak = subcompix
                if not subcompix:
                    if qua in ('?', '*'): tix += 1; continue   # group is optional
                    else: return 0
                elif qua in ('*', '+'):
                    # Greedily repeat the group; bak keeps the last success.
                    while subcompix:
                        bak = subcompix
                        subcompix = try_index(sarg, subcompix, subtokens, True)
                tix += 1
                xGroups.append(sarg[compix:bak])   # record the capture
                compix = bak
                continue
            if ctk[0] == '\\' and '0' <= ctk[1] <= '9':
                # Backreference \0..\9 to a previously captured group.
                i = int(ctk[1]); captured = xGroups[i]; lg = len(captured)
                if sarg[compix:compix+lg] == captured:
                    compix += lg
                    if qua in ('+', '*'):
                        while sarg[compix:compix+lg] == captured: compix += lg
                    tix += 1; continue
                elif qua in ('?', '*'): tix += 1; continue
                else: return 0
        if qua == '?':
            # Optional token: consume one char if it matches, and ALWAYS
            # advance past the token.  (The original only advanced on a
            # match, so a non-matching optional token looped forever.)
            if match_token(sarg[compix], ctk): compix += 1
            tix += 1
            continue
        if qua == '+':
            if not match_token(sarg[compix], ctk): return 0
            compix += 1
        if qua in ('*', '+'):
            if tix < tkL-1:
                # Greedy but non-backtracking: stop as soon as the next
                # token could take over.
                while compix < len(sarg) and match_token(sarg[compix], ctk) and\
                    not match_token(sarg[compix], tokens[tix+1]):
                    compix += 1
            else:
                while compix < len(sarg) and match_token(sarg[compix], ctk):
                    compix += 1
            tix += 1
        else:
            # Plain single-character token.
            if not match_token(sarg[compix], ctk): return 0
            compix += 1; tix += 1
    return compix
def xsearch(sarg, pattern, start=0, nested=False):
    """Scan sarg from start for the first match of pattern.

    Returns (begin, end) slice indices of the match, or (0, 0) when there
    is none (so a real match starting at index 0 is only distinguishable
    by its end > 0).
    """
    try:
        tokens = TokenListCache[pattern]        # reuse a previous parse
    except KeyError:                            # was a bare except:
        tokens = parse_pattern(pattern, nested)
    argix = start
    while argix < len(sarg):
        if nested:
            # Captures are per match attempt; reset before each try.
            global xGroups
            xGroups = []
        n = try_index(sarg, argix, tokens, nested)
        if n: return (argix, n)
        argix += 1
    return (0, 0)
def xplit(sarg, pattern, nested=False):
    """Split sarg on every match of pattern; return the list of pieces.

    Fixes two defects in the original: each piece now starts after the
    previous match (the original always sliced from index 0, duplicating
    earlier text), and the trailing remainder is appended as a list
    element (the original evaluated list + str, a TypeError).
    """
    resu = []; residue = 0
    xpair = xsearch(sarg, pattern, nested=nested)
    while xpair[0]:
        resu.append(sarg[residue:xpair[0]])
        residue = xpair[1]
        xpair = xsearch(sarg, pattern, xpair[1], nested)
    return resu + [sarg[residue:]]
def xreplace(sarg, pattern, subst, nested=False):
    """Replace every match of pattern in sarg with subst.

    With nested=True, '\\0'..'\\9' in subst are expanded from xGroups.
    NOTE(review): xGroups holds only the captures of the LAST match found,
    so every occurrence shares that one set of groups -- confirm intent.
    """
    to_replace = []; s = sarg
    xpair = xsearch(sarg, pattern, nested=nested)
    while xpair[0]:
        to_replace.append(xpair)
        xpair = xsearch(sarg, pattern, xpair[1], nested)
    if nested:
        # Only substitute groups that were actually captured; the original
        # hard-coded range(10) and raised IndexError for fewer groups.
        for i, g in enumerate(xGroups[:10]):
            subst = subst.replace('\\'+str(i), g)
    # Replace right-to-left so earlier indices stay valid.
    for xpair in reversed(to_replace):
        s = s[:xpair[0]] + subst + s[xpair[1]:]
    return s
def xfuncreplace(sarg, pattern, f, nested=False):
    """Replace every match of pattern in sarg with f(matched_text).

    The original computed each replacement string but discarded it: the
    concatenation expression was never assigned back to s, so the input
    was always returned unchanged.
    """
    to_replace = []; s = sarg
    xpair = xsearch(sarg, pattern, nested=nested)
    while xpair[0]:
        to_replace.append(xpair)
        xpair = xsearch(sarg, pattern, xpair[1], nested)
    # Replace right-to-left so earlier indices stay valid.
    for xpair in reversed(to_replace):
        s = s[:xpair[0]] + f(s[xpair[0]:xpair[1]]) + s[xpair[1]:]
    return s
Diff to Previous Revision
--- revision 5 2010-06-02 09:59:56
+++ revision 6 2010-06-02 19:04:12
@@ -14,17 +14,18 @@
return booln
def match_token(ch, token):
- if token.startswith('['): return match_region(ch,token)
- elif token.startswith('\\'):
- tkc=token[1]
- if tkc=='s': return (ch in (' ','\t','\r','\n','\f','\v'))
- elif tkc=='S': return (ch not in (' ','\t','\r','\n','\f','\v'))
- else: return (ch==tkc)
- elif token=='.': return True
- else: return (ch==token[0])
+ c0 = token[0]
+ if c0=='[': return match_region(ch,token)
+ elif c0=='.': return True
+ elif c0=='\\':
+ c1=token[1]
+ if c1=='s': return (ch in (' ','\t','\r','\n','\f','\v'))
+ elif c1=='S': return (ch not in (' ','\t','\r','\n','\f','\v'))
+ else: return (ch==c1)
+ else: return (ch==c0)
TokenListCache = {} # essential with nesting, otherwise only here for optimization
-to_escape = ('.','*','+','?','\\','[',']'); esc_brackets = to_escape + ('(', ')')
+xGroups = []
def parse_pattern(pattern, nested):
"""
@@ -34,12 +35,9 @@
3. \\ together with the escaped character
4. periods
5. simple characters
- 6. quantifiers, only following tokens 1-5
+ All with eventually trailing quantifiers
"""
- if nested: ESC = esc_brackets
- else: ESC = to_escape
- tokens = []
- i = 0
+ tokens = []; i=0
while i < len(pattern):
c = pattern[i]
if c=='[':
@@ -53,7 +51,7 @@
elif c == '.':
tokens.append('.'); i+=1
elif c == '\\':
- if i<len(pattern)-1 and pattern[i+1] in ESC:
+ if i<len(pattern)-1:
tokens.append(pattern[i:i+2]); i+=2
else:
raise ValueError('Bad successor of a backslash')
@@ -65,16 +63,19 @@
if cc == '(': lv+=1
elif cc == ')': lv-=1
resu+=cc; k+=1
- if resu not in TokenListCache.keys():
- TokenListCache[resu]=parse_pattern(resu[1:-1], nested=True)
+ i=k
+ if i<len(pattern) and pattern[i] in ('*', '+', '?'):
+ resu+=pattern[i]; i+=1
+ if resu not in TokenListCache.keys():
+ TokenListCache[resu]=parse_pattern(resu[1:-2], nested=True)
+ elif resu not in TokenListCache.keys():
+ TokenListCache[resu]=parse_pattern(resu[1:-1], nested=True)
tokens.append(resu)
- if k<len(pattern) and pattern[k] in ('*', '+', '?'):
- tokens.append(pattern[k]); k+=1
- i = k+1
+ continue
else:
tokens.append(c); i+=1
if i<len(pattern) and pattern[i] in ('*', '+', '?'):
- tokens.append(pattern[i]); i+=1
+ tokens[-1] += pattern[i]; i+=1
TokenListCache[pattern]=tokens
return tokens
@@ -82,40 +83,50 @@
compix = start; tix = 0; tkL = len(tokens)
while tix<tkL:
if compix == len(sarg): return 0
- ctk = tokens[tix]
- if nested and ctk[0] == '(':
- subtokens = TokenListCache[ctk]
- subcompix = try_index(sarg, compix, subtokens, True)
- c = 'n'
- if tix<tkL-1: c = tokens[tix+1]
- if not subcompix:
- if c in ('?', '*'): tix+=2; compix=subcompix; continue
+ ctk = tokens[tix]; qua=ctk[-1]
+ if nested :
+ if ctk[0] == '(':
+ subtokens = TokenListCache[ctk]
+ subcompix = try_index(sarg, compix, subtokens, True)
+ bak = subcompix
+ if not subcompix:
+ if qua in ('?', '*'): tix+=1; continue
+ else: return 0
+ elif qua in ('*', '+'):
+ while subcompix:
+ bak = subcompix
+ subcompix = try_index(sarg, subcompix, subtokens, True)
+ tix+=1
+ xGroups.append(sarg[compix:bak])
+ compix=bak
+ continue
+ if ctk[0]=='\\' and '0'<=ctk[1]<='9':
+ i = int(ctk[1]); captured=xGroups[i]; lg = len(captured)
+ if sarg[compix:compix+lg]==captured:
+ compix+=lg
+ if qua in ('+', '*'):
+ while sarg[compix:compix+lg]==captured: compix+=lg
+ tix+=1; continue
+ elif qua in ('?','*'): tix+=1; continue
else: return 0
- elif c in ('*', '+'):
- while subcompix:
- compix = subcompix
- subcompix = try_index(sarg, compix, subtokens, nested)
- tix+=2; continue
- qua = 'n'
- if tix<tkL-1: qua = tokens[tix+1]
if qua=='?':
- if match_token(sarg[compix], ctk): tix+=2; compix+=1
+ if match_token(sarg[compix], ctk): tix+=1; compix+=1
continue
if qua=='+':
if not match_token(sarg[compix], ctk): return 0
compix+=1
if qua in ('*','+'):
- if tix<tkL-2:
+ if tix<tkL-1:
while compix<len(sarg) and match_token(sarg[compix], ctk) and\
- not match_token(sarg[compix], tokens[tix+2]):
+ not match_token(sarg[compix], tokens[tix+1]):
compix+=1
else:
while compix<len(sarg) and match_token(sarg[compix], ctk):
compix+=1
- tix+=2
+ tix+=1
else:
if not match_token(sarg[compix], ctk): return 0
- else: compix+=1; tix+=1
+ compix+=1; tix+=1
return compix
def xsearch(sarg, pattern, start=0, nested=False):
@@ -123,6 +134,7 @@
except: tokens = parse_pattern(pattern, nested)
argix=start
while argix<len(sarg):
+ if nested: global xGroups; xGroups=[]
n = try_index(sarg, argix, tokens, nested)
if n: return (argix, n)
argix += 1
@@ -136,18 +148,19 @@
xpair = xsearch(sarg, pattern, xpair[1], nested)
return resu+sarg[residue:]
-def xreplace(sarg, pattern, f, nested=False):
+def xreplace(sarg, pattern, subst, nested=False):
to_replace=[]; s=sarg; xpair=xsearch(sarg, pattern, nested=nested)
while (xpair[0]):
to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
- for xpair in reversed(to_replace):
- s = s[:xpair[0]]+f+s[xpair[1]:]
+ if nested:
+ for i in range(10): subst=subst.replace('\\'+str(i), xGroups[i])
+ for xpair in reversed(to_replace): s = s[:xpair[0]]+subst+s[xpair[1]:]
return s
def xfuncreplace(sarg, pattern, f, nested=False):
to_replace=[]; s=sarg; xpair=xsearch(sarg, pattern, nested=nested)
while (xpair[0]):
to_replace.append(xpair); xpair=xsearch(sarg, pattern, xpair[1], nested)
- for xpair in reversed(to_replace):
+ for xpair in reversed(to_replace):
s[:xpair[0]]+f(s[xpair[0]:xpair[1]])+s[xpair[1]:]
return s