I was once asked to explain how the following regular expression works. It formats any integer by inserting a comma between each group of 3 digits (i.e. at every thousand):
(\d)(?=(\d{3})+$)
Example:
>>> import re
>>> re.sub(r"(\d)(?=(\d{3})+$)", r"\1,", "1234")
'1,234'
So here is an implementation of the above regular expression (as best I could manage over a lunch break) that will hopefully highlight how a regular expression engine and finite automata work.
Comments and feedback welcome!
--JamesMills / prologic
#!/usr/bin/env python
"""Format Number separating groups of 3 digits
This implements a simple state machine that implements the following
regular expression (which achieves the same goal):
(\d)(?=(\d{3})+$)
"""
def get_groups(s):
    """Split *s* into consecutive groups of exactly three digits.

    Returns the list of 3-character groups when every group is three
    characters long and consists only of digits (an empty list for empty
    input).  Returns None as soon as a group is too short or contains a
    non-digit.
    """
    chunks = []
    for start in range(0, len(s), 3):
        chunk = s[start:start + 3]
        # A short tail or any non-digit means the whole look-ahead fails.
        if len(chunk) != 3 or not "".join(chunk).isdigit():
            return None
        chunks.append(chunk)
    return chunks
def generate_format(s):
    """Yield the pieces of *s* with "," inserted between trailing 3-digit groups.

    The string is scanned left to right.  At each digit, the remainder of the
    string is handed to ``get_groups`` as a look-ahead: when the remainder is
    one or more complete groups of three digits, the digit is emitted,
    followed by "," and the group for every group, and the scan stops because
    the rest of the string has been consumed.  Otherwise the character is
    emitted unchanged and the scan moves on.

    Characters that are not digits are passed through untouched, as a
    regular-expression substitution would leave unmatched text in place.

    Joining the yielded pieces with "" gives the formatted string.
    """
    for pos, char in enumerate(s):
        if char.isdigit():
            # Look-ahead: is everything after this digit whole groups of 3?
            groups = get_groups(s[pos + 1:])
            if groups:
                yield char
                for group in groups:
                    yield ","
                    yield group
                # The groups cover the rest of the string; nothing is left.
                return
        # No match at this position (or not a digit): copy the character.
        yield char
def format(x):
    """Return ``str(x)`` with "," separating each group of 3 integer digits.

    Accepts any value whose ``str()`` is an optionally signed run of digits,
    optionally followed by "." and more digits.  Only the integer part is
    grouped; the sign and the fractional part are copied through unchanged.

    Returns None when the text is not in that form (for example "1e+20" or
    "abc").

    NOTE: this name shadows the ``format`` builtin within this module.
    """
    s = str(x)
    # Peel off a leading sign so the digit checks below see digits only;
    # otherwise every negative number is rejected and None is returned.
    sign = ""
    if s[:1] in ("-", "+"):
        sign, s = s[0], s[1:]
    whole, dot, fraction = s.partition(".")
    if dot:
        if whole.isdigit() and fraction.isdigit():
            return "%s%s.%s" % (sign, "".join(generate_format(whole)), fraction)
        return None
    if whole.isdigit():
        return sign + "".join(generate_format(whole))
    return None
# Demo: print the formatted form of sample values of increasing length,
# ending with one that has a fractional part.
for sample in (
    1,
    12,
    123,
    1234,
    12345,
    123456,
    12345678,
    123456789,
    123456789.1234,
):
    print(format(sample))
|
Tags: formatting, regular_expressions