1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import re
class WordsToNumbers(object):
    """Translate English number words into the integer they describe.

    Supported vocabulary: the ones and teens ("one" .. "nineteen"), the
    tens ("ten" .. "ninety"), "hundred", and the group names "thousand",
    "million", "billion" and "trillion".  Example::

        WordsToNumbers().parse("five thousand nineteen")  # -> 5019
    """

    # a mapping of digits to their names when they appear in the
    # relative "ones" place (this includes the 'teens' because they are
    # an odd case where numbers that might otherwise be called
    # 'ten one', 'ten two', etc. have their own single-word names)
    __ones__ = {
        'one': 1, 'eleven': 11,
        'two': 2, 'twelve': 12,
        'three': 3, 'thirteen': 13,
        'four': 4, 'fourteen': 14,
        'five': 5, 'fifteen': 15,
        'six': 6, 'sixteen': 16,
        'seven': 7, 'seventeen': 17,
        'eight': 8, 'eighteen': 18,
        'nine': 9, 'nineteen': 19,
    }

    # a mapping of digits to their names when they appear in the 'tens'
    # place within a number group
    __tens__ = {
        'ten': 10,
        'twenty': 20,
        'thirty': 30,
        'forty': 40,
        'fifty': 50,
        'sixty': 60,
        'seventy': 70,
        'eighty': 80,
        'ninety': 90,
    }

    # the names assigned to number groups and the multiplier of each
    __groups__ = {
        'thousand': 1000,
        'million': 1000000,
        'billion': 1000000000,
        'trillion': 1000000000000,
    }

    # every word parse() accepts once the input has been normalised; used
    # to reject unknown text up front instead of letting findall() skip it
    __vocabulary__ = frozenset(
        list(__ones__) + list(__tens__) + list(__groups__) + ['hundred']
    )

    # a regular expression that looks for number group names and captures:
    #   1-the string that precedes the group name, and
    #   2-the group name (or an empty string if the captured value is
    #     simply the end of the string, indicating the 'ones' group,
    #     which is typically not expressed)
    __groups_re__ = re.compile(
        r'\s?([\w\s]+?)(?:\s((?:%s))|$)' %
        ('|'.join(__groups__))
    )

    # a regular expression that looks within a single number group for
    # 'n hundred' and captures:
    #   1-the string that precedes the 'hundred', and
    #   2-the string that follows the 'hundred', which is the group's
    #     tens- and ones-place value (None when nothing follows)
    __hundreds_re__ = re.compile(r'([\w\s]+)\shundred(?:\s(.*)|$)')

    # a regular expression that looks within a single number group that
    # has already had its 'hundreds' value extracted for a 'tens ones'
    # pattern (ie. 'forty two') and captures:
    #   1-the tens
    #   2-the ones (None when only the tens word is present)
    __tens_and_ones_re__ = re.compile(
        r'((?:%s))(?:\s(.*)|$)' %
        ('|'.join(__tens__.keys()))
    )

    def parse(self, words):
        """Parse a number written in English words and return it as an int.

        The input is case-insensitive.  Hyphens and commas are treated as
        word separators, runs of whitespace are collapsed, and the filler
        word "and" is ignored, so "Twenty-Six" and "one hundred and five"
        are accepted.  The single word "zero" yields 0.

        Args:
            words: the phrase to translate, e.g. "three hundred forty five".

        Returns:
            The integer described by ``words``; 0 when no number words
            remain after normalisation (for example the empty string).

        Raises:
            KeyError: if ``words`` contains a word outside the supported
                vocabulary, or the words do not combine into a supported
                phrase (for example "one two").
        """
        # normalise first: the regular expressions below expect lower-case
        # words separated by exactly one space
        tokens = re.sub(r'[-,]', ' ', words.lower()).split()
        tokens = [token for token in tokens if token != 'and']

        # "zero" only makes sense on its own, so it is handled here rather
        # than being added to the ones table
        if tokens == ['zero']:
            return 0

        # findall() silently skips text it cannot match, which used to
        # turn e.g. "twenty-six" into 6; fail loudly on unknown words
        for token in tokens:
            if token not in WordsToNumbers.__vocabulary__:
                raise KeyError(token)
        words = ' '.join(tokens)

        # the number that shall eventually return to the caller
        num = 0

        # find every number group and accumulate its value
        for group_words, group_name in \
                WordsToNumbers.__groups_re__.findall(words):
            # an empty group name means the trailing, unnamed 'ones' group
            group_multiplier = WordsToNumbers.__groups__.get(group_name, 1)
            group_num = 0

            # split off an 'n hundred' prefix when present
            hundreds_match = WordsToNumbers.__hundreds_re__.match(group_words)
            if hundreds_match is not None:
                group_num += \
                    WordsToNumbers.__ones__[hundreds_match.group(1)] * 100
                # whatever follows 'hundred' (None when nothing does)
                tens_and_ones = hundreds_match.group(2)
            else:
                # no 'n hundred': the whole group is tens and ones
                tens_and_ones = group_words

            if tens_and_ones is not None:
                # look for the tens and ones ('tn1' to shorten the code)
                tn1_match = \
                    WordsToNumbers.__tens_and_ones_re__.match(tens_and_ones)
                if tn1_match is not None:
                    group_num += WordsToNumbers.__tens__[tn1_match.group(1)]
                    if tn1_match.group(2) is not None:
                        group_num += \
                            WordsToNumbers.__ones__[tn1_match.group(2)]
                else:
                    # only a ones-place (or teens) word remains
                    group_num += WordsToNumbers.__ones__[tens_and_ones]

            # add this group's value, scaled by its multiplier
            num += group_num * group_multiplier

        return num
if __name__ == "__main__":
    # demo: translate a handful of sample phrases and print each result
    nums = [
        "one", "twenty six", "one hundred", "five thousand nineteen",
        "one hundred forty", "three hundred forty five",
        "sixteen hundred", "one thousand six hundred",
        "fifty five thousand", "fifty five thousand six",
        "six hundred twenty three million twenty one thousand forty one",
    ]
    wtn = WordsToNumbers()
    for num in nums:
        # a single parenthesised argument prints identically whether
        # print is the Python 2 statement or the Python 3 function; the
        # format reproduces the original "<words> :  <number>" output
        print("%s :  %s" % (num, wtn.parse(num)))