A simple three-function utility library for downloading, parsing and capitalizing ISO 3166 country names.
#!/usr/bin/env python
"""
A small utility library for processing ISO 3166 country codes.
References
ISO 3166 home page URL :-
http://www.iso.org/iso/country_codes.htm
ISO 3166-1-alpha-2 text file URL :-
http://www.iso.org/iso/list-en1-semic-3.txt
"""
import os
import urllib2
#-----------------------------------------------------------------------------
def get_lastest_iso3166(filename=None, url=None):
    """
    Download the latest ISO 3166-1 alpha-2 country code text file.

    The bytes served at ``url`` are written, unmodified, to ``filename``.

    :param filename: path of the file to write. When omitted, the file is
        saved in the directory containing this module (not the current
        working directory) and is named after the basename of the URL
        reported by the response.
    :param url: location to download from. When omitted, the default
        iso.org ISO 3166 list URL is used.
    :return: the path the data was written to.

    Errors raised by ``urllib2`` while downloading, or by the file system
    while writing, are propagated to the caller.
    """
    # Pick default URL if not specified.
    if url is None:
        url = 'http://www.iso.org/iso/list-en1-semic-3.txt'

    print('downloading %s' % url)

    response = urllib2.urlopen(urllib2.Request(url))
    try:
        # Read everything before touching the disk so that a failed download
        # never leaves an empty or truncated file behind.
        final_url = response.geturl()
        payload = response.read()
    finally:
        # Always release the connection, even if reading failed.
        response.close()

    if filename is None:
        save_path = os.path.dirname(__file__)
        filename = os.path.join(save_path, os.path.basename(final_url))

    # The context manager guarantees the handle is closed if write() raises.
    with open(filename, 'wb') as fh:
        fh.write(payload)

    return filename
#-----------------------------------------------------------------------------
def capitalize_country_name(cname):
    """
    Fix capitalization edge cases in ISO 3166 country names.

    :param cname: a country name, typically all upper case as published in
        the ISO 3166 text file (e.g. ``'VIRGIN ISLANDS, U.S.'``).
    :return: the name with each word capitalized, except for the special
        cases handled below (small joining words, hyphenated and
        parenthesised words, ``U.S.``, ``McDonald`` and ``D'Ivoire``).
        Runs of whitespace are collapsed to single spaces; an empty string
        yields an empty string.
    """
    # Replace some non-ASCII character codes (E-acute, O-circumflex and
    # A-ring in Latin-1) with ASCII equivalents.  O-circumflex must map to
    # the letter 'o', not the digit '0', otherwise "COTE" comes out as
    # "C0te".  The case of the replacement is irrelevant because every token
    # is re-capitalized below.
    for accented, plain in (("\xc9", 'e'), ("\xd4", 'o'), ("\xc5", 'a')):
        cname = cname.replace(accented, plain)

    tokens = [t.capitalize() for t in cname.split()]

    for (i, t) in enumerate(tokens):
        # General cases.
        if t.upper().startswith('(U.'):
            # Parenthesised abbreviation such as "(U.S.)": keep upper case.
            tokens[i] = t.upper()
        elif t[0] == '(':
            # capitalize() leaves the word after '(' in lower case.
            tokens[i] = '(' + t[1:].capitalize()
        elif '-' in t:
            # Capitalize both halves of names like "Guinea-Bissau".
            tokens[i] = '-'.join([e.capitalize() for e in t.split('-')])

        # Some annoying special cases :-)
        # These compare against the originally capitalized token and
        # overwrite whatever the general cases produced.
        if t.lower() in ('of', 'and', 'the', 'former'):
            tokens[i] = t.lower()
        elif t == "D'ivoire":
            tokens[i] = "D'Ivoire"
        elif t == "Mcdonald":
            tokens[i] = "McDonald"
        elif t.upper() == "U.S.":
            tokens[i] = "U.S."

    return ' '.join(tokens)
#-----------------------------------------------------------------------------
def iter_capitalized_iso3166(filename):
    """
    A generator function that parses the free ISO 3166-1 alpha-2 country
    codes text file, yielding a tuple containing the alpha-2 code and country
    name (in that order) for each record.

    Country names are capitalized correctly (taking edge cases into account).

    The first two lines of the file are skipped, as are blank lines. Every
    other line must have the form ``NAME;CODE``; a line that does not split
    into exactly two fields on ``';'`` raises ``ValueError``.

    :param filename: path of the ISO 3166 text file to read.

    Example :-

        ('AF', 'Afghanistan')
        ('AX', 'Aland Islands')
        ('AL', 'Albania')
        ('DZ', 'Algeria')
        ('AS', 'American Samoa')
        ...

    References

        ISO 3166 home page URL :-
            http://www.iso.org/iso/country_codes.htm

        ISO 3166-1-alpha-2 text file URL :-
            http://www.iso.org/iso/list-en1-semic-3.txt
    """
    # The context manager closes the file once the generator is exhausted,
    # closed or garbage collected, instead of leaking the handle.
    with open(filename) as fh:
        for index, line in enumerate(fh):
            # Skip the two leading non-record lines (presumably the column
            # header and the separator line after it - verify against the
            # current file format).
            if index < 2:
                continue

            record = line.strip()
            if not record:
                # A blank line (e.g. trailing newline at end of file) is not
                # a record and would otherwise break the unpacking below.
                continue

            cname, cc = record.split(';')
            yield (cc, capitalize_country_name(cname))
#-----------------------------------------------------------------------------
if __name__ == '__main__':
    # Download to an explicit path and parse that very same path.  Left to
    # its default, get_lastest_iso3166() saves next to this module, whereas a
    # bare relative file name is read from the current working directory, so
    # the two only agreed when the script was run from its own directory.
    data_file = os.path.join(os.path.dirname(__file__),
                             'list-en1-semic-3.txt')
    get_lastest_iso3166(data_file)
    for record in iter_capitalized_iso3166(data_file):
        print('%s - %s' % record)
|
Does what it says on the tin.
This was knocked together really quickly, but it should save others some time and make country names in your applications look neat and tidy. It handles a bunch of irksome edge cases.