Normalise a URL by collapsing case where appropriate, trimming '..' path segments, etc.
'''urlnorm.py
(c) 1999 Mark Nottingham <mnot@pobox.com>
Routines to normalise a URL.
* lowercases the scheme and hostname
* takes out default port if present (e.g., http://www.foo.com:80/)
* collapses the path (./, ../, etc)
* removes the last character in the hostname if it is '.'
* unquotes any %-escaped characters
available functions:
norms - given a URL (string), returns a normalised URL
norm - given a URL tuple, returns a normalised tuple
test - test suite
'''
# THIS SOFTWARE IS PROVIDED "AS IS", WITH NO WARRANTY OF ANY KIND
# some stuff stolen from RFC 1808.
from urlparse import urlparse, urlunparse
from urllib import unquote
from string import rfind, lower
import re
__version__ = "0.9"

# One collapsible path fragment.  norm() replaces the leftmost match with a
# single '/' and repeats until the path stops changing.  Alternatives:
#   [^/]+/\.\.(?:/|$)   a segment followed by a '..' segment
#   /\./                a '.' segment in the middle of the path
#   //                  an empty segment
#   /\.$                a trailing '.' segment
#   /\.\.$              a trailing '..' segment
# '..' only counts when it is a complete segment (the next character is '/'
# or the end of the string), so names that merely start with two dots, such
# as '/foo/..bar' or '/foo/...', are left untouched.
_re_pat = re.compile(r'([^/]+/\.\.(?:/|$)|/\./|//|/\.$|/\.\.$)')

# scheme -> ':port' suffix (leading colon included) that norm() strips from
# the end of the network location because it is the scheme's default port.
_default_port = {
    'http': ':80',
    'https': ':443',
    'gopher': ':70',
    'news': ':119',
    'snews': ':563',
    'nntp': ':119',
    'snntp': ':563',
    'ftp': ':21',
    'telnet': ':23',
    'prospero': ':191',
}

# Schemes whose path norm() collapses ('./', '../', '//').  Used as a set:
# only key membership matters, the value 1 is just a true flag.  The empty
# scheme covers bare paths such as '/foo/../bar'.
_relative_scheme = {
    'http': 1,
    'https': 1,
    'news': 1,
    'snews': 1,
    'nntp': 1,
    'snntp': 1,
    'ftp': 1,
    'file': 1,
    '': 1,
}
def norms(urlstring=''):
    """Return the normalised form of the URL string *urlstring*.

    The string is split with urlparse(), the resulting tuple is handed to
    norm(), and the normalised tuple is put back together with urlunparse().
    """
    parts = urlparse(urlstring)
    normalised = norm(parts)
    return urlunparse(normalised)
def norm(urltuple=('', '', '', '', '', '')):
    """Normalise a URL given as a 6-tuple and return the normalised 6-tuple.

    *urltuple* is (scheme, netloc, path, parameters, query, fragment), the
    shape produced by urlparse().  The following steps are applied:

    * the scheme and the host[:port] part of the netloc are lowercased; a
      leading 'userinfo@' part is kept verbatim, because user names and
      passwords are case-sensitive
    * for schemes listed in _relative_scheme, path fragments matched by
      _re_pat ('./', '../', '//' and friends) are collapsed
    * %-escapes in the path are unquoted
    * a port equal to the scheme's entry in _default_port is dropped
    * a single trailing '.' on the host name is dropped

    parameters, query and fragment are returned unchanged.
    """
    (scheme, netloc, path, parameters, query, fragment) = urltuple
    scheme = scheme.lower()

    # Split off any credentials first: only host[:port] is case-insensitive,
    # and a ':' inside 'user:password@' must not be mistaken for the port
    # separator below.
    at_idx = netloc.rfind('@')
    userinfo = netloc[:at_idx + 1]      # '' when there is no '@' (rfind -> -1)
    host = netloc[at_idx + 1:].lower()

    if _relative_scheme.get(scheme, 0):
        # Replace one fragment at a time until nothing changes; a single
        # pass is not enough because each replacement can expose a new
        # match (e.g. 'a/b/../..' -> 'a//..' -> 'a/..' -> '/').
        while 1:
            collapsed = _re_pat.sub('/', path, 1)
            if collapsed == path:
                break
            path = collapsed
    # Unquote only after collapsing, so escaped dots are never treated as
    # './' or '../' segments.
    path = unquote(path)

    colon_idx = host.rfind(':')
    if colon_idx > 0:
        # '#' can never equal a ':port' suffix, so schemes without a known
        # default port simply never match.
        if host[colon_idx:] == _default_port.get(scheme, '#'):
            host = host[:colon_idx]
        # Deliberately a second 'if', not 'elif': the dot before the port
        # is dropped whether or not the port itself was removed.
        if host[colon_idx - 1] == '.':
            host = host[:colon_idx - 1] + host[colon_idx:]
    elif host.endswith('.'):
        host = host[:-1]

    return (scheme, userinfo + host, path, parameters, query, fragment)
def test():
tests = { '/foo/bar/.': '/foo/bar/',
'/foo/bar/./': '/foo/bar/',
'/foo/bar/..': '/foo/',
'/foo/bar/../': '/foo/',
'/foo/bar/../baz': '/foo/baz',
'/foo/bar/../..': '/',
'/foo/bar/../../': '/',
'/foo/bar/../../baz': '/baz',
'/foo/bar/../../../baz': '/../baz',
'/foo/bar/../../../../baz': '/baz',
'/./foo': '/foo',
'/../foo': '/../foo',
'/foo.': '/foo.',
'/.foo': '/.foo',
'/foo..': '/foo..',
'/..foo': '/..foo',
'/./../foo': '/../foo',
'/./foo/.': '/foo/',
'/foo/./bar': '/foo/bar',
'/foo/../bar': '/bar',
'/foo//': '/foo/',
'/foo///bar//': '/foo/bar/',
'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
'ftp://user:pass@ftp.foo.net/foo/bar': 'ftp://user:pass@ftp.foo.net/foo/bar',
'-': '-',
}
n_correct, n_fail = 0, 0
test_keys = tests.keys()
test_keys.sort()
for i in test_keys:
print 'ORIGINAL:', i
cleaned = norms(i)
answer = tests[i]
print 'CLEANED: ', cleaned
print 'CORRECT: ', answer
if cleaned != answer:
print "*** TEST FAILED"
n_fail = n_fail + 1
else:
n_correct = n_correct + 1
print
print "TOTAL CORRECT:", n_correct
print "TOTAL FAILURE:", n_fail
|
also available at http://www.mnot.net/python/urlnorm.py
Tags: web
Sequence instead of Dictionary? Instead of _relative_scheme being a dictionary, it might be simpler to make it a sequence ("_relative_scheme = ['http', 'https', ...").
The line "if _relative_scheme.get(scheme, 0):" could then be replaced with the prettier "if scheme in _relative_scheme:".
Best of both worlds. In Python 2.2 and later, you can use 'in' with a dictionary as the right-hand operand, so you get both the nice syntax and the dictionary's lookup speed.