Welcome, guest | Sign In | My Account | Store | Cart

Normalise a URL by collapsing case where appropriate, trimming '..' path segments, etc.

Python, 146 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
'''urlnorm.py
(c) 1999 Mark Nottingham <mnot@pobox.com>

Routines to normalise a URL.

* lowercases the scheme and hostname
* takes out default port if present (e.g., http://www.foo.com:80/)
* collapses the path (./, ../, etc)
* removes the last character in the hostname if it is '.'
* unquotes any %-escaped characters in the path

available functions:

norms - given a URL (string), returns a normalised URL
norm - given a URL tuple, returns a normalised tuple
test - test suite
'''

# THIS SOFTWARE IS PROVIDED "AS IS", WITH NO WARRANTY OF ANY KIND
# some stuff stolen from RFC 1808.

from urlparse import urlparse, urlunparse
from urllib import unquote
from string import rfind, lower
import re


__version__ = "0.9"

_re_pat = re.compile('([^/]+/\.\./?|/\./|//|/\.$|/\.\.$)')


_default_port = {	'http': ':80',
					'https': ':443',
					'gopher': ':70',
					'news': ':119',
					'snews': ':563',
					'nntp': ':119',
					'snntp': ':563',
					'ftp': ':21',
					'telnet': ':23',
					'prospero': ':191',
				}

_relative_scheme = {	'http': 1,
						'https': 1,
						'news': 1,
						'snews': 1,
						'nntp': 1,
						'snntp': 1,
						'ftp': 1,
						'file': 1,
						'': 1
					}



def norms(urlstring=''):
	'''Return urlstring in normalised form.

	The string is split with urlparse(), the pieces are passed through
	norm(), and the result is joined back together with urlunparse().
	'''
	pieces = urlparse(urlstring)
	cleaned = norm(pieces)
	return urlunparse(cleaned)


def norm(urltuple=('','','','','','')):
	'''Normalise the pieces of a parsed URL.

	urltuple is a 6-tuple (scheme, netloc, path, parameters, query,
	fragment), the shape norms() obtains from urlparse(); a tuple of the
	same shape is returned.

	* the scheme and the host[:port] part of the netloc are lowercased;
	  a leading "user:password@" is kept exactly as given
	* the port is dropped when it is the default one for the scheme
	* a single trailing '.' on the hostname is dropped
	* for schemes listed in _relative_scheme, './', '../' and empty
	  path segments are collapsed
	* %-escapes in the path are decoded

	parameters, query and fragment are returned untouched.
	'''
	(scheme, host, path, parameters, query, fragment) = urltuple
	scheme = scheme.lower()

	if _relative_scheme.get(scheme, 0):
		# Replace one match at a time and rescan from the start: each
		# substitution can create a new match (e.g. the '//' it leaves
		# behind), so a single global pass would miss some of them.
		while 1:
			collapsed = _re_pat.sub('/', path, 1)
			if collapsed == path:
				break
			path = collapsed

	path = unquote(path)

	# Userinfo is case-sensitive and may contain ':' itself, so set it
	# aside (together with its '@') before working on host[:port].
	# With no '@' present rfind() gives -1 and userinfo is ''.
	at_idx = host.rfind('@')
	userinfo = host[:at_idx + 1]
	hostport = host[at_idx + 1:].lower()

	colon_idx = hostport.rfind(':')
	if colon_idx > 0:
		hostname = hostport[:colon_idx]
		port = hostport[colon_idx:]
		# '#' can never equal a ':port' suffix, so schemes without a
		# known default always keep their port.
		if port == _default_port.get(scheme, '#'):
			port = ''
		if hostname[-1:] == '.':
			hostname = hostname[:-1]
		hostport = hostname + port
	elif hostport[-1:] == '.':
		# The [-1:] slice is '' for an empty host, so no IndexError guard
		# is needed.
		hostport = hostport[:-1]

	return (scheme, userinfo + hostport, path, parameters, query, fragment)



def test():
	'''Run norms() over a table of sample URLs and print a report.

	For every sample, in sorted order, the original, the normalised and
	the expected string are printed, followed by "*** TEST FAILED" on a
	mismatch; the totals are printed at the end.

	Returns the tuple (n_correct, n_fail) so that a caller can check the
	outcome without parsing the printed report.
	'''
	# input URL -> expected output of norms()
	tests = {
		'/foo/bar/.': '/foo/bar/',
		'/foo/bar/./': '/foo/bar/',
		'/foo/bar/..': '/foo/',
		'/foo/bar/../': '/foo/',
		'/foo/bar/../baz': '/foo/baz',
		'/foo/bar/../..': '/',
		'/foo/bar/../../': '/',
		'/foo/bar/../../baz': '/baz',
		'/foo/bar/../../../baz': '/../baz',
		'/foo/bar/../../../../baz': '/baz',
		'/./foo': '/foo',
		'/../foo': '/../foo',
		'/foo.': '/foo.',
		'/.foo': '/.foo',
		'/foo..': '/foo..',
		'/..foo': '/..foo',
		'/./../foo': '/../foo',
		'/./foo/.': '/foo/',
		'/foo/./bar': '/foo/bar',
		'/foo/../bar': '/bar',
		'/foo//': '/foo/',
		'/foo///bar//': '/foo/bar/',
		'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
		'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
		'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
		'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
		'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
		'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
		'ftp://user:pass@ftp.foo.net/foo/bar': 'ftp://user:pass@ftp.foo.net/foo/bar',
		'-': '-',
	}

	n_correct, n_fail = 0, 0
	# Copy the keys into a real list before sorting, so this does not
	# depend on keys() returning a list.
	test_keys = list(tests.keys())
	test_keys.sort()

	# Each print() takes one pre-formatted string, which gives the same
	# output whether print is a statement or a function.
	for original in test_keys:
		print('ORIGINAL: %s' % original)
		cleaned = norms(original)

		answer = tests[original]
		print('CLEANED:  %s' % cleaned)
		print('CORRECT:  %s' % answer)
		if cleaned == answer:
			n_correct = n_correct + 1
		else:
			print('*** TEST FAILED')
			n_fail = n_fail + 1
		print('')

	print('TOTAL CORRECT: %d' % n_correct)
	print('TOTAL FAILURE: %d' % n_fail)
	return (n_correct, n_fail)

2 comments

Ryan Williams 22 years, 9 months ago  # | flag

Sequence instead of Dictionary? Instead of _relative_scheme being a dictionary, it might be simpler to make it a sequence ("_relative_scheme = ['http', 'https', ...]").

The line "if _relative_scheme.get(scheme, 0):" could then be replaced with the prettier "if scheme in _relative_scheme:".

Alex Martelli 22 years, 6 months ago  # | flag

Best of both worlds. In Python 2.2, you can use 'in' with a dictionary as the right-hand operand, so you get both the nice syntax AND the dictionary's speed.