Welcome, guest | Sign In | My Account | Store | Cart

This is a set of classes I have used to parse URLs in an easy and convenient manner. They expose each part of a URL as a plain attribute, which keeps the calling code clean.

Python, 118 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from urlparse import urlparse

class Authority(dict):
    '''
    Parse the authority part of a URI: ``[user[:password]@]host[:port]``.

    The four components are available both as attributes and as dict
    items under the keys 'user', 'password', 'host' and 'port'.  A
    component that is absent is None; the port is an int, or None when
    it is absent or not a valid integer.  The unparsed input is kept in
    the ``auth`` attribute.

    >>> a = Authority('username:asecret@host.com:443')
    >>> a.user
    'username'
    >>> a.password
    'asecret'
    >>> a.host
    'host.com'
    >>> a.port
    443

    # You can also access these like a dict, if you prefer
    >>> a['user']
    'username'
    >>> a['password']
    'asecret'

    # What about an authority section  with no password?
    >>> a = Authority('username@host.com:443')
    >>> a.user
    'username'
    >>> a.password # no return value

    # What about an empty password?
    >>> a = Authority('username:@host.com:443')
    >>> a.user
    'username'
    >>> a.password
    ''

    # A simpler authority:
    >>> a = Authority('host.com:8080')
    >>> a.host
    'host.com'
    >>> a.port
    8080

    # The host is whatever follows the LAST '@'; the password is
    # everything after the FIRST ':' of the user-info part.
    >>> a = Authority('user:p@ss:word@host.com')
    >>> (a.user, a.password, a.host, a.port)
    ('user', 'p@ss:word', 'host.com', None)

    # A bracketed IPv6 literal keeps its brackets and inner colons.
    >>> a = Authority('[::1]:8080')
    >>> (a.host, a.port)
    ('[::1]', 8080)
    '''
    def __init__(self, auth):
        self.auth = auth

        # Split on the last '@' so that an '@' inside the credentials
        # cannot be mistaken for the user-info/host separator.
        userinfo, at_sign, hostport = auth.rpartition('@')
        if at_sign:
            # Only the first ':' separates user from password; any later
            # ':' belongs to the password itself.
            self.user, colon, password = userinfo.partition(':')
            if colon:
                self.password = password
            else:
                # 'user@host' has no password at all, as opposed to
                # 'user:@host' which has an empty one.
                self.password = None
        else:
            self.user = self.password = None

        if hostport.startswith('['):
            # Bracketed IP literal: the colons inside the brackets are
            # part of the host, so the port can only follow the ']'.
            close = hostport.find(']')
            if close == -1:
                # Unterminated bracket: treat everything as the host.
                host, tail = hostport, ''
            else:
                host, tail = hostport[:close + 1], hostport[close + 1:]
            if tail.startswith(':'):
                port_text = tail[1:]
            else:
                port_text = ''
        else:
            host, _, port_text = hostport.partition(':')
        self.host = host

        try:
            self.port = int(port_text)
        except ValueError:
            # Missing or non-numeric port.
            self.port = None

        # Mirror the attributes into the dict so both access styles work.
        for attr in ('user', 'password', 'host', 'port'):
            self[attr] = getattr(self, attr)
            

class UriQuery(dict):
    '''
    Parse a URI query string ('k1=v1&k2=v2') into a dict of raw strings.

    Keys and values are stored exactly as they appear in the query; no
    percent- or plus-decoding is performed.  When a key occurs more than
    once the last value wins.  A parameter without '=' gets the empty
    string as its value, and empty segments ('a=1&&b=2') are ignored.
    The unparsed input is kept in the ``query`` attribute.

    Each parameter is also exposed as an attribute, unless that name is
    already taken on the instance (dict methods such as ``keys``, or the
    ``query`` attribute itself); such parameters are only reachable
    through item access.

    >>> query_string = 'num=100&q=twisted+python&btnG=Search'
    >>> query = UriQuery(query_string)
    >>> query['q']
    'twisted+python'
    >>> query.q
    'twisted+python'
    >>> sorted(query.keys())
    ['btnG', 'num', 'q']
    >>> sorted(query.values())
    ['100', 'Search', 'twisted+python']

    # Values may themselves contain '=', and flags need no value
    >>> query = UriQuery('expr=a=b&debug')
    >>> query['expr']
    'a=b'
    >>> query['debug']
    ''
    '''
    def __init__(self, query):
        self.query = query
        if not query:
            return

        for pair in query.split('&'):
            if not pair:
                # Produced by '&&' or a leading/trailing '&'.
                continue
            # Split on the first '=' only, so '=' inside the value
            # survives and a bare flag yields an empty value.
            key, _, value = pair.partition('=')
            self[key] = value

        for key, value in self.items():
            # Never let a query parameter replace a dict method or the
            # 'query' attribute set above.
            if not hasattr(self, key):
                setattr(self, key, value)

class Uri(object):
    '''
    Split a URI into its components and expose them as attributes.

    The string is broken up with urlparse().  ``scheme``, ``params`` and
    ``fragment`` are stored as returned; ``path`` falls back to '/' when
    it is empty; the network location is wrapped in an Authority
    (``authority``) and the query string in a UriQuery (``query``).  The
    authority's ``user``, ``password``, ``host`` and ``port`` are copied
    onto the Uri itself as shortcuts.

    >>> url = 'ftp://username:asecret@ftp.funet.fi/pub/NeXT'
    >>> uri = Uri(url)
    >>> (uri.scheme, uri.user, uri.password, uri.host, uri.port, uri.path)
    ('ftp', 'username', 'asecret', 'ftp.funet.fi', None, '/pub/NeXT')

    >>> url = 'http://www.google.com/search?num=100&q=twisted+python&btnG=Search'
    >>> uri = Uri(url)
    >>> (uri.scheme, uri.path)
    ('http', '/search')
    >>> uri.query.q
    'twisted+python'
    '''
    def __init__(self, uri):
        scheme, netloc, path, params, query, fragment = urlparse(uri)

        self.scheme = scheme
        self.params = params
        self.fragment = fragment

        self.authority = Authority(netloc)

        # An empty path is reported as the root path.
        if not path:
            path = '/'
        self.path = path

        self.query = UriQuery(query)

        # Shortcuts so callers need not go through .authority
        for name in ('user', 'password', 'host', 'port'):
            setattr(self, name, getattr(self.authority, name))

def _test():
    '''
    Run the doctests embedded in this module's docstrings.

    The module object is looked up through its own __name__, so the
    tests run whatever the file is called and whether it is executed as
    a script or imported.  doctest prints a report only for failing
    examples.
    '''
    import doctest
    import sys
    doctest.testmod(sys.modules[__name__])

# Run the doctest suite only when this file is executed directly as a script.
if __name__ == "__main__":
    _test()

The doctests should really explain everything.

Also, there is every chance that I have violated one aspect or another of the RFC. Use at your own risk.

See http://www.faqs.org/rfcs/rfc2396.html for more info on the URI spec (RFC 2396, which has since been obsoleted by RFC 3986).