Welcome, guest | Sign In | My Account | Store | Cart

There are several ways to represent collections of key/value pairs in XML, which makes it more difficult than necessary to use that data in Python. By taking advantage of common patterns in the XML, it is easy to turn most formats into collections of lists and dicts.

Python, 137 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import re
from elementtree import ElementTree as ET

def main():
    """Demo: convert a sample Yahoo geocoding response and pretty-print it."""
    from pprint import pprint

    # Sample response taken from
    # http://developer.yahoo.com/maps/rest/V1/geocode.html
    sample_xml = """\
<?xml version="1.0" encoding="UTF-8"?>
<ResultSet xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="urn:yahoo:maps"
xsi:schemaLocation="urn:yahoo:maps http://api.local.yahoo.com/MapsService/V1/GeocodeResponse.xsd">
  <Result precision="address">
    <Latitude>37.416384</Latitude>
    <Longitude>-122.024853</Longitude>
    <Address>701 FIRST AVE</Address>
    <City>SUNNYVALE</City>
    <State>CA</State>
    <Zip>94089-1019</Zip>
    <Country>US</Country>
  </Result>
</ResultSet>
    """

    # "ResultSet" is handled as a list of its children, while each
    # "Result" becomes a dict built from its child elements.
    converters = {
        "ResultSet": list_of_children,
        "Result": children_are_mapping,
    }
    extractor = XMLDataExtractor(converters, no_ns=True, downcase=True)

    root = ET.fromstring(sample_xml)
    pprint(extractor.from_elem(root))


"""
Result:

[{'address': '701 FIRST AVE',
  'city': 'SUNNYVALE',
  'country': 'US',
  'latitude': '37.416384',
  'longitude': '-122.024853',
  'state': 'CA',
  'zip': '94089-1019'}]
"""


def identity(trans, elem):
    """Pass-through converter: hand back 'elem' exactly as received.

    'trans' is accepted only so the signature matches the other
    converters; it is not used.
    """
    return elem

def attr_are_mapping(trans, elem):
    """Use the attributes of 'elem' as its key/value pairs.

    Returns a new dict copied from 'elem.attrib', so changing the result
    does not touch the element.  'trans' is not used.
    """
    pairs = {}
    pairs.update(elem.attrib)
    return pairs


def list_of_children(trans, elem):
    """Convert every child of 'elem' and return the results as a list.

    trans: the extractor driving the conversion; its from_elem() method
           is applied to each child in turn
    elem:  the parent element; iterating over it yields its children

    A list comprehension is used instead of map() so that a real list is
    returned on Python 3 too, where map() gives back a lazy iterator
    (which cannot be indexed, measured with len(), or pretty-printed).
    """
    return [trans.from_elem(child) for child in elem]

def children_are_mapping(trans, elem):
    """Build a dict from the children of 'elem'.

    Each child contributes one entry: the key is its normalized tag name
    (via trans.tagnorm) and the value is its inner text.  A child that
    has children of its own is converted recursively with
    trans.from_elem instead.  If two children normalize to the same key,
    the later one wins.
    """
    mapping = {}
    for child in elem:
        name = trans.tagnorm(child)
        # len() of an element is its number of sub-elements
        mapping[name] = trans.from_elem(child) if len(child) else child.text
    return mapping

def children_and_attr(trans, elem):
    """Combine the child elements and the attributes of 'elem' into one
    dict of key/value pairs.

    Children are converted first (see children_are_mapping); attributes
    are then laid on top, so an attribute replaces a child entry that
    has the same key.  Attribute names are used as-is.
    """
    merged = children_are_mapping(trans, elem)
    for name, value in elem.attrib.items():
        merged[name] = value
    return merged


class XMLDataExtractor:
    """Turn elements into plain data structures by dispatching on tag
    name to a table of converter functions."""

    # Matches a "{namespace-uri}" qualifier inside a tag name
    STRIP_NS = re.compile(r"{.*}")

    def __init__(self, tag_convert, no_ns=False, downcase=False):
        """
        tag_convert: a map from tag names to conversion functions; each
                     function is called as fn(extractor, elem)
        no_ns: if True, ignore namespaces
        downcase: downcase all resulting tag names
        """
        # The flags must be in place before tagnorm() is used below.
        self.no_ns = no_ns
        self.downcase = downcase

        # Store the table under normalized keys so lookups in from_elem
        # are compared on the same footing as the element tags.
        self.tag_convert = dict(
            (self.tagnorm(name), converter)
            for name, converter in tag_convert.items()
        )

    def from_elem(self, elem):
        "Convert this element to a useful datastructure"
        # Tags without a registered converter fall back to 'identity'.
        converter = self.tag_convert.get(self.tagnorm(elem), identity)
        return converter(self, elem)

    def tagnorm(self, tag_or_elem):
        """Return the normalized tag name for 'tag_or_elem', which may be
        an Element or a tag string.  The namespace is stripped when
        no_ns is set, and the name is lower-cased when downcase is set."""
        if ET.iselement(tag_or_elem):
            name = tag_or_elem.tag
        else:
            name = tag_or_elem

        if self.no_ns:
            name = self.STRIP_NS.sub('', name)
        if self.downcase:
            name = name.lower()
        return name

        


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Since there is no standard way to represent a mapping (collection of key/value pairs) in XML, consumers of XML web services are left with the equally unpalatable options of either using some heavyweight schema language or manually extracting the data.

If we identify some common patterns in XML documents it is possible to trivially extract useful dicts and lists from a variety of different formats. The patterns that seem most common are:

  1. Child elements are key/value pairs, with the inner text element as the value and the tag as the key. Example: Yahoo Geocoding API
  2. A tag's attributes are the key/value pairs. Example: del.icio.us API
  3. A combination of #1 and #2. Example: Flickr API

This solution generalizes to many data formats. For example, for the Flickr API the set of tag converters would be:

tags_convert = {"rsp": list_of_children,
                "person": children_and_attr,
                "photos": children_are_mapping}

And for del.icio.us:

tags_convert = {"posts": list_of_children,
                "post": attr_are_mapping}