This script acts like xml2. It transforms an XML file into flat text output with an XPath-like syntax, one line per XML node or attribute. This format is more suitable for working with standard Unix CLI utilities (sed, grep, etc.).
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | #!/usr/bin/python
# Import
import xml.etree.ElementTree as ET
import sys
def removeNS(tag) :
    """Return `tag` without its leading ``{namespace}`` part.

    Everything up to and including the first '}' is dropped. A tag that
    contains no '}' is returned unchanged.
    """
    # partition() yields an empty separator when no '}' is present
    _, brace, local = tag.partition('}')
    return local if brace else tag
def linearize(el, path) :
    """Print element `el` and, recursively, its child elements in flat form.

    Lines are written to standard output in this order:

    - the element itself: ``path`` alone when it has no text (or only
      whitespace), ``path=text`` for a single-line value, or one
      ``path[line N]=...`` line per line of a multi-line value;
    - one ``path/@name=value`` line per attribute;
    - the lines of each child element, in document order. A child whose
      tag name was already used by a previous sibling gets a ``[N]``
      suffix (starting at ``[2]``).

    Namespaces are stripped from tag and attribute names, and the text
    value is stripped of leading and trailing whitespace.

    el   -- the ElementTree element to print
    path -- XPath-like location of `el`, used as prefix of every line
    """
    # el.text is None for an element without any text (e.g. <foo/>) :
    # treat it as an empty value instead of failing on None.strip()
    text = (el.text or "").strip()

    # Print text value if not empty.
    # NB : print() is always called with a single string, so it behaves
    # the same under Python 2 and Python 3.
    if text == "" :
        print(path)
    else :
        # Several lines ?
        lines = text.splitlines()
        if len(lines) > 1 :
            for lineNb, line in enumerate(lines, 1) :
                # The trailing space is part of the historical output format
                print(path + "[line %d]=%s " % (lineNb, line))
        else :
            print(path + "=" + text)

    # Print attributes
    for name, val in el.items() :
        print(path + "/@" + removeNS(name) + "=" + val)

    # Counter on the sibling element names
    counters = {}

    # Loop on child elements
    for childEl in el :
        # Remove namespace
        tag = removeNS(childEl.tag)

        # Count this occurrence of the tag name among its siblings
        counters[tag] = counters.get(tag, 0) + 1

        # Only the second and following occurrences are numbered
        if counters[tag] > 1 :
            numberedTag = tag + "[" + str(counters[tag]) + "]"
        else :
            numberedTag = tag

        # Print child node recursively
        linearize(childEl, path + '/' + numberedTag)
# Flatten one XML document
def process(stream, prefix) :
    """Parse the XML document from `stream` and print it in flat form.

    stream -- source handed to ElementTree's parse()
    prefix -- text put in front of the root path on every printed line
    """
    # Parse the XML and go straight to its root element
    root = ET.parse(stream).getroot()

    # The root path is "<prefix>//<root tag without namespace>"
    rootPath = "//".join((prefix, removeNS(root.tag)))

    # Linearize the whole tree from the root
    linearize(root, rootPath)
# Entry point : each argument is a file
def main(args) :
    """Flatten every XML file named in `args`, or standard input if none.

    args -- list of file paths (the command line arguments)

    When several files are given, each output line is prefixed with the
    path of the file it comes from, followed by ':'.
    """
    # If we process several files, prefix each one with its path
    severalFiles = len(args) > 1

    # Loop on files
    for filename in args :
        prefix = filename + ":" if severalFiles else ""

        # Binary mode hands the raw bytes to the XML parser ; the 'with'
        # block closes the file even if parsing fails
        with open(filename, "rb") as stream :
            process(stream, prefix)

    # No input file ? => Process std input
    if len(args) == 0 :
        # Python 3 exposes the raw byte stream as sys.stdin.buffer ;
        # Python 2 has no such attribute and sys.stdin is used directly
        process(getattr(sys.stdin, "buffer", sys.stdin), "")

# Only run when executed as a script, not when imported
if __name__ == "__main__" :
    main(sys.argv[1:])
|
Usage
> flatten-xml.py <file1.xml> <file2.xml> ...
If no argument is provided, the XML stream is read from standard input. If several files are provided, the path of the current file is used as a prefix on each line.
Example
Here is a sample XML file
<?xml version="1.0"?>
<root>
<list>
<item>Value 1</item>
<item>Value 2</item>
<item>Value 3</item>
<item>Value 4</item>
</list>
<foo bar="barbar" attr="attr-value" >Some text</foo>
<longtext>
Blab bla bla bla
bla bli bla
</longtext>
</root>
And the corresponding result:
//root
//root/list
//root/list/item=Value 1
//root/list/item[2]=Value 2
//root/list/item[3]=Value 3
//root/list/item[4]=Value 4
//root/foo=Some text
//root/foo/@bar=barbar
//root/foo/@attr=attr-value
//root/longtext[line 1]=Blab bla bla bla
//root/longtext[line 2]= bla bli bla
Note that multiline text values are prefixed with a line number. Also, sibling elements with the same tag name are numbered.
Known bugs
- Namespaces are ignored (stripped)
- The leading and trailing spaces of text values are stripped