I needed a quick hack to extract some data from MS InfoPath files stored on a file server. Infopath data files are stored as XML so it was fairly straight forward to extract the required information. The outputed file is report.csv and a log file is called reportlog.txt. It reads settings from config.ini.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | from xml.dom import minidom
from os import walk
import ConfigParser
import csv
from string import split
from string import strip
from time import gmtime, localtime, asctime
from os.path import join, dirname, exists, abspath
#log errors stuff
from traceback import format_exception
from sys import exc_info
#this is the log file - made global so all the subroutines can see it
global mylog
#small app to suck InfoPath data files into Excel csv file
def get_fields():
global mylog
mylog.writelines(".. parsing config \n")
fields =[]
cp = ConfigParser.ConfigParser()
cp.read("config.ini")
fields = split(cp.get("InfoPath","fields"),",")
path = cp.get("InfoPath","path")
return fields, path
def read_file(fields, path, writer):
global mylog
#read and write out the files
for root, dirs, files in walk(path):
for filename in files:
if ".xml" in filename:
abspath = join(root, filename)
try:
mylog.write("\n" + abspath + "\n")
f = open(abspath,'r')
dom = minidom.parseString(f.read())
row = []
for field in fields:
try:
data = dom.getElementsByTagName(strip(field))[0].childNodes[0].data
data.encode('utf-8') #put your code set here
except:
mylog.write("...error on " + field + "\n")
mylog.write(''.join(format_exception(*exc_info())))
data = " "
row.append(data)
writer.writerow(row)
f.close()
except:
txt = ''.join(format_exception(*exc_info()))
mylog.write(txt + "\n")
def create_log ():
global mylog
logname = "reportlog.txt"
time_now = asctime(localtime())
mylog = open(logname, 'w')
mylog.writelines(time_now + ".. starting \n")
return
if __name__=="__main__":
#create the log file
create_log()
#get the settings from config.ini in same dir
fields, path = get_fields()
#open csv and write out header fields
writer = csv.writer(open("report.csv", "wb"))
writer.writerow(fields)
#read files and output Excel csv
read_file(fields, path, writer)
|
The script uses an INI file to store the search path and fields that will be extracted. The format of the INI file is something like this:
[InfoPath]
fields= my:Issue, my:LOB, my:SARepresentative, my:Assessor, my:Status, my:Type path = c:\temp\InfoPath
I used py2exe to generate an executable for this using python 2.4 on windows. You could quite easily open up the resulting file in Excel directly.
Anthony http://xminc.com/mt/