Welcome, guest | Sign In | My Account | Store | Cart

I needed a quick hack to extract some data from MS InfoPath files stored on a file server. Infopath data files are stored as XML so it was fairly straight forward to extract the required information. The outputed file is report.csv and a log file is called reportlog.txt. It reads settings from config.ini.

Python, 87 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from xml.dom import minidom
from os import walk
import ConfigParser
import csv
from string import split
from string import strip
from time import gmtime, localtime, asctime
from os.path import join, dirname, exists, abspath

#log errors stuff
from traceback import format_exception
from sys import exc_info


#this is the log file - made global so all the subroutines can see it
global mylog

#small app to suck InfoPath data files into Excel csv file


def get_fields():
    global mylog
    mylog.writelines(".. parsing config \n")
    fields =[]
    cp = ConfigParser.ConfigParser()
    cp.read("config.ini")
    fields = split(cp.get("InfoPath","fields"),",")
    path = cp.get("InfoPath","path")
 

    return fields, path


def read_file(fields, path, writer):
    global mylog
    #read and write out the files
    for root, dirs, files in walk(path):
        for filename in files:
            if ".xml" in filename:
                abspath = join(root, filename)
                try:
		    mylog.write("\n" + abspath + "\n")
                    f = open(abspath,'r')
                    dom = minidom.parseString(f.read())
                    row = []            

                    for field in fields:
			try:
                            data = dom.getElementsByTagName(strip(field))[0].childNodes[0].data

			    data.encode('utf-8') #put your code set here
			except:
                            mylog.write("...error on " + field + "\n")
                            mylog.write(''.join(format_exception(*exc_info()))) 

			    data = " "

                        row.append(data)


                    writer.writerow(row)
                    f.close()
                except:
                    txt =  ''.join(format_exception(*exc_info()))
                    mylog.write(txt + "\n") 

                
def create_log ():
    global mylog
    logname = "reportlog.txt"  
    time_now = asctime(localtime())
    
    mylog = open(logname, 'w')
    mylog.writelines(time_now + ".. starting \n")
    return	

if __name__=="__main__":

    #create the log file
    create_log()
    #get the settings from config.ini in same dir
    fields, path = get_fields()
    #open csv and write out header fields
    writer = csv.writer(open("report.csv", "wb"))
    writer.writerow(fields)
    #read files and output Excel csv
    read_file(fields, path, writer)

The script uses an INI file to store the search path and fields that will be extracted. The format of the INI file is something like this:

[InfoPath]

fields= my:Issue, my:LOB, my:SARepresentative, my:Assessor, my:Status, my:Type path = c:\temp\InfoPath

I used py2exe to generate an executable for this using python 2.4 on windows. You could quite easily open up the resulting file in Excel directly.

Anthony http://xminc.com/mt/