# Copyright (C) Boris Kats 2011 - 2012.
import builtins
from datetime import date
from datetime import datetime as dt
def make_date(d):
    """Convert a loosely formatted date string into a ``datetime.date``.

    The layout is guessed from the string itself:

    * the separator is ``-`` or, when no ``-`` is present, ``/``;
    * an ISO style time part (``2012-02-07T00:00:00``) is discarded;
    * an alphabetic middle field is a month name, read as ``day-month-year``
      (``01-FEB-12``; three letters -> ``%b``, otherwise ``%B``);
    * a numeric middle field <= 12 is a month: ``year-month-day`` when the
      first field has four digits, ``day-month-year`` otherwise;
    * a numeric middle field > 12 is a day: ``month/day/year``.

    A year field is read as ``%Y`` when it has four characters, else ``%y``.
    Note that an ambiguous string such as ``02-03-2012`` is therefore read
    as day-month-year (2 March 2012).

    Parameters:
        d: the date string.

    Returns:
        The parsed ``datetime.date``.

    Raises:
        ValueError: the string does not have exactly three components or
            does not match the guessed format.
    """
    # Drop the time part of an ISO date-time.  Only a 'T' that directly
    # follows a digit separates date and time; a 'T' inside a month name
    # ("01-OCT-12", "15-AUGUST-2011") belongs to the date and must be kept.
    index = d.find('T')
    while index > 0 and not d[index - 1].isdigit():
        index = d.find('T', index + 1)
    if index != -1:
        d = d[:index]

    separator = '-'
    parts = d.split(separator)
    if len(parts) == 1:
        separator = '/'
        parts = d.split(separator)
    if len(parts) != 3:
        raise ValueError('date should have three components: ' + repr(d))

    year_f = '%Y' if len(parts[2]) == 4 else '%y'   # format of a trailing year
    if parts[1].isalpha():
        # day-MONTHNAME-year
        first_f = '%d'
        middle_f = '%b' if len(parts[1]) == 3 else '%B'
        last_f = year_f
    elif int(parts[1]) <= 12:
        # The middle field can be a month: ISO order when the year leads,
        # day-month-year otherwise.
        middle_f = '%m'
        if len(parts[0]) == 4:
            first_f = '%Y'
            last_f = '%d'
        else:
            first_f = '%d'
            last_f = year_f
    else:
        # The middle field cannot be a month, so it is the day: month/day/year.
        first_f = '%m'
        middle_f = '%d'
        last_f = year_f

    d_format = separator.join((first_f, middle_f, last_f))
    return dt.strptime(d, d_format).date()
def make_bool(boolean):
    """Convert the text of an xml node into a bool.

    A purely alphabetic string is True only when it spells "true" (in any
    letter case).  Any other string is True unless it is exactly '0'.
    """
    if not boolean.isalpha():
        return boolean != '0'
    return boolean.upper() == 'TRUE'
# Primitive type name -> callable that turns the text of an xml node into a value.
__builtinTypes__ = {
    'int': builtins.int,
    'long': builtins.int,
    'float': builtins.float,
    'bool': make_bool,
    'str': builtins.str,
    'double': builtins.float,
    'date': make_date,
    'string': builtins.str,
}
# Names of the container types an xml node may declare in its "type" attribute.
__nodeTypes__ = ['list', 'dict', 'set', 'deque', 'frozenset', 'tuple']
# Every type name accepted in a "type" attribute: containers plus primitives.
__expectedTypes__ = list(set(__nodeTypes__).union(__builtinTypes__))
__hashableTypes__ = ['str', 'string', 'date']
# Primitive type name -> type code of the array module.
__translateArrays__ = {"int": 'b', "long": 'I', "float": 'f'}
import collections
# Characters with a special meaning in xml and the entity that replaces each.
# "&" has to stay first: escaping it later would re-escape the "&" of the
# entities that were just produced.
__xmlspecial__ = collections.OrderedDict(
    [("&", "&amp;"), ("<", "&lt;"), (">", "&gt;"), ('"', "&quot;")])


def strToXMLstr(word, table=__xmlspecial__):
    """Return word with every key of table replaced by its value.

    The replacements are applied one after another in the iteration order
    of table.  With the default table this escapes the xml special
    characters; with a user supplied table it works as a general text
    translator.

    Parameters:
        word: the text to translate.
        table: mapping from the text to find to the text to insert.
    """
    for plain, replacement in table.items():
        word = word.replace(plain, replacement)
    return word


def XMLstrTostr(word, table=__xmlspecial__):
    """Return word with every value of table replaced by its key.

    This is the inverse of strToXMLstr, so the replacements are undone in
    the reverse order; otherwise "&amp;lt;" (an escaped "&lt;") would first
    become "&lt;" and then, wrongly, "<".

    Parameters:
        word: the text to translate back.
        table: the mapping that was used to produce word.
    """
    for plain, replacement in reversed(list(table.items())):
        word = word.replace(replacement, plain)
    return word
import sys, string
from xml.dom import minidom, Node
import io
from types import *
import types
import array
from collections import deque
from keyword import iskeyword as iskeyword
class AnyXmlException(Exception):
    """Raised when the input xml does not follow the rules anyxml expects.

    Attributes:
        what: the human readable description of the problem.
    """

    def __init__(self, message):
        # Initialise the base class as well, so that args and str() carry
        # the message explicitly.
        super().__init__(message)
        self.what = message
class anyxml(object):
    '''Object that populates itself, recursively, from an xml element.

    Several attempts have been made to write a generic Python class that
    builds itself from xml; none of them succeeded completely, for one
    reason: the data in plain xml nodes are just strings and carry no
    information about their own types.  For example, in
        <Address><HomeNumber>27</HomeNumber><Street>Mystreet</Street>...</Address>
    both HomeNumber and Street are plain strings.  It is relatively simple
    to reflect xml into a class whose members are all strings, but one
    problem remains: what is the type of the Address member?  Is it a
    namedtuple, dict, set or list?  Such a class is almost unusable in a
    user application.  To produce a real Python object from xml the types
    of its members have to be retrieved.  There are several sources of that
    information and the most generic one is the xml itself.

    The types of primitives are deduced from their string representation:
        "24" => int; "3.75" => float; "Mystreet" => str; "True" => bool;
        "2012-01-12" => date.
    Additionally an xml tag can have attributes which carry information.
    For example the xml above can be written as
        <Address type="tuple"><HomeNumber>24</HomeNumber><Street>Mystreet</Street>...</Address>
    In that case the resulting object has a namedtuple Address with members
    HomeNumber, Street, ..., each of which is an instance of the deduced
    type and ready to use.  A couple of other samples:
        <myVector type="list"><element1>0.1</element1> ...
            <elementN>0.9</elementN></myVector>
    comes out as myVector, a list of floats;
        <myMap type="dict"><Month1>0.25</Month1><Month2>0.35</Month2>...</myMap>
    is translated into myMap, an ordered dictionary from str to float.
    Alternatively, with type="tuple", myMap becomes a named tuple (or a
    simple tuple if the tag names are not unique).  Names of tuple and dict
    members have to start with a letter.

    Almost all known xml generators can produce "typed" xml; however those
    types may carry a "namespace" prefix, which makes the problem more
    complicated.  The class accepts any typed xml under one strict (and not
    unreasonable) rule: every member that holds more than one child element
    has to be bracketed with the appropriate type attribute:
        <member>I am string</member>
    or
        <myVector type="set"><element1>0.5</element1><element1>0.6</element1> ...
            <elementN>0.7</elementN></myVector>
    The variables member (str) and myVector (set) will appear in the
    resulting object.  Tag names of the elements of containers are
    irrelevant: the elements are nameless and are accessed with the []
    operator.  Sequences with mixed element types are accepted in pythonic
    style.

    One very important issue is the "namespace" in xml.  Often the xml comes
    from a third party application with its own namespaces, and it is not
    practical to deal with unknown attributes of xml in this class.  The
    problem can be addressed by pre-processing the input xml:
      1. make a dictionary that translates namespaced types into regular
         ones: translator = {"ns:vector": "list", "ns:struct": "tuple", ...}
      2. read the xml into a single string and translate it with
         strToXMLstr(xml, table=translator)
      3. feed the minidom parser with that string instead of a file
         (use doc = minidom.parseString(data)).
    Knowing in advance the names of the variables of the resulting object
    (just inspect the input xml) and how they should appear in it, one can
    add entries such as "<my_var>": '<my_var type="dict">' to the translator
    dictionary.

    As a real life sample the xml of the US Department of the Treasury is
    used to provide the US Treasury yield curve.  The xml itself, fetched
    from a url, is in terrible condition: meaningless names, all kinds of
    date formats ("02-03-2012", "01-FEB-12", "01/23/12", "11/28/06") used
    simultaneously, etc.  However, with a little help from pre-processing
    it is possible to convert it into a reasonable Python object.  Actually
    there are two different urls with two xml schemas and in both cases it
    is possible to retrieve the data.  Run the application as
        python anyxml.py 02/07/2012
    or with 02/07/2011 for a historic curve.

    Finally: the resulting instance is an anyxml, but it contains just
    natural Python type members, with one exception: named tuples are user
    defined classes whose __name__ is "namedtuple".

    To inspect the content of the resulting object one can use the function
    "total_size" (in verbose mode) by Raymond Hettinger from
    http://code.activestate.com/recipes/577504/ or the function objwalk by
    Yaniv Aknin from http://code.activestate.com/recipes/577982/.

    Before using the class make sure that the input xml is valid: try to
    open it in a browser.

    Every make_* method returns a pair (value, alias); alias is None except
    for a wrapper element, where it is the tag name of the wrapped child
    that should name the value instead of the wrapper's own tag name.
    '''

    def __init__(self, root):
        """Create one attribute for each element child of the dom node root."""
        # Initially the instance does not contain any member attributes.
        super().__init__()
        self.buildFromXmlFromList(root)

    def nodecount(self, root, justElements=True):
        """Count the children of root: element children only, or all of
        them when justElements is false."""
        if not justElements:
            return len(root.childNodes)
        return sum(1 for child in root.childNodes
                   if child.nodeType == Node.ELEMENT_NODE)

    def make_list(self, root):
        """Return ([value of each element child], None); children without
        a value are skipped."""
        value = []
        for node in root.childNodes:
            if node.nodeType != Node.ELEMENT_NODE:
                continue
            item = self.make(node)[0]
            if item is not None:
                value.append(item)
        return value, None

    def make_set(self, root):
        """Return (set of the values of the element children, None)."""
        return set(self.make_list(root)[0]), None

    def make_frozenset(self, root):
        """Return (frozenset of the values of the element children, None)."""
        return frozenset(self.make_list(root)[0]), None

    def make_deque(self, root):
        """Return (deque of the values of the element children, None)."""
        return deque(self.make_list(root)[0]), None

    def make_dict(self, root):
        """Return (OrderedDict from child name to child value, None).

        The dictionary is built from the result of make_tuple; when that is
        a plain (unnamed) tuple there are no keys and the dictionary is
        empty.
        """
        tpl = self.make_tuple(root)[0]
        item = collections.OrderedDict()
        if hasattr(tpl, '_asdict'):
            item.update(zip(tpl._fields, tpl))
        return item, None

    def make_tuple(self, root):
        """Return (tuple of the values of the element children, None).

        The result is a namedtuple whose fields are the tag names of the
        children.  In a name ':' is replaced by '_' and the first letter of
        a Python keyword is capitalised.  When only one distinct name is
        present, or the names are not unique, a plain tuple is returned.

        Raises:
            AnyXmlException: a name does not start with a letter.
        """
        items = []
        names = []
        for node in root.childNodes:
            if node.nodeType != Node.ELEMENT_NODE:
                continue
            value, alias = self.make(node)
            if value is None:
                continue
            items.append(value)
            nodeName = alias if alias else node.nodeName
            if not nodeName[0].isalpha():
                raise AnyXmlException('Badly formed xml: names in tuple or struct should be valid ')
            nodeName = nodeName.replace(':', '_')
            if iskeyword(nodeName):
                nodeName = nodeName[:1].upper() + nodeName[1:]
            if nodeName not in names:
                names.append(nodeName)
        # names holds distinct names only, so a length different from the
        # number of items means that some tag name was repeated.
        if len(names) == 1 or len(names) != len(items):
            return tuple(items), None   # this is an unnamed tuple
        Temp = collections.namedtuple('namedtuple', names)
        return Temp(*items), None

    def make_primitive(self, root):
        """Return the value of a node that has at most one element child.

        A node with text content (or a "value" attribute) gives
        (converted content, None).  A node without content that wraps an
        element gives (value of that element, tag name of that element).
        An empty node gives (None, None).  Content that looks like a date
        but cannot be converted is kept as a string.
        """
        assert self.nodecount(root) <= 1
        content = self.getNodeContent(root)
        if content is None:
            # It is just a wrapper, or empty (<data_unavailable/>).  Use the
            # first element child: in pretty-printed xml childNodes[0] is a
            # whitespace text node, not the wrapped element.
            for node in root.childNodes:
                if node.nodeType == Node.ELEMENT_NODE:
                    return self.make(node)[0], node.nodeName
            return None, None
        nodeType = self.elementPattern(content)
        try:
            return self.convert(nodeType, content), None
        except (ValueError, IndexError):
            # The pattern was only a guess (e.g. "12-34-5678" looks like a
            # date): keep the raw text instead of aborting the whole parse.
            return content, None

    def getNodeContent(self, root):
        """Return the stripped text of root, or None when there is none.

        For a node without children the text is taken from its "value"
        attribute; otherwise from the first child, provided that it is a
        text node.
        """
        if len(root.childNodes) == 0:
            # Nodes that are not elements have no attribute map at all.
            if not hasattr(root.attributes, 'get'):
                return None
            attribute = root.attributes.get('value', None)
            if attribute is None:
                return None
            value = attribute.value.strip()
        else:
            node = root.childNodes[0]
            if node.nodeType != Node.TEXT_NODE:
                return None
            value = node.nodeValue.strip()
        return value if value else None

    def elementPattern(self, content):
        """Guess the primitive type name of the string content.

        Returns 'bool' for "true"/"false" in any letter case, 'date' for a
        string of 8 to 10 characters (an ISO time part after 'T' is not
        counted) made of three components separated by '-' or '/', 'int' or
        'float' for strings those types accept, and 'str' otherwise.
        """
        if content.isalpha():
            return 'bool' if content.upper() in ('TRUE', 'FALSE') else 'str'
        # Ignore the time part of an ISO date-time.  Only a 'T' that follows
        # a digit is a separator: the 'T' of "01-OCT-12" is part of the date.
        index = content.find('T')
        while index > 0 and not content[index - 1].isdigit():
            index = content.find('T', index + 1)
        datePart = content if index == -1 else content[:index]
        components = datePart.split('-')
        if len(components) == 1:
            components = datePart.split('/')
        if 8 <= len(datePart) <= 10 and len(components) == 3:
            return 'date'
        for caster, typeName in ((int, 'int'), (float, 'float')):
            try:
                caster(content)
            except ValueError:
                continue
            return typeName
        return 'str'

    def make(self, root):
        """Build the value of the node root and return (value, alias).

        A node with at most one element child is a primitive or a wrapper;
        any other node is a container and its "type" attribute selects the
        make_<type> method.

        Raises:
            AnyXmlException: a container node has no "type" attribute or
                its type is unknown.
        """
        if self.nodecount(root) <= 1:
            return self.make_primitive(root)
        typeAttribute = root.attributes.get('type')
        if typeAttribute is None:
            raise AnyXmlException('Badly formed xml: node has to have attribute type ' + root.nodeName)
        nodeType = typeAttribute.value
        builder = getattr(self, 'make_' + nodeType, None)
        if builder is None:
            raise AnyXmlException('Badly formed xml: unknown type: ' + nodeType + ' in node ' + root.nodeName)
        return builder(root)

    def verifyNode(self, root):
        """Return the "type" attribute of the element root.

        This is a validation helper; the class itself does not call it.

        Raises:
            AnyXmlException: root is not an element, has no "type"
                attribute, or the type is not one of __expectedTypes__.
        """
        if root.nodeType != Node.ELEMENT_NODE:
            raise AnyXmlException('Badly formed xml: root has to be ELEMENT_NODE')
        typeAttribute = root.attributes.get('type', None)
        if typeAttribute is None:
            raise AnyXmlException('Badly formed xml: node has to have attribute type ' + root.nodeName)
        rootType = typeAttribute.value
        if rootType is None:
            raise AnyXmlException('Badly formed xml: root attribute type cannot be empty')
        if rootType not in __expectedTypes__:
            raise AnyXmlException('Badly formed xml: unknown type: ' + rootType + ' in node ' + root.nodeName)
        return rootType

    def buildFromXmlFromList(self, root):
        """Set one attribute on self for each element child of root.

        The attribute is named after the tag of the child (after the
        wrapped tag for a wrapper); the first letter of a Python keyword is
        capitalised.  Children without a value are skipped.
        """
        for node in root.childNodes:
            if node.nodeType != Node.ELEMENT_NODE:
                continue
            item, alias = self.make(node)
            if item is None:
                continue
            nodeName = alias if alias else node.nodeName.strip()
            if iskeyword(nodeName):
                nodeName = nodeName[:1].upper() + nodeName[1:]
            setattr(self, nodeName, item)

    def convert(self, typeof, input):
        """Convert the string input with the converter registered for the
        primitive type name typeof in __builtinTypes__.

        Raises:
            AnyXmlException: typeof is not a known primitive type name.
        """
        if typeof in __builtinTypes__:
            return __builtinTypes__[typeof](input)
        raise AnyXmlException('Badly formed xml: unknown primitive type ' + typeof)
if __name__ =='__main__':
    # Sample application: fetch the US Treasury yield curve xml, add the
    # type attributes anyxml needs by plain text substitution, build an
    # anyxml object from it and print the curve for the date given on the
    # command line.
    import sys
    import time
    import urllib.request
    args = sys.argv[1:]
    if len(args) != 1:
        print ('usage: python anyxml.py date[mm/dd/yyyy], where date is valid business date')
        sys.exit(-1)
    inputDate = dt.strptime(args[0], '%m/%d/%Y').date()
    today = date.today()
    if inputDate > today:
        print('Error:Input date should be in the past or today after 6:00PM')
        sys.exit(-1)
    usingHistory = False
    url ='http://www.treasury.gov/resource-center/data-chart-center/interest-rates/Datasets/yield.xml'
    url_2 ='http://www.treasury.gov/resource-center/data-chart-center/interest-rates/pages/XmlView.aspx?data=yieldyear&year='
    # A date outside the current month is looked up in the per-year feed,
    # which has another xml schema.
    if inputDate.year != today.year or inputDate.month != today.month:
        usingHistory = True
        url = url_2 + str(inputDate.year)
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    t1 = time.perf_counter()
    with urllib.request.urlopen(url) as urlfile:
        data = urlfile.read().decode("utf8")
    if not usingHistory:
        translationTable = {'<LIST_G_WEEK_OF_MONTH>':'<LIST_G_WEEK_OF_MONTH type="list">',
                            '<G_WEEK_OF_MONTH>':'<G_WEEK_OF_MONTH type="tuple">',
                            '<LIST_G_NEW_DATE>':'<LIST_G_NEW_DATE><NEW_DATES type="list">',
                            '<G_NEW_DATE>':'<G_NEW_DATE type="tuple">',
                            '<LIST_G_BC_CAT>':'<LIST_G_BC_CAT type="tuple">',
                            '</LIST_G_NEW_DATE>':'</NEW_DATES></LIST_G_NEW_DATE>',
                            '<G_BC_CAT>':'<G_BC_CAT type="dict">'}
    else:
        translationTable = {'<pre>':'<pre><data type="tuple">',
                            '</pre>':'</entries></data></pre>',
                            '<link rel="self" title="DailyTreasuryYieldCurveRateData" href="DailyTreasuryYieldCurveRateData" xmlns="http://www.w3.org/2005/Atom" />':'<link rel="self" title="DailyTreasuryYieldCurveRateData" href="DailyTreasuryYieldCurveRateData" xmlns="http://www.w3.org/2005/Atom" /><entries type="list">',
                            '<entry xmlns="http://www.w3.org/2005/Atom">':'<entry xmlns="http://www.w3.org/2005/Atom" type="tuple">',
                            '<m:properties xmlns:m="http://schemas.microsoft.com/ado/2007/08/dataservices/metadata">':'<m:properties xmlns:m="http://schemas.microsoft.com/ado/2007/08/dataservices/metadata" type="dict">'}
    t2 = time.perf_counter()
    print('read data from url elapsed time:',t2-t1)
    data = strToXMLstr(data,translationTable)
    doc = minidom.parseString(data)
    root = doc.documentElement
    parseStart = time.perf_counter()
    try:
        document = anyxml(root)
        curveFound = False
        if not usingHistory:
            for week_of_month in document.LIST_G_WEEK_OF_MONTH:
                for date_of_week in week_of_month.NEW_DATES:
                    if date_of_week.BID_CURVE_DATE == inputDate:
                        curveFound = True
                        print("{} {} {}".format('For',inputDate,'US Treasure Curve is:'))
                        print('Tenor Yield')
                        for k,v in date_of_week.G_BC_CAT.items():
                            # keys look like BC_<tenor>: drop the prefix
                            print(k[3:],"{0}{1}".format(v,'%'))
                        break
            if not curveFound:
                print('For',inputDate,'US Treasure Curve not found: is it business date?')
                print('However, Curves are available for dates:')
                for week_of_month in document.LIST_G_WEEK_OF_MONTH:
                    for date_of_week in week_of_month.NEW_DATES:
                        print(date_of_week.BID_CURVE_DATE)
        else:
            for entry in document.data.entries:
                tbl = entry.m_properties
                if tbl['d_NEW_DATE'] == inputDate:
                    curveFound = True
                    print("{} {} {}".format('For',inputDate,'US Treasure Curve is:'))
                    print('Tenor Yield')
                    for k,v in tbl.items():
                        if k.find('_BC_') != -1:
                            # keys look like d_BC_<tenor>: drop the prefix
                            print(k[5:],"{0}{1}".format(v,'%'))
                    break
            if not curveFound:
                print('For',inputDate,'US Treasure Curve not found: is it business date?')
    except AnyXmlException as err:
        print(err.what)
    # Measured after the try block, so the figure is an elapsed time even
    # when parsing stopped with an AnyXmlException.
    print("parse document elapsed time:",time.perf_counter()-parseStart)
Diff to Previous Revision
--- revision 5 2012-02-09 05:30:58
+++ revision 6 2012-02-09 07:33:28
@@ -4,6 +4,9 @@
from datetime import datetime as dt
def make_date(d):
+ index = d.find('T')
+ if index != -1:
+ d = d[:index]
parts = d.split('-')
dt_type='-'
if(len(parts)==1):
@@ -14,7 +17,7 @@
first_f ='%d'
second_f = '%Y' if len(parts[2])==4 else '%y'
else:
- if int(parts[1])<12:
+ if int(parts[1])<=12:
month_f = '%m'
first_f = '%Y' if len(parts[0])==4 else '%d'
if ( first_f == '%Y'):
@@ -124,7 +127,8 @@
Yield Curve. The xml itself, fetched from url, is in terrible condition: meaningless names,
all kind of date formats("02-03-2012","01-FEB-12","01/23/12","11/28/06" are used simultaneously, etc ...
However, with little help of pre-processing, it is possible to convert that garbage into reasonable
- python class. Run appliation as python anyxml.py 02/07/2012.
+ python class. Actually, there are two different urls with two xml schema and in both cases it is possible
+ to retrieve datumm. Run appliation as python anyxml.py 02/07/2012 or 02/07/2011 for historic curve
Finally: the instance of resultant class is anyxml, but class iself will contain just
natural python' type members with some exception: the named tuples are user defined classes
and __class__ of those will be "anyxml.namedtuple", but __name__ is just "namedtuple".
@@ -197,6 +201,8 @@
nodeName = item[1]
if not nodeName[0].isalpha():
raise AnyXmlException('Badly formed xml: names in tuple or struct should be valid ')
+ if nodeName.find(':') !=-1:
+ nodeName = nodeName.replace(':','_')
if iskeyword(nodeName):
nodeName = nodeName[:1].upper()+ nodeName[1:]
if len(names)==0 or not nodeName in names:
@@ -245,10 +251,15 @@
return 'bool'
return 'str'
else:
- components = content.split('-')
+ index = content.find('T')
+ if index != -1:
+ input = content[:index]
+ else:
+ input = content
+ components = input.split('-')
if(len(components)==1):
- components = content.split('/')
- if (8<=len(content)<= 10) and len(components) == 3:
+ components = input.split('/')
+ if (8<=len(input)<= 10) and len(components) == 3:
return 'date'
else:
try:
@@ -320,13 +331,6 @@
if (inputDate.year != today.year or inputDate.month !=today.month):
usingHistory = True
url = url_2+ str(inputDate.year)
-# print(url)
- if usingHistory :
- print('Historical datum are located at diffrent url with other xml schema')
- print('For time being use this application to retrive Treasure Curve for')
- print('any business date, which is in the same month as current month and')
- print('input date should be in the past(however in the same month) or today after 6:00PM')
- sys.exit(-1)
t1 = time.clock()
urlfile = urllib.request.urlopen(url)
data = urlfile.read().decode("utf8")
@@ -341,36 +345,52 @@
'<G_BC_CAT>':'<G_BC_CAT type="dict">'}
else:
translationTable = {'<pre>':'<pre><data type="tuple">',
- '</pre>':'</data></pre>'}
-
-
+ '</pre>':'</entries></data></pre>',
+ '<link rel="self" title="DailyTreasuryYieldCurveRateData" href="DailyTreasuryYieldCurveRateData" xmlns="http://www.w3.org/2005/Atom" />':'<link rel="self" title="DailyTreasuryYieldCurveRateData" href="DailyTreasuryYieldCurveRateData" xmlns="http://www.w3.org/2005/Atom" /><entries type="list">',
+ '<entry xmlns="http://www.w3.org/2005/Atom">':'<entry xmlns="http://www.w3.org/2005/Atom" type="tuple">',
+ '<m:properties xmlns:m="http://schemas.microsoft.com/ado/2007/08/dataservices/metadata">':'<m:properties xmlns:m="http://schemas.microsoft.com/ado/2007/08/dataservices/metadata" type="dict">'}
t2 = time.clock()
print('read data from url elapsed time:',t2-t1)
data = strToXMLstr(data,translationTable)
- debugFile = open('modtreasure.xml','w')
- debugFile.write('%s\n' % data)
- debugFile.close()
+# debugFile = open('modtreasure.xml','w')
+# debugFile.write('%s\n' % data)
+# debugFile.close()
doc = minidom.parseString(data)
root = doc.documentElement
try:
t1 = time.clock()
document = anyxml(root)
- curveFound = False
- for week_of_month in document.LIST_G_WEEK_OF_MONTH:
- for date_of_week in week_of_month.NEW_DATES:
- if date_of_week.BID_CURVE_DATE == inputDate :
- curveFound = True
- print("{} {} {}".format('For',date_of_week.BID_CURVE_DATE,'US Treasure Curve is:'))
- print('Tenor Yield')
- for k,v in date_of_week.G_BC_CAT.items():
- print(k[3:],"{0}{1}".format(v,'%'))
- break
- if not curveFound:
- print('For',inputDate,'US Treasure Curve not found: is it business date?')
- print('However, Curves are available for dates:')
+ if (not usingHistory):
+ curveFound = False
for week_of_month in document.LIST_G_WEEK_OF_MONTH:
for date_of_week in week_of_month.NEW_DATES:
- print(date_of_week.BID_CURVE_DATE)
+ if date_of_week.BID_CURVE_DATE == inputDate :
+ curveFound = True
+ print("{} {} {}".format('For',inputDate,'US Treasure Curve is:'))
+ print('Tenor Yield')
+ for k,v in date_of_week.G_BC_CAT.items():
+ print(k[3:],"{0}{1}".format(v,'%'))
+ break
+ if not curveFound:
+ print('For',inputDate,'US Treasure Curve not found: is it business date?')
+ print('However, Curves are available for dates:')
+ for week_of_month in document.LIST_G_WEEK_OF_MONTH:
+ for date_of_week in week_of_month.NEW_DATES:
+ print(date_of_week.BID_CURVE_DATE)
+ else:
+ curveFound = False
+ for entry in document.data.entries:
+ tbl = entry.m_properties
+ if tbl['d_NEW_DATE'] == inputDate:
+ curveFound = True
+ print("{} {} {}".format('For',inputDate,'US Treasure Curve is:'))
+ print('Tenor Yield')
+ for k,v in tbl.items():
+ if k.find('_BC_') !=-1:
+ print(k[5:],"{0}{1}".format(v,'%'))
+ break;
+ if not curveFound:
+ print('For',inputDate,'US Treasure Curve not found: is it business date?')
t1 = time.clock()-t1;
except AnyXmlException as err:
print(err.what)