Recipe 578032 revision 6 « ActiveState Code

# Copyright (C)  Boris Kats 2011 - 2012.
import builtins
from datetime import date
from datetime import datetime as dt

def make_date(d):
    """Parse a date string whose layout is guessed from its own fields.

    Everything from the first 'T' onward (an ISO style time part) is dropped.
    The rest is split on '-' or, when it contains no '-', on '/', and has to
    yield exactly three fields. The layout is then chosen as follows:

    * alphabetic middle field     -> day, month name, year
      ('%b' for a three letter name, '%B' otherwise)
    * numeric middle field <= 12  -> year, month, day when the first field
      has four characters, otherwise day, month, year
    * numeric middle field > 12   -> month, day, year

    A trailing year is read with '%Y' when it has four characters and with
    '%y' otherwise.

    Returns a datetime.date.
    Raises ValueError when the text does not have exactly three fields, when
    the middle field is neither alphabetic nor an integer, or when strptime
    rejects the text for the chosen layout.
    """
    index = d.find('T')
    if index != -1:
        d = d[:index]
    dt_type = '-'
    parts = d.split(dt_type)
    if len(parts) == 1:
        dt_type = '/'
        parts = d.split(dt_type)
    if len(parts) != 3:
        # Fail with the same exception type strptime uses for bad dates
        # instead of an IndexError from the field lookups below.
        raise ValueError('unrecognised date: %r' % (d,))
    first, middle, last = parts
    trailing_year = '%Y' if len(last) == 4 else '%y'
    if middle.isalpha():
        fields = ['%d', '%b' if len(middle) == 3 else '%B', trailing_year]
    elif int(middle) <= 12:
        if len(first) == 4:
            fields = ['%Y', '%m', '%d']
        else:
            fields = ['%d', '%m', trailing_year]
    else:
        # The middle field cannot be a month, so it has to be the day.
        fields = ['%m', '%d', trailing_year]
    return dt.strptime(d, dt_type.join(fields)).date()

def make_bool(boolean):
    """Interpret a textual flag as a bool.

    Purely alphabetic text is true only when it spells 'true' in any letter
    case. Any other text is true unless it is exactly '0'.
    """
    if not boolean.isalpha():
        return boolean != '0'
    return boolean.upper() == 'TRUE'

# Converters for leaf values, keyed by type name: each one turns the text of
# an xml node into a python value.
__builtinTypes__ = {
    'int': builtins.int,
    'long': builtins.int,
    'float': builtins.float,
    'bool': make_bool,
    'str': builtins.str,
    'double': builtins.float,
    'date': make_date,
    'string': builtins.str,
}
# Container type names; anyxml has a make_<name> method for each of them.
__nodeTypes__ = ['list', 'dict', 'set', 'deque', 'frozenset', 'tuple']
# Every type name accepted in a "type" attribute (order is not significant).
__expectedTypes__ = list(set(__nodeTypes__).union(__builtinTypes__))
__hashableTypes__ = ['str', 'string', 'date']
# NOTE(review): presumably array.array typecodes per type name; not referenced
# anywhere in the visible source - confirm before relying on it.
__translateArrays__ = {"int": 'b', "long": 'I', "float": 'f'}

import collections

# Escaping table for the characters that are special in xml text. '&' has to
# stay first: escaping it after the others would escape the '&' of the
# entities that were just produced.
__xmlspecial__ = collections.OrderedDict([("&", "&amp;"), ("<", "&lt;"), (">", "&gt;"), ('"', "&quot;")])


def strToXMLstr(word, table=__xmlspecial__):
    '''Return word with every key of table replaced by its value.

    With the default table this escapes & < > " for use in xml text. Any
    other mapping of literal substrings can be passed instead; replacements
    are applied one after another in the iteration order of table.
    '''
    for plain, replacement in table.items():
        word = word.replace(plain, replacement)
    return word


def XMLstrTostr(word, table=__xmlspecial__):
    '''Return word with every value of table replaced by its key.

    This undoes strToXMLstr. The entries are applied in reverse order so that
    with the default table "&amp;" is unescaped last; otherwise text such as
    "&amp;lt;" would be unescaped twice and come out as "<".
    '''
    for plain, replacement in reversed(list(table.items())):
        word = word.replace(replacement, plain)
    return word

import sys, string
from xml.dom import minidom, Node
import io
from types import *
import types
import array
from collections import deque
from keyword import iskeyword as iskeyword

class AnyXmlException(Exception):
    '''Raised when the xml does not have the layout anyxml expects.

    The message passed to the constructor is kept in the attribute "what".
    '''

    def __init__(self, message):
        # A bare "super()" only builds a proxy object and throws it away;
        # the base initialiser has to be called explicitly.
        super().__init__(message)
        self.what = message

class anyxml(object):
    '''A class that populates itself recursively from xml.
       The several attempts were made to develop python class, which can do it
       in generic way none of them succeeded completely for certain reason:
       plain xml node data are just strings and carry no information about their own types.
       For example in xml <Address><HomeNumber>27</HomeNumber><Street>Mystreet</Street>...</Address>
       both HomeNumber and Street values are plain strings. It is relatively simple to reflect xml into
       python class, in which all members of class are strings. However the problem is still present:
       What is the type of Address member of class? Is it namedtuple,dict,set,list? Such class became 
       almost unusable in user application. To produce the real python class from xml the information
       about types of members of class has to be retrieved. The are several sources of that information
       and the most generic way is to extract it from xml itself. The types of primitives can be deduced
       from xml as string representation of primitive. For example:
       "24" will become "int"; "3.75" => "float"; "Mystreet" => "str"; "True" => "bool" and "2012-01-12" => "date".
       Additionally an xml tag can have some properties, which can carry some info. For example the above xml can be
       <Address type="tuple"><HomeNumber>24</HomeNumber><Street>Mystreet</Street>... </Address>.
       In that case, the resultant python class will have a namedtuple Address with members HomeNumber, 
       Street ... and each of those members in turn will be instances of certain type and ready to use.
       The couple of other samples:
       xml: <myVector  type="list"><element1>0.1</element1> ....
                                  <elementN>0.9</elementN></myVector>
       will come out in the python class as myVector, a list of floats;
       xml:  <myMap type="dict"><1Month>0.25</1Month><2Month>0.35</2Month>.... </myMap>
       will be translated into myMap as dictionary of string and float.Alternatively, with type="tuple"
       myMap will became just named tuple ( or simple tuple if tag names are not unique)     
       
       Almost all known xml generators can produce "typed" xml; however these types can have one feature -
       a "namespace" prefix, which makes the problem more complicated.
       
       The class will accept any typed xml with strict rule ( nothing unreasonable):
          All member should be bracketed with appropriate type property:
          <member>I am string</member> or
          <myVector  type="set"><element1 >0.5</element1> <element1 >0.6</element1> ....
                                <elementN>0.7</elementN></myVector>
          The variables "member" (str type) and myVector (set type) will come to python class.
          Tag names for elements of containers are irrelevant: they are nameless and 
          accessible in both languages by [] operator.  
       Sequence type  with mixed types will be accepted in pythonic style. 
       One very important issue is "namespace" in xml.
       Often xml is coming from a third party application with its own namespaces. It is not practical
       to deal with unknown properties of xml in that class. That problem can be addressed with
       pre-processing of input xml as :
         1. make the special dictionary to translate namespaces type into regular ones
            translator = {"ns:vector":"list", "ns:struct":"tuple" ....}
         2. read xml into single string and translate it with function strToXMLstr(xml,table= translator)
         3. feed minidom parser with that string instead of file ( use doc = minidom.parseString(data) ).
       Hopefully, this pre-processing procedure will help. With knowledge in advance names of variables
       of your resultant class ( just inspecting input xml) and the decision how these variables should
       come to resultant class one can add to translator dictionary some entries as:
                           "<my_var>":'<my_var type="dict">'.
       In the real life sample the xml from the US Department of the Treasury is used to provide the US Treasury
       Yield Curve. The xml itself, fetched from url, is in terrible condition: meaningless names,
       all kinds of date formats ("02-03-2012", "01-FEB-12", "01/23/12", "11/28/06") are used simultaneously, etc ...
       However, with a little help from pre-processing, it is possible to convert that garbage into a reasonable
       python class. Actually, there are two different urls with two xml schemas, and in both cases it is possible
       to retrieve the data. Run the application as python anyxml.py 02/07/2012, or 02/07/2011 for a historic curve.
       Finally: the instance of the resultant class is anyxml, but the class itself will contain just
       natural python type members, with one exception: the named tuples are user defined classes
       and __class__ of those will be "anyxml.namedtuple", but __name__ is just "namedtuple".
       To inspect the content of resultant object one can use nice function "total_size" in verbose
       mode by Raymond Hettinger from http://code.activestate.com/recipes/577504/ or 
       function objwalk by Yaniv Aknin from:   http://code.activestate.com/recipes/577982/.
       Before using that class make sure that your input xml is valid: try to
       open it in browser. '''

    def __init__(self,root):
#      Initially class does not contains any member attributes 
          super()
          self.buildFromXmlFromList(root)

    def nodecount(self,root,justElements = True):
        '''Count the children of root.

        Only element children are counted by default; with
        justElements=False every child node is counted, whatever its kind.
        '''
        children = root.childNodes
        if justElements:
            return sum(1 for child in children if child.nodeType == Node.ELEMENT_NODE)
        return len(children)
      
    def make_list(self,root):
       value = []
       for node in root.childNodes:
             if node.nodeType == Node.ELEMENT_NODE:       
                item = self.make(node)
                if (item[0]==None):
                   continue
                value.append(item[0])
       return value,None

 
    def make_set(self,root):
       lst = self.make_list(root)
       return set(lst[0]),None

    def make_frozenset(self,root):
       lst = self.make_list(root)
       return frozenset(lst[0]),None

    def make_deque(self,root):
       lst = self.make_list(root)
       return deque(lst[0]),None

    def make_dict(self,root):
        tpl = self.make_tuple(root)
        tpl = tpl[0]
        item = collections.OrderedDict()
        if hasattr(tpl,'_asdict'):
           names = [key for key in tpl._asdict().keys()]
           count = 0
           for t in tpl:
             item[names[count]] = t
             count +=1
        return item,None
  
    def make_tuple(self,root):
       items = []
       names = []
       for node in root.childNodes:
           if node.nodeType == Node.ELEMENT_NODE:
              nodeName = node.nodeName
              item = self.make(node)
              if (item[0] == None):
                 continue
              items.append(item[0])
              if (item[1]):
                 nodeName = item[1]             
              if not nodeName[0].isalpha():
                 raise AnyXmlException('Badly formed xml: names in tuple or struct should be valid ')
              if nodeName.find(':') !=-1:
                 nodeName = nodeName.replace(':','_') 
              if iskeyword(nodeName):
                 nodeName = nodeName[:1].upper()+ nodeName[1:] 
              if len(names)==0 or not nodeName in names: 
                 names.append(nodeName)
       if len(names) == 1:  # this is unnamed tuple
          return tuple(items)
       if len(names) != len(set(names)):
           raise AnyXmlException('Badly formed xml: names in tuple or struct should be unique')
       tupleNames = ' '.join([name for name in names])
       Temp = collections.namedtuple('namedtuple',tupleNames)
       return Temp(*items),None 
 

    def make_primitive(self,root):
        assert(self.nodecount(root)<=1)
        content = self.getNodeContent(root)
        if content==None:                        # it is just wrapper
           if len(root.childNodes) == 0:
              return None,None                   # <data unavailablbe/>
           node = root.childNodes[0]
#           print('make_primitive:','root',root.nodeName,'node',node.nodeName)
           value = self.make(node)
           return value[0],node.nodeName
        nodeType= self.elementPattern(content)
        return  self.convert(nodeType,content),None

    def getNodeContent(self,root):
        '''Return the stripped text content of root, or None when it has none.

        For a node without children the text is taken from its "value"
        attribute, when it has one. For a node with children the text is
        taken from the first child, provided that child is a text node.
        Text that is empty after stripping counts as missing.
        '''
        children = root.childNodes
        if len(children) == 0:
            attributes = root.attributes
            # Nodes without an attribute map cannot carry a value attribute.
            attr = attributes.get('value', None) if hasattr(attributes, 'get') else None
            if attr is None:
                return None
            text = attr.value
        else:
            first = children[0]
            if first.nodeType != Node.TEXT_NODE:
                return None
            text = first.nodeValue
        text = text.strip()
        return text if len(text) != 0 else None
 
    def elementPattern(self,content):
        '''Guess the primitive type name for the text of a node.

        Returns one of 'bool', 'str', 'date', 'int' or 'float':
          * purely alphabetic text is 'bool' when it spells true/false in
            any letter case, otherwise 'str';
          * text whose part before the first 'T' is 8 to 10 characters long
            and splits on '-' (or else '/') into three fields is 'date',
            provided the outer fields are digits and the middle field is
            digits or letters;
          * anything else is 'int' or 'float' when it parses as such, and
            'str' otherwise.
        '''
        if content.isalpha():
            if content.upper() in ('TRUE', 'FALSE'):
                return 'bool'
            return 'str'
        # Ignore an ISO style time part when looking for a date.
        index = content.find('T')
        datePart = content[:index] if index != -1 else content
        components = datePart.split('-')
        if len(components) == 1:
            components = datePart.split('/')
        if 8 <= len(datePart) <= 10 and len(components) == 3:
            first, middle, last = components
            # The length and separator test alone also matches plain words
            # such as "up-to-date", which cannot be parsed as a date later.
            if first.isdigit() and last.isdigit() and (middle.isdigit() or middle.isalpha()):
                return 'date'
        for typeName, caster in (('int', int), ('float', float)):
            try:
                caster(content)
            except ValueError:
                continue
            return typeName
        return 'str'

    def make(self,root):
        if self.nodecount(root)<=1: 
           item = getattr(self,'make_primitive')(root)
        else:  
 #          print('root.nodeName',root.nodeName,len(root.childNodes))
           nodeType = root.attributes.get('type').value
           item = getattr(self,'make_'+ nodeType)(root)
        return item

    def verifyNode(self,root):
        '''Check that root is an element with a known "type" attribute.

        Returns the value of the "type" attribute.
        Raises AnyXmlException when root is not an element node, has no
        "type" attribute, has an empty one, or names a type that is not
        listed in __expectedTypes__.
        '''
        if root.nodeType != Node.ELEMENT_NODE:
            raise AnyXmlException('Badly formed xml: root has to be ELEMENT_NODE')
        typeAttr = root.attributes.get('type',None)
        if typeAttr is None:
            raise AnyXmlException('Badly formed xml: node has to have attribute type '+ root.nodeName)
        rootType = typeAttr.value
        # The attribute value is a string, never None, so "empty" has to be
        # tested as an empty string.
        if not rootType:
            raise AnyXmlException('Badly formed xml: root attribute type cannot be empty')
        if rootType not in __expectedTypes__:
            raise AnyXmlException('Badly formed xml: unknown type: ' + rootType +' in node ' + root.nodeName)
        return rootType
      
    def buildFromXmlFromList(self,root):
       for node in root.childNodes:
          nodeKind = node.nodeType
          if nodeKind == Node.ELEMENT_NODE:
             nodeName = node.nodeName.strip()
             value = self.make(node)
             if(value[1]):
                nodeName = value[1]
             item = value[0]
             if (item==None):
                continue
             if iskeyword(nodeName):
                nodeName = nodeName[:1].upper()+ nodeName[1:] 
             setattr(self,nodeName,item)
     
    def convert(self,typeof,input):
        '''Convert the text input with the converter registered for typeof.

        The converters are looked up in __builtinTypes__.
        Raises AnyXmlException when typeof is not a registered type name.
        '''
        converter = __builtinTypes__.get(typeof)
        if converter is None:
            raise AnyXmlException('Badly formed xml: unknown primitive type '+ typeof)
        return converter(input)

if __name__ =='__main__':
    # Demo: fetch the US Treasury yield curve xml, add the missing "type"
    # attributes by plain text substitution, build an anyxml object from it
    # and print the curve for the date given on the command line.
    import sys,time
    import urllib.request
    args = sys.argv[1:]
    if len(args) != 1 :
        print ('usage: python anyxml.py date[mm/dd/yyyy], where date is valid business date')
        sys.exit(-1)
    inputDate = dt.strptime(args[0], '%m/%d/%Y').date()
    today = date.today()
    if inputDate > today:
        print('Error:Input date should be in the past or today after 6:00PM')
        sys.exit(-1)
    usingHistory = False
    url ='http://www.treasury.gov/resource-center/data-chart-center/interest-rates/Datasets/yield.xml'
    url_2 ='http://www.treasury.gov/resource-center/data-chart-center/interest-rates/pages/XmlView.aspx?data=yieldyear&year='
    # Dates outside the current month are served by a second url that uses
    # a different xml schema.
    if inputDate.year != today.year or inputDate.month != today.month:
        usingHistory = True
        url = url_2+ str(inputDate.year)
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    t1 = time.perf_counter()
    # The with block closes the response even when reading or decoding fails.
    with urllib.request.urlopen(url) as urlfile:
        data = urlfile.read().decode("utf8")
    if not usingHistory:
        translationTable = {'<LIST_G_WEEK_OF_MONTH>':'<LIST_G_WEEK_OF_MONTH type="list">',
                            '<G_WEEK_OF_MONTH>':'<G_WEEK_OF_MONTH type="tuple">',
                            '<LIST_G_NEW_DATE>':'<LIST_G_NEW_DATE><NEW_DATES type="list">',
                            '<G_NEW_DATE>':'<G_NEW_DATE type="tuple">',
                            '<LIST_G_BC_CAT>':'<LIST_G_BC_CAT type="tuple">',
                            '</LIST_G_NEW_DATE>':'</NEW_DATES></LIST_G_NEW_DATE>',
                            '<G_BC_CAT>':'<G_BC_CAT type="dict">'}
    else:
        translationTable = {'<pre>':'<pre><data type="tuple">',
                            '</pre>':'</entries></data></pre>',
                            '<link rel="self" title="DailyTreasuryYieldCurveRateData" href="DailyTreasuryYieldCurveRateData" xmlns="http://www.w3.org/2005/Atom" />':'<link rel="self" title="DailyTreasuryYieldCurveRateData" href="DailyTreasuryYieldCurveRateData" xmlns="http://www.w3.org/2005/Atom" /><entries type="list">',
                            '<entry xmlns="http://www.w3.org/2005/Atom">':'<entry xmlns="http://www.w3.org/2005/Atom" type="tuple">',
                            '<m:properties xmlns:m="http://schemas.microsoft.com/ado/2007/08/dataservices/metadata">':'<m:properties xmlns:m="http://schemas.microsoft.com/ado/2007/08/dataservices/metadata" type="dict">'}
    t2 = time.perf_counter()
    print('read data from url elapsed time:',t2-t1)
    data = strToXMLstr(data,translationTable)
    doc = minidom.parseString(data)
    root = doc.documentElement
    t1 = time.perf_counter()
    try:
        document = anyxml(root)
        curveFound = False
        if not usingHistory:
            for week_of_month in document.LIST_G_WEEK_OF_MONTH:
                for date_of_week in week_of_month.NEW_DATES:
                    if date_of_week.BID_CURVE_DATE == inputDate :
                        curveFound = True
                        print("{} {} {}".format('For',inputDate,'US Treasure Curve is:'))
                        print('Tenor   Yield')
                        for k,v in date_of_week.G_BC_CAT.items():
                            print(k[3:],"{0}{1}".format(v,'%'))
                        break
            if not curveFound:
                print('For',inputDate,'US Treasure Curve not found: is it business date?')
                print('However, Curves are available for dates:')
                for week_of_month in document.LIST_G_WEEK_OF_MONTH:
                    for date_of_week in week_of_month.NEW_DATES:
                        print(date_of_week.BID_CURVE_DATE)
        else:
            for entry in document.data.entries:
                tbl = entry.m_properties
                if tbl['d_NEW_DATE'] == inputDate:
                    curveFound = True
                    print("{} {} {}".format('For',inputDate,'US Treasure Curve is:'))
                    print('Tenor   Yield')
                    for k,v in tbl.items():
                        if k.find('_BC_') !=-1:
                            print(k[5:],"{0}{1}".format(v,'%'))
                    break
            if not curveFound:
                print('For',inputDate,'US Treasure Curve not found: is it business date?')
    except AnyXmlException as err:
        print(err.what)
    # Measure here so that the figure is an elapsed time on the error path
    # as well, not the raw start timestamp.
    print("parse document elapsed time:",time.perf_counter()-t1)

Diff to Previous Revision

--- revision 5 2012-02-09 05:30:58
+++ revision 6 2012-02-09 07:33:28
@@ -4,6 +4,9 @@
 from datetime import datetime as dt
 
 def make_date(d):
+   index = d.find('T')
+   if index != -1:
+     d = d[:index] 
    parts = d.split('-')
    dt_type='-'
    if(len(parts)==1):
@@ -14,7 +17,7 @@
       first_f ='%d'
       second_f = '%Y' if len(parts[2])==4 else '%y'
    else:
-      if int(parts[1])<12:
+      if int(parts[1])<=12:
          month_f = '%m'
          first_f = '%Y' if len(parts[0])==4 else '%d'
          if ( first_f == '%Y'):
@@ -124,7 +127,8 @@
        Yield Curve. The xml itself, fetched from url, is in terrible condition: meaningless names,
        all kind of date formats("02-03-2012","01-FEB-12","01/23/12","11/28/06" are used simultaneously, etc ...
        However, with little help of pre-processing, it is possible to convert that garbage into reasonable
-       python class. Run appliation as python anyxml.py 02/07/2012.     
+       python class. Actually, there are two different urls with two xml schema and in both cases it is possible
+       to retrieve datumm. Run appliation as python anyxml.py 02/07/2012 or 02/07/2011 for historic curve     
        Finally: the instance of resultant class is anyxml, but class iself will contain just 
        natural python' type  members with some exception: the named tuples are user defined classes
        and __class__ of those will be "anyxml.namedtuple", but __name__ is just "namedtuple".
@@ -197,6 +201,8 @@
                  nodeName = item[1]             
               if not nodeName[0].isalpha():
                  raise AnyXmlException('Badly formed xml: names in tuple or struct should be valid ')
+              if nodeName.find(':') !=-1:
+                 nodeName = nodeName.replace(':','_') 
               if iskeyword(nodeName):
                  nodeName = nodeName[:1].upper()+ nodeName[1:] 
               if len(names)==0 or not nodeName in names: 
@@ -245,10 +251,15 @@
               return 'bool'
            return 'str'  
         else:
-           components = content.split('-')  
+           index = content.find('T')
+           if index != -1:
+              input = content[:index]
+           else:
+              input = content 
+           components = input.split('-')  
            if(len(components)==1):
-              components = content.split('/')      
-           if (8<=len(content)<= 10)  and len(components) == 3:
+              components = input.split('/')      
+           if (8<=len(input)<= 10)  and len(components) == 3:
               return 'date'
            else:
               try:
@@ -320,13 +331,6 @@
    if (inputDate.year != today.year or inputDate.month !=today.month):
       usingHistory = True
       url = url_2+ str(inputDate.year)
-#   print(url)
-   if usingHistory :
-      print('Historical datum are located at diffrent url with other xml schema')
-      print('For time being use this application to retrive Treasure Curve for')
-      print('any business date, which is in the same month as current month and')
-      print('input date should be in the past(however in the same month) or today after 6:00PM')
-      sys.exit(-1)
    t1 = time.clock()
    urlfile = urllib.request.urlopen(url)
    data = urlfile.read().decode("utf8")
@@ -341,36 +345,52 @@
                           '<G_BC_CAT>':'<G_BC_CAT type="dict">'}
    else:
       translationTable = {'<pre>':'<pre><data type="tuple">',
-                          '</pre>':'</data></pre>'}
-                                  
-  
+                          '</pre>':'</entries></data></pre>',
+                          '<link rel="self" title="DailyTreasuryYieldCurveRateData" href="DailyTreasuryYieldCurveRateData" xmlns="http://www.w3.org/2005/Atom" />':'<link rel="self" title="DailyTreasuryYieldCurveRateData" href="DailyTreasuryYieldCurveRateData" xmlns="http://www.w3.org/2005/Atom" /><entries type="list">',        
+                          '<entry xmlns="http://www.w3.org/2005/Atom">':'<entry xmlns="http://www.w3.org/2005/Atom" type="tuple">',
+                          '<m:properties xmlns:m="http://schemas.microsoft.com/ado/2007/08/dataservices/metadata">':'<m:properties xmlns:m="http://schemas.microsoft.com/ado/2007/08/dataservices/metadata" type="dict">'}
    t2 = time.clock()
    print('read data from url elapsed time:',t2-t1)
    data = strToXMLstr(data,translationTable)
-   debugFile = open('modtreasure.xml','w')
-   debugFile.write('%s\n' % data)
-   debugFile.close()
+#   debugFile = open('modtreasure.xml','w')
+#   debugFile.write('%s\n' % data)
+#   debugFile.close()
    doc = minidom.parseString(data)
    root = doc.documentElement
    try:
       t1 = time.clock()
       document = anyxml(root)
-      curveFound = False
-      for week_of_month in document.LIST_G_WEEK_OF_MONTH:
-          for date_of_week in week_of_month.NEW_DATES: 
-             if date_of_week.BID_CURVE_DATE == inputDate :
-                curveFound = True
-                print("{} {} {}".format('For',date_of_week.BID_CURVE_DATE,'US Treasure Curve is:'))
-                print('Tenor   Yield')
-                for k,v in date_of_week.G_BC_CAT.items():
-                   print(k[3:],"{0}{1}".format(v,'%'))
-                break
-      if not curveFound:
-         print('For',inputDate,'US Treasure Curve not found: is it business date?')
-         print('However, Curves are available for dates:')
+      if (not usingHistory):
+         curveFound = False
          for week_of_month in document.LIST_G_WEEK_OF_MONTH:
             for date_of_week in week_of_month.NEW_DATES: 
-              print(date_of_week.BID_CURVE_DATE)
+               if date_of_week.BID_CURVE_DATE == inputDate :
+                  curveFound = True
+                  print("{} {} {}".format('For',inputDate,'US Treasure Curve is:'))
+                  print('Tenor   Yield')
+                  for k,v in date_of_week.G_BC_CAT.items():
+                     print(k[3:],"{0}{1}".format(v,'%'))
+                  break
+         if not curveFound:
+            print('For',inputDate,'US Treasure Curve not found: is it business date?')
+            print('However, Curves are available for dates:')
+            for week_of_month in document.LIST_G_WEEK_OF_MONTH:
+               for date_of_week in week_of_month.NEW_DATES: 
+                 print(date_of_week.BID_CURVE_DATE)
+      else:
+         curveFound = False 
+         for entry in document.data.entries:
+             tbl = entry.m_properties
+             if tbl['d_NEW_DATE'] == inputDate:
+                curveFound = True
+                print("{} {} {}".format('For',inputDate,'US Treasure Curve is:'))
+                print('Tenor   Yield')
+                for k,v in tbl.items():
+                   if k.find('_BC_') !=-1:
+                     print(k[5:],"{0}{1}".format(v,'%'))  
+                break;
+         if not curveFound:
+            print('For',inputDate,'US Treasure Curve not found: is it business date?')      
       t1 = time.clock()-t1;
    except AnyXmlException as err:
       print(err.what)

Recipe 578032 revision 6

Diff to Previous Revision

History

Accounts

Code Recipes

Feedback & Information

ActiveState