Welcome, guest | Sign In | My Account | Store | Cart
""" This modules provides a lightweight API to access Excel data.
    There are many ways to read Excel data, including ODBC. This module
    uses ADODB and has the advantage of only requiring a file name and a
    sheet name (no setup required).
"""


import win32com.client

class ExcelDocument(object):
   
""" Represents an opened Excel document.
    """


   
def __init__(self,filename):
       
self.connection = win32com.client.Dispatch('ADODB.Connection')
       
self.connection.Open(
           
'PROVIDER=Microsoft.Jet.OLEDB.4.0;'+
           
'DATA SOURCE=%s'%filename+
           
';Extended Properties="Excel 8.0;HDR=1;IMEX=1"'
       
)

   
def sheets(self):
       
""" Returns a list of the name of the sheets found in the document.
        """

        result
= []
        recordset
= self.connection.OpenSchema(20)
       
while not recordset.EOF:
            result
.append(recordset.Fields[2].Value)
            recordset
.MoveNext()
        recordset
.Close()
       
del recordset
       
return result

   
def sheet(self,name,encoding=None,order_by=None):
       
""" Returns a sheet object by name. Use sheets() to obtain a list of
            valid names. encoding is a character encoding name which is used
            to encode the unicode strings returned by Excel, so that you get
            plain Python strings.
        """

       
return ExcelSheet(self,name,encoding,order_by)
           
   
def __del__(self):
       
self.close()

   
def close(self):
       
""" Closes the Excel document. It is automatically called when
            the object is deleted.
        """

       
try:
           
self.connection.Close()
           
del self.connection
       
except:
           
pass

def strip(value):
   
""" Strip the input value if it is a string and returns None
        if it had only whitespaces """

   
if isinstance(value,basestring):
        value
= value.strip()
       
if len(value)==0:
           
return None
   
return value

class ExcelSheet(object):
   
""" Represents an Excel sheet from a document, gives methods to obtain
        column names and iterate on its content.
    """

   
def __init__(self,document,name,encoding,order_by):
       
self.document = document
       
self.name = name
       
self.order_by = order_by
       
if encoding:
           
def encoder(value):
               
if isinstance(value,unicode):
                    value
= value.strip()
                   
if len(value)==0:
                       
return None
                   
else:
                       
return value.encode(encoding)
               
elif isinstance(value,str):
                    value
= value.strip()
                   
if len(value)==0:
                       
return None
                   
else:
                       
return value
               
else:
                   
return value
           
self.encoding = encoder
       
else:
           
self.encoding = strip

   
def columns(self):
       
""" Returns a list of column names for the sheet.
        """

        recordset
= win32com.client.Dispatch('ADODB.Recordset')
        recordset
.Open(u'SELECT * FROM [%s]'%self.name,self.document.connection,0,1)
       
try:
           
return [self.encoding(field.Name) for field in recordset.Fields]
       
finally:
            recordset
.Close()
           
del recordset
               
   
def __iter__(self):
       
""" Returns a paged iterator by default. See paged().
        """

       
return self.paged()

   
def naive(self):
       
""" Returns an iterator on the data contained in the sheet. Each row
            is returned as a dictionary with row headers as keys.
        """

       
# SLOW algorithm ! A lot of COM calls are performed.
        recordset
= win32com.client.Dispatch('ADODB.Recordset')
       
if self.order_by:
            recordset
.Open(u'SELECT * FROM [%s] ORDER BY %s'%(self.name,self.order_by),self.document.connection,0,1)
       
else:
            recordset
.Open(u'SELECT * FROM [%s]'%self.name,self.document.connection,0,1)
       
try:
           
while not recordset.EOF:
                source
= {}
               
for field in recordset.Fields:
                    source
[self.encoding(field.Name)] = self.encoding(field.Value)
               
yield source
                recordset
.MoveNext()
            recordset
.Close()
           
del recordset
       
except:
           
# cannot use "finally" here because Python doesn't want
           
# a "yield" statement inside a "try...finally" block.
            recordset
.Close()
           
del recordset
           
raise
   
   
def paged(self,pagesize=128):
       
""" Returns an iterator on the data contained in the sheet. Each row
            is returned as a dictionary with row headers as keys. pagesize is
            the size of the buffer of rows ; it is an implementation detail but
            could have an impact on the speed of the iterator. Use pagesize=-1
            to buffer the whole sheet in memory.
        """

       
# FAST algorithm ! It is about 10x faster than the naive algorithm
       
# thanks to the use of GetRows, which dramatically decreases the number
       
# of COM calls.
        recordset
= win32com.client.Dispatch('ADODB.Recordset')
       
if self.order_by:
            recordset
.Open(u'SELECT * FROM [%s] ORDER BY %s'%(self.name,self.order_by),self.document.connection,0,1)
       
else:
            recordset
.Open(u'SELECT * FROM [%s]'%self.name,self.document.connection,0,1)
       
try:
            fields
= [self.encoding(field.Name) for field in recordset.Fields]
            ok
= True
           
while ok:
               
# Thanks to Rogier Steehouder for the transposing tip
                rows
= zip(*recordset.GetRows(pagesize))
               
               
if recordset.EOF:
                   
# close the recordset as soon as possible
                    recordset
.Close()
                    recordset
= None
                    ok
= False

               
for row in rows:
                   
yield dict(zip(fields, map(self.encoding,row)))
       
except:
           
if recordset is not None:
                recordset
.Close()
               
del recordset
           
raise

History

  • revision 3 (18 years ago)
  • previous revisions are not available