Recipe 440661 revision 3 « ActiveState Code

""" This modules provides a lightweight API to access Excel data.
    There are many ways to read Excel data, including ODBC. This module
    uses ADODB and has the advantage of only requiring a file name and a
    sheet name (no setup required).
"""

import win32com.client

class ExcelDocument(object):
    """ Represents an opened Excel document.
    """

    def __init__(self,filename):
        self.connection = win32com.client.Dispatch('ADODB.Connection')
        self.connection.Open(
            'PROVIDER=Microsoft.Jet.OLEDB.4.0;'+
            'DATA SOURCE=%s'%filename+
            ';Extended Properties="Excel 8.0;HDR=1;IMEX=1"'
        )

    def sheets(self):
        """ Returns a list of the name of the sheets found in the document.
        """
        result = []
        recordset = self.connection.OpenSchema(20)
        while not recordset.EOF:
            result.append(recordset.Fields[2].Value)
            recordset.MoveNext()
        recordset.Close()
        del recordset
        return result

    def sheet(self,name,encoding=None,order_by=None):
        """ Returns a sheet object by name. Use sheets() to obtain a list of
            valid names. encoding is a character encoding name which is used
            to encode the unicode strings returned by Excel, so that you get
            plain Python strings.
        """
        return ExcelSheet(self,name,encoding,order_by)
            
    def __del__(self):
        self.close()

    def close(self):
        """ Closes the Excel document. It is automatically called when
            the object is deleted.
        """
        try:
            self.connection.Close()
            del self.connection
        except:
            pass

def strip(value):
    """ Strip the input value if it is a string and returns None
        if it had only whitespaces """
    if isinstance(value,basestring):
        value = value.strip()
        if len(value)==0:
            return None
    return value

class ExcelSheet(object):
    """ Represents an Excel sheet from a document, gives methods to obtain
        column names and iterate on its content.
    """
    def __init__(self,document,name,encoding,order_by):
        self.document = document
        self.name = name
        self.order_by = order_by
        if encoding:
            def encoder(value):
                if isinstance(value,unicode):
                    value = value.strip()
                    if len(value)==0:
                        return None
                    else:
                        return value.encode(encoding)
                elif isinstance(value,str):
                    value = value.strip()
                    if len(value)==0:
                        return None
                    else:
                        return value 
                else:
                    return value
            self.encoding = encoder
        else:
            self.encoding = strip

    def columns(self):
        """ Returns a list of column names for the sheet.
        """
        recordset = win32com.client.Dispatch('ADODB.Recordset')
        recordset.Open(u'SELECT * FROM [%s]'%self.name,self.document.connection,0,1)
        try:
            return [self.encoding(field.Name) for field in recordset.Fields]
        finally:
            recordset.Close()
            del recordset
                
    def __iter__(self):
        """ Returns a paged iterator by default. See paged().
        """
        return self.paged()

    def naive(self):
        """ Returns an iterator on the data contained in the sheet. Each row
            is returned as a dictionary with row headers as keys.
        """
        # SLOW algorithm ! A lot of COM calls are performed.
        recordset = win32com.client.Dispatch('ADODB.Recordset')
        if self.order_by:
            recordset.Open(u'SELECT * FROM [%s] ORDER BY %s'%(self.name,self.order_by),self.document.connection,0,1)
        else:
            recordset.Open(u'SELECT * FROM [%s]'%self.name,self.document.connection,0,1)
        try:
            while not recordset.EOF:
                source = {}
                for field in recordset.Fields:
                    source[self.encoding(field.Name)] = self.encoding(field.Value)
                yield source
                recordset.MoveNext()
            recordset.Close()
            del recordset
        except:
            # cannot use "finally" here because Python doesn't want
            # a "yield" statement inside a "try...finally" block. 
            recordset.Close()
            del recordset
            raise
    
    def paged(self,pagesize=128):
        """ Returns an iterator on the data contained in the sheet. Each row
            is returned as a dictionary with row headers as keys. pagesize is
            the size of the buffer of rows ; it is an implementation detail but
            could have an impact on the speed of the iterator. Use pagesize=-1
            to buffer the whole sheet in memory.
        """
        # FAST algorithm ! It is about 10x faster than the naive algorithm
        # thanks to the use of GetRows, which dramatically decreases the number
        # of COM calls.
        recordset = win32com.client.Dispatch('ADODB.Recordset')
        if self.order_by:
            recordset.Open(u'SELECT * FROM [%s] ORDER BY %s'%(self.name,self.order_by),self.document.connection,0,1)
        else:
            recordset.Open(u'SELECT * FROM [%s]'%self.name,self.document.connection,0,1)
        try:
            fields = [self.encoding(field.Name) for field in recordset.Fields]
            ok = True
            while ok:
                # Thanks to Rogier Steehouder for the transposing tip 
                rows = zip(*recordset.GetRows(pagesize))
                
                if recordset.EOF:
                    # close the recordset as soon as possible
                    recordset.Close()
                    recordset = None
                    ok = False

                for row in rows:
                    yield dict(zip(fields, map(self.encoding,row)))
        except:
            if recordset is not None:
                recordset.Close()
                del recordset
            raise
Recipe 440661 revision 3

History

Accounts

Code Recipes

Feedback & Information

ActiveState