Read tabular data from Excel spreadsheets the fast and easy way « Python recipes

Sometimes you get an Excel spreadsheet (say, from the marketing departement) and you want to read tabular data from it (i.e. a line with column headers and lines of data). There are many ways to do this (including ODBC + mxODBC), but the easiest way I've found is this one : provide a file name and a sheet name, and read the data !

      """ This modules provides a lightweight API to access Excel data.
    There are many ways to read Excel data, including ODBC. This module
    uses ADODB and has the advantage of only requiring a file name and a
    sheet name (no setup required).
"""

import win32com.client

class ExcelDocument(object):
    """ Represents an opened Excel document.
    """

    def __init__(self,filename):
        self.connection = win32com.client.Dispatch('ADODB.Connection')
        self.connection.Open(
            'PROVIDER=Microsoft.Jet.OLEDB.4.0;'+
            'DATA SOURCE=%s'%filename+
            ';Extended Properties="Excel 8.0;HDR=1;IMEX=1"'
        )

    def sheets(self):
        """ Returns a list of the name of the sheets found in the document.
        """
        result = []
        recordset = self.connection.OpenSchema(20)
        while not recordset.EOF:
            result.append(recordset.Fields[2].Value)
            recordset.MoveNext()
        recordset.Close()
        del recordset
        return result

    def sheet(self,name,encoding=None,order_by=None):
        """ Returns a sheet object by name. Use sheets() to obtain a list of
            valid names. encoding is a character encoding name which is used
            to encode the unicode strings returned by Excel, so that you get
            plain Python strings.
        """
        return ExcelSheet(self,name,encoding,order_by)
            
    def __del__(self):
        self.close()

    def close(self):
        """ Closes the Excel document. It is automatically called when
            the object is deleted.
        """
        try:
            self.connection.Close()
            del self.connection
        except:
            pass

def strip(value):
    """ Strip the input value if it is a string and returns None
        if it had only whitespaces """
    if isinstance(value,basestring):
        value = value.strip()
        if len(value)==0:
            return None
    return value

class ExcelSheet(object):
    """ Represents an Excel sheet from a document, gives methods to obtain
        column names and iterate on its content.
    """
    def __init__(self,document,name,encoding,order_by):
        self.document = document
        self.name = name
        self.order_by = order_by
        if encoding:
            def encoder(value):
                if isinstance(value,unicode):
                    value = value.strip()
                    if len(value)==0:
                        return None
                    else:
                        return value.encode(encoding)
                elif isinstance(value,str):
                    value = value.strip()
                    if len(value)==0:
                        return None
                    else:
                        return value 
                else:
                    return value
            self.encoding = encoder
        else:
            self.encoding = strip

    def columns(self):
        """ Returns a list of column names for the sheet.
        """
        recordset = win32com.client.Dispatch('ADODB.Recordset')
        recordset.Open(u'SELECT * FROM [%s]'%self.name,self.document.connection,0,1)
        try:
            return [self.encoding(field.Name) for field in recordset.Fields]
        finally:
            recordset.Close()
            del recordset
                
    def __iter__(self):
        """ Returns a paged iterator by default. See paged().
        """
        return self.paged()

    def naive(self):
        """ Returns an iterator on the data contained in the sheet. Each row
            is returned as a dictionary with row headers as keys.
        """
        # SLOW algorithm ! A lot of COM calls are performed.
        recordset = win32com.client.Dispatch('ADODB.Recordset')
        if self.order_by:
            recordset.Open(u'SELECT * FROM [%s] ORDER BY %s'%(self.name,self.order_by),self.document.connection,0,1)
        else:
            recordset.Open(u'SELECT * FROM [%s]'%self.name,self.document.connection,0,1)
        try:
            while not recordset.EOF:
                source = {}
                for field in recordset.Fields:
                    source[self.encoding(field.Name)] = self.encoding(field.Value)
                yield source
                recordset.MoveNext()
            recordset.Close()
            del recordset
        except:
            # cannot use "finally" here because Python doesn't want
            # a "yield" statement inside a "try...finally" block. 
            recordset.Close()
            del recordset
            raise
    
    def paged(self,pagesize=128):
        """ Returns an iterator on the data contained in the sheet. Each row
            is returned as a dictionary with row headers as keys. pagesize is
            the size of the buffer of rows ; it is an implementation detail but
            could have an impact on the speed of the iterator. Use pagesize=-1
            to buffer the whole sheet in memory.
        """
        # FAST algorithm ! It is about 10x faster than the naive algorithm
        # thanks to the use of GetRows, which dramatically decreases the number
        # of COM calls.
        recordset = win32com.client.Dispatch('ADODB.Recordset')
        if self.order_by:
            recordset.Open(u'SELECT * FROM [%s] ORDER BY %s'%(self.name,self.order_by),self.document.connection,0,1)
        else:
            recordset.Open(u'SELECT * FROM [%s]'%self.name,self.document.connection,0,1)
        try:
            fields = [self.encoding(field.Name) for field in recordset.Fields]
            ok = True
            while ok:
                # Thanks to Rogier Steehouder for the transposing tip 
                rows = zip(*recordset.GetRows(pagesize))
                
                if recordset.EOF:
                    # close the recordset as soon as possible
                    recordset.Close()
                    recordset = None
                    ok = False

                for row in rows:
                    yield dict(zip(fields, map(self.encoding,row)))
        except:
            if recordset is not None:
                recordset.Close()
                del recordset
            raise

      

The implementation is quite straightforward, using the ADO COM library along with win32all (bundled into ActivePython !). The only trick here is the 10x performance improvement I've got when I switched from the "naive()" iterator algorithm to the "paged()" one, using Recordset.GetRows().

Update : This version now knows how to convert unicode strings to the character encoding of your choice, and uses Rogier Steehouder's trick for transposing the pages, which is a tad faster and much more readable.

Tags: database

13 comments

Minh Luu 18 years, 6 months ago # | flag

Excellent. Thank you for sharing this excellent recipe!

Rogier Steehouder 18 years, 4 months ago # | flag

Small change. It was very useful, thanks. I made the following adaption, though.

def paged(self, psize=128):
    recset = Dispatch('ADODB.Recordset')
    recset.Open(u'SELECT * FROM [%s]' % self.name, self.doc.conn, 0, 1)
    try:
        fields = [field.Name for field in recset.Fields]
        ok = True
        while ok:
            # transpose: rows become columns and columns become rows
            rows = zip(*recset.GetRows(psize))
            if recset.EOF:
                recset.Close()
                recset = None
                ok = False
            for row in rows:
                yield dict(zip(fields, row))
    except:
        if recset is not None:
            recset.Close()
            del recset
        raise

Nicolas Lehuen (author) 18 years, 4 months ago # | flag

Good idea ! Thanks for the transposing tip :)

Michael Lim 18 years, 3 months ago # | flag

Error when used in threads. This code is extremely awesome, however when I tried using it inside a thread, this error pops up:

Exception in thread Thread-1:Traceback (most recent call last):

File "D:\Python23\Lib\threading.py", line 442, in __bootstrap

self.run()

File "D:\mlim\SPS-Python\ITLWWMRToCGenerator\Script1.py", line 18, in run

mainClass.getExcelList()

File "D:\mlim\SPS-Python\ITLWWMRToCGenerator\clsMain.py", line 93, in getExcelList

clsE = ExcelDocument(r"D:\mlim\SPS-Python\ITLWWMRToCGenerator\excel\cin24-1.

xls")

File "D:\mlim\SPS-Python\ITLWWMRToCGenerator\clsMain.py", line 104, in __init__

self.connection = win32com.client.Dispatch('ADODB.Connection')

File "D:\Python23\lib\site-packages\win32com\client__init__.py", line 95, in

Dispatch dispatch, userName = dynamic._GetGoodDispatchAndUserName(dispatch,userName,c lsctx)

File "D:\Python23\lib\site-packages\win32com\client\dynamic.py", line 84, in _ GetGoodDispatchAndUserName

return (_GetGoodDispatch(IDispatch, clsctx), userName)

File "D:\Python23\lib\site-packages\win32com\client\dynamic.py", line 72, in _GetGoodDispatch

IDispatch = pythoncom.CoCreateInstance(IDispatch, None, clsctx, pythoncom.IID_IDispatch)

com_error: (-2147221008, 'CoInitialize has not been called.', None, None)

Nicolas Lehuen (author) 18 years, 1 month ago # | flag

This is normal. When using COM objects in a threaded context, you have to make sure to call pythoncom.CoInitialize(). See for example :

http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/347209

Mark Kanter 17 years, 5 months ago # | flag

Ordinal position choice instead. I needed to access the fields by ordinal position, so not to bother my customer who does not send first row as headers in their excel files

So I changed this line:

fields = [self.encoding(field.Name) for field in recordset.Fields]

to this: fields = [self.encoding(field) for field in range recordset.Fields.Count)]

and now can access the fields I want like this:

xls = ExcelAPI.ExcelDocument(mypath) sheetslist = xls.sheets() sheetnametoread = sheetslist[0] #want first sheet, could be named whatever rows = ExcelAPI.ExcelSheet(xls,sheetnametoread,0,0) for row in rows: print row[1],row[2],row[5], query= "Insert into CountyKeyedDeedInfo (DeedType,Book,CountyKeyedID) values('%s',%s,%s)" %(row[1], row[2], row[5] ) ret=cur.execute(query)

Grant Paton-Simpson 16 years, 5 months ago # | flag

Coping with 'Too many fields defined.' when getting list of column names. Sometimes a worksheet treats empty columns as being part of its recordset (you can tell by seeing the width of the slider on the scrollbar - if it is narrow then this issue may apply). The number then exceeds 255, and an error is thrown saying 'Too many fields defined.'. Here is how to trap the error:

def columns(self):
    """ Returns a list of column names for the sheet.
    """
    recordset = win32com.client.Dispatch('ADODB.Recordset')
    try:
        recordset.Open(u'SELECT * FROM [%s]' % self.name, self.document.connection,0,1)
    except Exception, e:
        if e[2][2] == 'Too many fields defined.':
            raise Exception, "Delete blank columns at end of worksheet to keep within " + \
                                "JET 255 column limit.  See http://support.microsoft.com/kb/198504/EN-US/"
        else:
            raise
    try:
        return [self.encoding(field.Name) for field in recordset.Fields]
    finally:
        recordset.Close()
        del recordset

One solution is to select all rightmost columns in the spreadsheet and delete them. The slider on the scrollbar should increase in width.

keith 15 years, 2 months ago # | flag

Hi there,

This is EXTREMELY useful. I was wondering if you could give me an example of how to use this recipe to say copy the values (not formula) from all the sheets in one spreadsheet to another one and save it?

Much appreciated.

Marie Roch 14 years, 3 months ago # | flag

Thank you Nicolas for a very useful module. There appears to be a small bug when using this on an empty page that ocurred in a multi-page spreadsheet. Line 156 (as of 22 Jan 2010) initializes the loop control variable ok to true unconditionally. Based on my limited knowledge of ADO Recordsets, I believe that this should be changed to:

ok = not (recordset.BOF == True and recordset.EOF == True)

as empty record sets are indicated by setting the begin and end of file variables to true.

Cheers, Marie

John 13 years, 10 months ago # | flag

Very useful code. I needed this with Python 3.1 to read an Excel 2007 file with a .xlsx extension. The original code failed with the driver and also failed to compile in Py31 due to the "u'SELECT...'%..." substitutions.

The code worked after I made the following changes: 1) Changed the self.connection.Open( parameter on line 15 with: self.connection.Open( 'PROVIDER=Microsoft.ACE.OLEDB.12.0;'+ 'DATA SOURCE=%s'%filename+ ';Extended Properties="Excel 12.0 Xml;HDR=1;IMEX=1"'

2) Replaced all "u'SELECT..'%" strings with: "'SELECT...'.format(..."

3) Replace basestring with str

Thanks, John

Raghuram 13 years, 8 months ago # | flag

Hello Nicolas Lehuen!

I have excelsheet with the name "checkpoints.xls". I created object for ExcelDocument and tried to pull the data. I am getting the sheetnames and error message saying

[u'Sheet1$', u'Sheet1$_FilterDatabase'] Traceback (most recent call last): File "queryExcel.py", line 174, in ? print a.columns() File "queryExcel.py", line 92, in columns recordset.Open(u'SELECT * FROM [%s]'%self.name,self.document.connection,0,1)

AttributeError: 'str' object has no attribute 'connection'

I added following code next to your code:

if __name__ == '__main__': demo = ExcelDocument(r"c:\checkpoints.xls") print demo.sheets()
a = ExcelSheet(r"c:\checkpoints.xls","sheet1","None","ASC") print a.columns()

Please suggest me to get data from excelsheet with out above error.

Thanks, Raghuram Reddy G

Arun Ragini 13 years, 8 months ago # | flag

Hi,

When I tried the code, I got the following error, I would be very thank full, if some one could provide me the solution.

demo = ExcelSheet(r"C:\Documents and Settings\akanthra\Desktop\DLM_CHECKPOINTS.xls", 'Reader', None, None)

>>> print demo.columns()
Traceback (most recent call last):
  File "<pyshell#1>", line 1, in <module>
    print demo.columns()
  File "C:\Documents and Settings\akanthra\Desktop\ExcelRead.py", line 96, in columns
    recordset.Open(u'SELECT * FROM [%s]'%self.name,self.document.connection,0,1)
AttributeError: 'str' object has no attribute 'connection'
>>> demo.document
'C:\\Documents and Settings\\akanthra\\Desktop\\DLM_CHECKPOINTS.xls'

Arun Ragini 13 years, 8 months ago # | flag

finally, I got it working. Thanks for the excellent code

For newbie

>>> demo = ExcelDocument(r'path of excel document')
<h4 id="prints-workbooksheets-names">prints workbook/sheets names</h4>



>>> print demo.sheets()
<h4 id="load-workbooksheet-name-should-match-from-demosheets">load workbook/sheet, name should match from demo.sheets.</h4>



>>> mysheet = demo.sheet('name from demo.sheet')
<h4 id="print-the-data">print the data</h4>



>>> for data in mysheet.paged():
    print data

◄	Python recipes (4591)	►
◄	Nicolas Lehuen's recipes (7)	►

Read tabular data from Excel spreadsheets the fast and easy way (Python recipe) by Nicolas Lehuen
ActiveState Code (http://code.activestate.com/recipes/440661/)

13 comments

fields = [self.encoding(field.Name) for field in recordset.Fields]

For newbie

Forks

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

Read tabular data from Excel spreadsheets the fast and easy way (Python recipe) by Nicolas Lehuen ActiveState Code (http://code.activestate.com/recipes/440661/)

13 comments

fields = [self.encoding(field.Name) for field in recordset.Fields]

For newbie

Forks

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

Read tabular data from Excel spreadsheets the fast and easy way (Python recipe) by Nicolas Lehuen
ActiveState Code (http://code.activestate.com/recipes/440661/)