#!/usr/bin/env python
# -*- coding: cp1252 -*-

""" A Python interface to the IBM SPSS Statistics Input Output Module
(Windows: spssio32.dll)"""

# spssio32.dll and documentation can be downloaded here:
# https://www.ibm.com/developerworks/mydeveloperworks/wikis/home/wiki/We70df3195ec8_4f95_9773_42e448fa9029/page/Downloads%20for%20IBM%C2%AE%20SPSS%C2%AE%20Statistics?lang=en
# TO DO: make this work under Linux. When I tried the .so file,
# I had a hard time finding all the necessary dependencies.
# The .so file versions that are needed are rather old.
# NOTE: This is rather slow in processing huge files.
# ANY FEEDBACK ON THIS CODE IS WELCOME: "@".join(["fomcl", "yahoo.com"])

from __future__ import with_statement # only Python 2.5
import sys, os, ctypes, datetime
try:
    import psyco
    psyco.full()
except ImportError:
    pass

__author__  =  'Albert-Jan Roskam'
__version__ =  '1.0.0'

# Return codes of the SPSS I/O module functions, keyed by numeric code.
# 0 means success, positive codes are errors and negative codes are warnings
# (values as defined in the spssdio.h header that ships with the module).
retcodes =    {0: 'SPSS_OK',
               # ---- errors
               1: 'SPSS_FILE_OERROR',
               2: 'SPSS_FILE_WERROR',
               3: 'SPSS_FILE_RERROR',
               4: 'SPSS_FITAB_FULL',
               5: 'SPSS_INVALID_HANDLE',
               6: 'SPSS_INVALID_FILE',
               7: 'SPSS_NO_MEMORY',
               8: 'SPSS_OPEN_RDMODE',
               9: 'SPSS_OPEN_WRMODE',
               10: 'SPSS_INVALID_VARNAME',
               11: 'SPSS_DICT_EMPTY',
               12: 'SPSS_VAR_NOTFOUND',
               13: 'SPSS_DUP_VAR',
               14: 'SPSS_NUME_EXP',
               15: 'SPSS_STR_EXP',
               16: 'SPSS_SHORTSTR_EXP',
               17: 'SPSS_INVALID_VARTYPE',
               18: 'SPSS_INVALID_MISSFOR',
               19: 'SPSS_INVALID_COMPSW',
               20: 'SPSS_INVALID_PRFOR',
               21: 'SPSS_INVALID_WRFOR',
               22: 'SPSS_INVALID_DATE',
               23: 'SPSS_INVALID_TIME',
               24: 'SPSS_NO_VARIABLES',
               25: 'SPSS_MIXED_TYPES',
               27: 'SPSS_DUP_VALUE',
               28: 'SPSS_INVALID_CASEWGT',
               29: 'SPSS_INCOMPATIBLE_DICT',
               30: 'SPSS_DICT_COMMIT',
               31: 'SPSS_DICT_NOTCOMMIT',
               33: 'SPSS_NO_TYPE2',
               41: 'SPSS_NO_TYPE73',
               45: 'SPSS_INVALID_DATEINFO',
               46: 'SPSS_NO_TYPE999',
               47: 'SPSS_EXC_STRVALUE',
               48: 'SPSS_CANNOT_FREE',
               49: 'SPSS_BUFFER_SHORT',
               50: 'SPSS_INVALID_CASE',
               51: 'SPSS_INTERNAL_VLABS',
               52: 'SPSS_INCOMPAT_APPEND',
               53: 'SPSS_INTERNAL_D_A',
               54: 'SPSS_FILE_BADTEMP',
               55: 'SPSS_DEW_NOFIRST',
               56: 'SPSS_INVALID_MEASURELEVEL',
               57: 'SPSS_INVALID_7SUBTYPE',
               58: 'SPSS_INVALID_VARHANDLE',
               59: 'SPSS_INVALID_ENCODING',
               60: 'SPSS_FILES_OPEN',
               70: 'SPSS_INVALID_MRSETDEF',
               71: 'SPSS_INVALID_MRSETNAME',
               72: 'SPSS_DUP_MRSETNAME',
               73: 'SPSS_BAD_EXTENSION',
               74: 'SPSS_INVALID_EXTENDEDSTRING',
               75: 'SPSS_INVALID_ATTRNAME',
               76: 'SPSS_INVALID_ATTRDEF',
               77: 'SPSS_INVALID_MRSETINDEX',
               78: 'SPSS_INVALID_VARSETDEF',
               79: 'SPSS_INVALID_ROLE',
               # ---- warnings
               -1: 'SPSS_EXC_LEN64',
               -2: 'SPSS_EXC_VARLABEL',    # same value as SPSS_EXC_LEN120
               -4: 'SPSS_EXC_VALLABEL',    # same value as SPSS_EXC_LEN60
               -5: 'SPSS_FILE_END',
               -6: 'SPSS_NO_VARSETS',
               -7: 'SPSS_EMPTY_VARSETS',
               -8: 'SPSS_NO_LABELS',
               -9: 'SPSS_NO_LABEL',
               -10: 'SPSS_NO_CASEWGT',
               -11: 'SPSS_NO_DATEINFO',
               -12: 'SPSS_NO_MULTRESP',
               -13: 'SPSS_EMPTY_MULTRESP',
               -14: 'SPSS_NO_DEW',
               -15: 'SPSS_EMPTY_DEW'}

# Print/write format codes of the SPSS I/O module, keyed by numeric code.
# Each value is a (constant name, human-readable description) tuple; only
# the constant name is used by the reader, the description is informational.
printTypes =  {1: ('SPSS_FMT_A', 'Alphanumeric'),
               2: ('SPSS_FMT_AHEX', 'Alphanumeric hexadecimal'),
               3: ('SPSS_FMT_COMMA', 'F Format with commas'),
               4: ('SPSS_FMT_DOLLAR', 'Commas and floating dollar sign'),
               5: ('SPSS_FMT_F', 'Default Numeric Format'),
               6: ('SPSS_FMT_IB', 'Integer binary'),
               7: ('SPSS_FMT_PIBHEX', 'Positive integer binary - hex'),
               8: ('SPSS_FMT_P', 'Packed decimal'),
               9: ('SPSS_FMT_PIB', 'Positive integer binary unsigned'),
               10: ('SPSS_FMT_PK', 'Unsigned packed decimal'),
               11: ('SPSS_FMT_RB', 'Floating point binary'),
               12: ('SPSS_FMT_RBHEX', 'Floating point binary hex'),
               15: ('SPSS_FMT_Z', 'Zoned decimal'),
               16: ('SPSS_FMT_N', 'N Format- unsigned with leading 0s'),
               17: ('SPSS_FMT_E', 'E Format- with explicit power of 10'),
               20: ('SPSS_FMT_DATE', 'Date format dd-mmm-yyyy'),
               21: ('SPSS_FMT_TIME', 'Time format hh:mm:ss.s'),
               22: ('SPSS_FMT_DATE_TIME', 'Date and Time'),
               23: ('SPSS_FMT_ADATE', 'Date format mm/dd/yyyy'),
               24: ('SPSS_FMT_JDATE', 'Julian date - yyyyddd'),
               25: ('SPSS_FMT_DTIME', 'Date-time dd hh:mm:ss.s'),
               26: ('SPSS_FMT_WKDAY', 'Day of the week'),
               27: ('SPSS_FMT_MONTH', 'Month'),
               28: ('SPSS_FMT_MOYR', 'mmm yyyy'),
               29: ('SPSS_FMT_QYR', 'q Q yyyy'),
               30: ('SPSS_FMT_WKYR', 'ww WK yyyy'),
               31: ('SPSS_FMT_PCT', 'Percent - F followed by %'),
               32: ('SPSS_FMT_DOT', 'Like COMMA, switching dot for comma'),
               33: ('SPSS_FMT_CCA', 'User Programmable currency format'),
               34: ('SPSS_FMT_CCB', 'User Programmable currency format'),
               35: ('SPSS_FMT_CCC', 'User Programmable currency format'),
               36: ('SPSS_FMT_CCD', 'User Programmable currency format'),
               37: ('SPSS_FMT_CCE', 'User Programmable currency format'),
               38: ('SPSS_FMT_EDATE','Date in dd/mm/yyyy style'),
               39: ('SPSS_FMT_SDATE', 'Date in yyyy/mm/dd style')}

class SavReader(object):
    """ Read Spss system files (.sav)

    Parameters:
    -savFileName: the file name of the spss data file
    -returnHeader: Boolean that indicates whether the first record should
        be a list of variable names (default is True)
    -recodeSysmisTo: indicates to which value system-missing numeric values
        should be recoded (default = ""),
    -selectVars: indicates which variables in the file should be selected.
        The variables should be specified as a list or a tuple of
        valid variable names. If None is specified, all variables
        in the file are used (default is None)
    -verbose: Boolean that indicates whether information about the spss data file
        (e.g., number of cases, variable names, file size) should be printed on
        the screen (default: True).

    Typical use:
    savFileName = "d:/someFile.sav"
    with SavReader(savFileName) as sav:
        header = sav.next()
        for line in sav:
            process(line)
    """

    # SPSS stores system-missing numeric values as the largest negative
    # double (-DBL_MAX). Values at or below this cut-off are considered
    # sysmis; ordinary zeros and negative numbers are left untouched.
    CUT_OFF = -1.7976931348623157e+308

    # Internal SPSS dates are counted in seconds from this moment.
    gregorianEpoch = datetime.datetime(1582, 10, 14, 0, 0, 0)

    # strftime patterns for the date-like print formats that are converted
    # to text by formatValue(). Built once instead of on every call.
    _DATE_FORMATS = {'SPSS_FMT_DATE':     '%Y-%m-%d',
                     'SPSS_FMT_JDATE':    '%Y-%m-%d',
                     'SPSS_FMT_EDATE':    '%Y-%m-%d',
                     'SPSS_FMT_SDATE':    '%Y-%m-%d',
                     'SPSS_FMT_DATE_TIME':'%Y-%m-%d %H:%M:%S',
                     'SPSS_FMT_ADATE':    '%Y-%m-%d',
                     'SPSS_FMT_WKDAY':    '%A',
                     'SPSS_FMT_MONTH':    '%B',
                     'SPSS_FMT_MOYR':     '%B %Y',
                     'SPSS_FMT_WKYR':     '%W WK %Y'}

    def __init__(self, savFileName, returnHeader=True, recodeSysmisTo="",
                 selectVars=None, verbose=True):
        """ Constructor. Stores the options, allocates the ctypes buffers
        that are recycled for every library call, opens the file and reads
        its basic properties (see readBasicSavFileInfo). """

        self.savFileName = savFileName
        self.returnHeader = returnHeader
        self.recodeSysmisTo = recodeSysmisTo
        self.selectVars = selectVars
        self.verbose = verbose

        self.numVars = ctypes.c_int()
        self.numVarsPtr = ctypes.byref(self.numVars)

        self.nCases = ctypes.c_long()
        self.numofCasesPtr = ctypes.byref(self.nCases)

        self.printType = ctypes.c_int()
        self.printDec = ctypes.c_int()
        self.printWid = ctypes.c_int()
        self.printTypePtr = ctypes.byref(self.printType)
        self.printDecPtr = ctypes.byref(self.printDec)
        self.printWidPtr = ctypes.byref(self.printWid)

        self.attribNames = ctypes.c_char_p()
        self.attribText = ctypes.c_char_p()
        self.nAttributes = ctypes.c_int()
        self.attribNamesPtr = ctypes.byref(self.attribNames)
        self.attribTextPtr = ctypes.byref(self.attribText)
        self.nAttributesPtr = ctypes.byref(self.nAttributes)

        self.numValue = ctypes.c_double()
        self.numValuePtr = ctypes.byref(self.numValue)
        self.assumedCharWid = 200
        self._allocateCharBuffer(self.assumedCharWid)

        self.retcode, self.spssio, self.fh, self.varHandles, self.numVars_, \
          self.nCases_, self.varNames, self.varTypes, self.printTypesFile, \
          self.printTypeLabels, self.varWids = self.readBasicSavFileInfo()

        # The var type of a string variable is its length; make sure the
        # shared buffer can hold the widest one plus the terminating null.
        widestString = max([0] + list(self.varTypes.values()))
        if widestString >= self.assumedCharWid:
            self.assumedCharWid = widestString + 1
            self._allocateCharBuffer(self.assumedCharWid)

    def _allocateCharBuffer(self, size):
        """ (Re)create the buffer that receives string values. """
        self.charValue = ctypes.create_string_buffer(size)
        self.charValuePtr = ctypes.byref(self.charValue)
        self.valueSize = ctypes.c_int(size)

    def __enter__(self):
        """ This function returns the record generator (see readSavFile)
        for the spss data file that was opened by the constructor."""
        return self.readSavFile(self.returnHeader, self.recodeSysmisTo,
                                self.selectVars)

    def __exit__(self, type, value, tb):
        """ This function closes the spss data file. Exceptions raised
        inside the with-block are not suppressed."""
        self.spssio.spssCloseRead(self.fh)

    def readBasicSavFileInfo(self):
        """ This function opens the spss data file and reads some basic
        information from it. Note that every call opens the file again.
        It returns the following variables:
        retcode: the return code of opening the file (0 means OK)
        spssio: the spss i/o C module, opened with ctypes.windll.spssio32
        fh: the file handle
        varHandles: a dictionary with var names as keys and var handles as values
        numVars: the number of variables in the spss data file
        nCases: the number of cases (records) in the spss data file
        varNames: a list of the var names  in the spss data file
        varTypes: a dictionary with var names as keys and var types as values
        printTypesFile: a dictionary with var names as keys and print types as values
        printTypeLabels: a dictionary with var names as keys and print type labels as values
        varWids: a dictionary with var names as keys and var widths as values
        """
        self.retcode, self.spssio, self.fh = self.loadSavFile(self.savFileName)
        numVars = self.getNumberofVariables(self.fh, self.spssio)[1]
        nCases = self.getNumberofCases(self.fh, self.spssio)[1]
        varNames, varTypeList = self.getVarInfo(self.fh, self.spssio)

        varTypes, printTypesFile, varWids, varHandles = {}, {}, {}, {}
        printDecs, printWids = {}, {}
        for varName, varType in zip(varNames, varTypeList):
            varTypes[varName] = varType
            printType, printDec, printWid = \
                       self.getVarPrintFormat(self.fh, self.spssio,
                                              variable=varName)[1:]
            printTypesFile[varName] = printType
            varWids[varName] = printWid
            varHandles[varName] = self.getVarHandle(self.fh, self.spssio,
                                                    variable=varName)[1]
            printDecs[varName] = printDec
            printWids[varName] = printWid

        printTypeLabels = dict((varName, printTypes[printType][0])
                               for varName, printType in printTypesFile.items())

        if self.verbose:
            # e.g. 'SPSS_FMT_DATE_TIME' is reported as 'TIME'
            fmts = dict((varName, printTypeLabels[varName].split("_")[-1])
                        for varName in varNames)
            self.getFileReport(self.savFileName, varNames, varTypes, fmts,
                               printDecs, printWids, nCases)

        # Report the result of opening the file, not of the last
        # print-format query (which does not even exist for an empty file).
        return self.retcode, self.spssio, self.fh, varHandles, numVars, \
               nCases, varNames, varTypes, printTypesFile, printTypeLabels, \
               varWids

    def loadSavFile(self, savFileName):
        """ This function loads the spss I/O library (spssio32.dll, Windows
        only) and opens the spss data file for reading.

        Returns a tuple (retcode of spssOpenRead, library object, file
        handle). Raises an Exception when the platform is not supported,
        when the dll cannot be loaded or when the file does not exist."""
        if not sys.platform.lower().startswith("win"):
            raise Exception("Platform '%s' is not supported: "
                            "spssio32.dll is only available under Windows."
                            % sys.platform)

        dllDir = os.path.abspath(".")
        try:
            # make the current directory searchable, but only add it once
            searchPath = os.environ.get("PATH", "")
            if dllDir not in searchPath.split(os.pathsep):
                os.environ["PATH"] = searchPath + os.pathsep + dllDir
            ctypes.cdll.LoadLibrary("spssio32.dll")
            spssio = ctypes.windll.spssio32
        except OSError:  # WindowsError is a subclass of OSError
            err = sys.exc_info()[1]
            msg = "Cannot find spssio32.dll in '%s'.\n" % dllDir + \
                  "Py file and Dll should live in the same directory [%s]." % err
            raise Exception(msg)

    ##    elif sys.platform.lower().startswith("linux"):
    ##        os.environ["PATH"] += ":" + os.path.abspath(".")
    ##        ctypes.cdll.LoadLibrary("libspssdio.so.1")
    ##        spssio = ctypes.CDLL("libspssdio.so")

        if not os.path.exists(savFileName):
            raise Exception("File '%s' does not exist!" % savFileName)

        # spssOpenRead writes the handle into the integer we pass by
        # reference; that handle is what all other functions expect.
        handle = ctypes.c_int()
        retcode = spssio.spssOpenRead(ctypes.c_char_p(savFileName),
                                      ctypes.byref(handle))
        return retcode, spssio, handle.value

    def getNumberofVariables(self, fh, spssio):
        """ This function reports the number of variables present in a data
        file. Returns a tuple (retcode, number of variables)."""
        retcode = spssio.spssGetNumberofVariables(fh, self.numVarsPtr)
        return retcode, self.numVars.value

    def getVarNameAndType(self, fh, spssio, iVar):
        """ Get name and type of the variable at (zero-based) position iVar.
        The variable type code is an integer in the range 0-32767, 0
        indicating a numeric variable and a positive value indicating a
        string variable of that size. Returns a tuple (name, type)."""
        varNameBuff = ctypes.create_string_buffer(65)
        varType = ctypes.c_int()
        spssio.spssGetVarInfo(fh, iVar, ctypes.byref(varNameBuff),
                              ctypes.byref(varType))
        return varNameBuff.value, varType.value

    def getVarInfo(self, fh, spssio):
        """ This function gets the names and types of all the variables
        present in a data file. Returns two lists (names, types) in file
        order."""
        numVars = self.getNumberofVariables(fh, spssio)[1]
        varNames, varTypes = [], []
        for iVar in range(numVars):
            varName, varType = self.getVarNameAndType(fh, spssio, iVar)
            varNames.append(varName)
            varTypes.append(varType)
        return varNames, varTypes

    def getNumberofCases(self, fh, spssio):
        """ This function reports the number of cases present in a data
        file. Returns a tuple (retcode, number of cases)."""
        retcode = spssio.spssGetNumberofCases(fh, self.numofCasesPtr)
        return retcode, self.nCases.value

    def getVarHandle(self, fh, spssio, variable):
        """This function returns a handle for a variable, which can then be
        used to read values of the variable. Returns a tuple (retcode,
        handle); the handle is a ctypes.c_double object so that it can be
        passed on to the library as-is."""
        self.varName = ctypes.c_char_p(variable)
        self.varHandle = ctypes.c_double()
        self.varHandlePtr = ctypes.byref(self.varHandle)
        retcode = spssio.spssGetVarHandle(fh, self.varName, self.varHandlePtr)
        return retcode, self.varHandle

    def getVarAttributes(self, fh, spssio, variable):
        """This function returns all the attributes for a single variable
        as a tuple (retcode, attribute names, attribute text, number of
        attributes)."""
        # NOTE(review): the library presumably hands back arrays of strings
        # here, in which case a plain c_char_p cannot represent the result;
        # confirm against the spssGetVarAttributes documentation.
        self.varName = ctypes.c_char_p(variable)
        retcode = spssio.spssGetVarAttributes(fh,
                                              self.varName,
                                              self.attribNamesPtr,
                                              self.attribTextPtr,
                                              self.nAttributesPtr)
        return retcode, self.attribNames.value, self.attribText.value, \
               self.nAttributes.value

    def getValueNumeric(self, fh, c_func, varHandle):
        """ This function gets the value of a numeric variable for the current
        case, which is the case read by the most recent call to
        spssReadCaseRecord. c_func is the library function to call
        (spssGetValueNumeric). Returns a tuple (retcode, value)."""
        retcode = c_func(fh, varHandle, self.numValuePtr)
        return retcode, self.numValue.value

    def getValueChar(self, fh, c_func, varHandle):
        """This function gets the value of a string variable for the current
        case, which is the case read by the most recent call to
        spssReadCaseRecord. c_func is the library function to call
        (spssGetValueChar). Returns a tuple (retcode, value)."""
        retcode = c_func(fh, varHandle, self.charValuePtr, self.valueSize)
        return retcode, self.charValue.value

    def getVarPrintFormat(self, fh, spssio, variable):
        """ This function reports the print format of a variable. Returns
        a tuple (retcode, format type, number of decimal places, field
        width). """
        self.varName = ctypes.c_char_p(variable)
        retcode = spssio.spssGetVarPrintFormat(fh,
                                self.varName,
                                self.printTypePtr,
                                self.printDecPtr,
                                self.printWidPtr)
        return retcode, self.printType.value, self.printDec.value, \
               self.printWid.value

    ##def getSystemSysmisVal(spssio):
    ##    return spssio.spssSysmisVal()

    def formatValue(self, fh, spssio, variable, value, printTypeLabel,
                    varWid, recodeSysmisTo):
        """ This function formats date fields to ISO dates (yyyy-mm-dd), plus
        some other date/time formats. The SPSS N format is formatted to a
        character value with leading zeroes. System-missing values are
        returned as recodeSysmisTo, values of any other format are returned
        unchanged."""
        if value <= self.CUT_OFF:
            # never feed the replacement value into the conversions below
            return recodeSysmisTo
        if printTypeLabel in self._DATE_FORMATS:
            return self.spss2strDate(value, self._DATE_FORMATS[printTypeLabel],
                                     recodeSysmisTo)
        if printTypeLabel == 'SPSS_FMT_N':
            # values arrive as floats; drop the '.0' before padding
            return ("%d" % value).zfill(varWid)
        return value

    def spss2strDate(self, spssDateValue, fmt, recodeSysmisTo):
        """ This function converts internal SPSS dates (number of seconds
        since midnight, Oct 14, 1582 (the beginning of the Gregorian calendar))
        to a human-readable format. Values that cannot be converted are
        returned as recodeSysmisTo."""
        try:
            theDate = self.gregorianEpoch + \
                      datetime.timedelta(seconds=spssDateValue)
            return theDate.strftime(fmt)
        except (TypeError, ValueError, OverflowError):
            return recodeSysmisTo

    def getFileReport(self, savFileName, varNames, varTypes, fmts, printDecs,
                      printWids, nCases):
        """ This function prints a report about basic file characteristics
        (file size, number of rows and columns, and name, format and type
        of every variable). """
        nBytes = os.path.getsize(savFileName)
        kb = float(nBytes) / 2**10
        mb = float(nBytes) / 2**20
        (fileSize, unit) = (mb, "MB") if mb > 1 else (kb, "kB")
        print("*" * 70)
        print("*File '%s' (%5.2f %s) has %s columns (variables) and %s rows (%s values)" %
              (savFileName, fileSize, unit, len(varNames), nCases, len(varNames) * nCases))
        print("*It contains the following variables:")
        for cnt, varName in enumerate(varNames):
            kind = "string" if varTypes[varName] > 0 else "numerical"
            print("%03d. %s (%s%d.%d - %s)"
                  % (cnt+1, varName, fmts[varName], printWids[varName],
                     printDecs[varName], kind))
        print("*" * 70)

    def readSavFile(self, returnHeader=True, recodeSysmisTo="", selectVars=None):
        """ This is the main function of this class. It is a generator, which
        returns one record of the spss data file at a time (optionally
        preceded by a record with the variable names).

        Raises an Exception (on the first iteration) when the file could
        not be opened, when selectVars is misspecified, or when reading a
        case fails."""

        if self.retcode != 0:
            reason = retcodes.get(self.retcode,
                                  "unknown error code (%d)" % self.retcode)
            raise Exception("Cannot read file '%s': %s" %
                            (self.savFileName, reason))

        if selectVars is not None:
            if not isinstance(selectVars, (list, tuple)):
                raise Exception("Variable names list misspecified. " +
                                "Must be 'None' or a list or tuple of existing variables")
            # every requested name must exist, otherwise the lookups in
            # the loop below would fail halfway through the file
            unknown = [name for name in selectVars if name not in self.varTypes]
            if unknown or not selectVars:
                raise Exception("Variable names list misspecified")
            self.varNames = selectVars

        if returnHeader:
            yield self.varNames

        # avoiding dots inside the loops
        # http://wiki.python.org/moin/PythonSpeed/PerformanceTips#Avoiding_dots...
        fh, spssio = self.fh, self.spssio
        readCaseRecord = spssio.spssReadCaseRecord
        spssGetValueNumeric = spssio.spssGetValueNumeric
        spssGetValueChar = spssio.spssGetValueChar
        getValueNumeric, getValueChar = self.getValueNumeric, self.getValueChar
        formatValue, sysmis = self.formatValue, self.CUT_OFF

        # per-variable properties do not change between cases
        columns = [(varName, self.varTypes[varName] == 0,
                    self.varHandles[varName], self.printTypeLabels[varName],
                    self.varWids[varName]) for varName in self.varNames]

        caseNo = 0
        while caseNo < self.nCases_:
            caseNo += 1
            retcode = readCaseRecord(fh)
            if retcode < 0:
                break  # warning (e.g. end of file): there is nothing left
            if retcode > 0:
                raise Exception("Error reading case %d: %s" %
                                (caseNo, retcodes.get(retcode, retcode)))
            record = []
            for varName, isNumeric, varHandle, fmtLabel, varWid in columns:
                if isNumeric:
                    rawValue = getValueNumeric(fh, spssGetValueNumeric,
                                               varHandle)[1]
                    if fmtLabel == 'SPSS_FMT_F':
                        # this is assumed to be the most common format, by far.
                        # there will be no need to call the (expensive) formatValue function
                        value = recodeSysmisTo if rawValue <= sysmis else rawValue
                    else:
                        value = formatValue(fh, spssio, varName, rawValue,
                                            fmtLabel, varWid, recodeSysmisTo)
                else:
                    value = getValueChar(fh, spssGetValueChar,
                                         varHandle)[1].rstrip()
                record.append(value)

            yield record




def calculateFrequency(sav):
    """ This function returns a frequency count for each variable in
    the spss data file.

    sav is an iterable of records whose first record holds the variable
    names. The result is a dictionary of the form
    {varName: {value: count}}; empty-string values are counted under
    the key "(missing)". Variables only appear in the result when there
    is at least one data record."""
    freqs = {}
    varNames = []
    for lino, line in enumerate(sav):
        if lino == 0:
            varNames = line
            continue
        # pair names and values by position; looking every name up with
        # list.index() is quadratic and wrong for repeated names
        for colNo, varName in enumerate(varNames):
            value = line[colNo]
            if value == "":
                value = "(missing)"
            counts = freqs.setdefault(varName, {})
            counts[value] = counts.get(value, 0) + 1
    return freqs

if __name__ == "__main__":

    # Demonstration of the SavReader class; adjust the paths below.
    help(SavReader)

    import csv

    ## ----- Get some basic file info
    savFileName = r"C:\Program Files\SPSS Evaluation\Employee data.sav"

    # The constructor already opens the file and collects the basic info,
    # so take it from the instance (calling readBasicSavFileInfo() again
    # would open the file a second time) and close the file afterwards.
    reader = SavReader(savFileName)
    try:
        numVars, nCases, varNames = \
                 reader.numVars_, reader.nCases_, reader.varNames
        varTypes, printTypesFile = reader.varTypes, reader.printTypesFile
        printTypeLabels, varWids = reader.printTypeLabels, reader.varWids
    finally:
        reader.spssio.spssCloseRead(reader.fh)

    ## ----- Typical use
    with SavReader(savFileName, recodeSysmisTo=999, selectVars=["educ"]) as sav:
        header = sav.next()
        for line in sav:
            pass # do stuff

    ## ----- Convert file to .csv
    csvFileName = "d:/temp/test.csv"
    with SavReader(savFileName) as sav:
        with open(csvFileName, "wb") as f:
            writer = csv.writer(f)
            for line in sav:
                writer.writerow(line)
            print("Done! Csv file written: %s" % f.name)

    ## ----- Run frequency counts
    def main(savFileName):
        """ Print a frequency table of two variables of the given file. """
        with SavReader(savFileName, selectVars=["educ", "gender"]) as sav:
            freqs = calculateFrequency(sav)
        for var, values in freqs.items():
            print("\n\n%s %s %s" % (10 * "*", var.upper(), 10 * "*"))
            for val, freq in values.items():
                print("%s -- %s" % (val, freq))
    main(savFileName)

# Diff to Previous Revision
#
# --- revision 1 2011-04-12 18:09:54
# +++ revision 2 2011-04-14 10:09:55
# @@ -240,7 +240,7 @@
#          fmts = dict([(varName, printTypeLabels[varName].split("_")[-1])
#                       for varName in varNames])
#          if self.verbose:
# -            self.getFileReport(savFileName, varNames, varTypes, fmts,
# +            self.getFileReport(self.savFileName, varNames, varTypes, fmts,
#                                 printDecs, printWids, nCases)
#
#          return retcode, self.spssio, self.fh, varHandles, numVars, nCases, varNames, \
#
# History