#!/usr/bin/env python
# -*- coding: cp1252 -*-

""" A Python interface to the IBM SPSS Statistics Input Output Module
(Windows: spssio32.dll)"""


# spssio32.dll and documentation can be downloaded here:
# https://www.ibm.com/developerworks/mydeveloperworks/wikis/home/wiki/We70df3195ec8_4f95_9773_42e448fa9029/page/Downloads%20for%20IBM%C2%AE%20SPSS%C2%AE%20Statistics?lang=en
# TO DO: make this work under Linux. When I tried the .so file,
# I had a hard time finding all the necessary dependencies.
# The .so file versions that are needed are rather old.
# NOTE: If you downloaded this previously, use the current version as it
#       is *MUCH* faster!!
# ANY FEEDBACK ON THIS CODE IS WELCOME: "@".join(["fomcl", "yahoo.com"])

from __future__ import with_statement # only Python 2.5
import sys
import os
import ctypes
import struct
import operator
import math
import locale
import datetime
# psyco is an optional JIT accelerator; the module works without it, so a
# missing install only produces a notice.
try:
    import psyco
    psyco.full()
except ImportError:
    print("NOTE. Psyco module not found. Install this module to increase program performance")

__author__ = 'Albert-Jan Roskam'
__version__ = '2.0.0'

# Maps the integer return codes of the SPSS I/O module to symbolic names.
# NOTE(review): several names listed for codes 1-15 (e.g. SPSS_EXC_LEN64,
# SPSS_FILE_END) look like *warning* names, which the I/O module header may
# define with negative values -- verify this table against spssdio.h.
retcodes = {
    0: 'SPSS_OK',
    1: 'SPSS_EXC_LEN64',
    2: 'SPSS_EXC_VARLABEL',
    3: 'SPSS_FILE_RERROR',
    4: 'SPSS_EXC_VALLABEL',
    5: 'SPSS_FILE_END',
    6: 'SPSS_NO_VARSETS',
    7: 'SPSS_EMPTY_VARSETS',
    8: 'SPSS_NO_LABELS',
    9: 'SPSS_NO_LABEL',
    10: 'SPSS_NO_CASEWGT',
    11: 'SPSS_NO_DATEINFO',
    12: 'SPSS_NO_MULTRESP',
    13: 'SPSS_EMPTY_MULTRESP',
    14: 'SPSS_NO_DEW',
    15: 'SPSS_EMPTY_DEW',
    16: 'SPSS_SHORTSTR_EXP',
    17: 'SPSS_INVALID_VARTYPE',
    18: 'SPSS_INVALID_MISSFOR',
    19: 'SPSS_INVALID_COMPSW',
    20: 'SPSS_INVALID_PRFOR',
    21: 'SPSS_INVALID_WRFOR',
    22: 'SPSS_INVALID_DATE',
    23: 'SPSS_INVALID_TIME',
    24: 'SPSS_NO_VARIABLES',
    25: 'SPSS_MIXED_TYPES',
    27: 'SPSS_DUP_VALUE',
    28: 'SPSS_INVALID_CASEWGT',
    29: 'SPSS_INCOMPATIBLE_DICT',
    30: 'SPSS_DICT_COMMIT',
    31: 'SPSS_DICT_NOTCOMMIT',
    33: 'SPSS_NO_TYPE2',
    41: 'SPSS_NO_TYPE73',
    45: 'SPSS_INVALID_DATEINFO',
    46: 'SPSS_NO_TYPE999',
    47: 'SPSS_EXC_STRVALUE',
    48: 'SPSS_CANNOT_FREE',
    49: 'SPSS_BUFFER_SHORT',
    50: 'SPSS_INVALID_CASE',
    51: 'SPSS_INTERNAL_VLABS',
    52: 'SPSS_INCOMPAT_APPEND',
    53: 'SPSS_INTERNAL_D_A',
    54: 'SPSS_FILE_BADTEMP',
    55: 'SPSS_DEW_NOFIRST',
    56: 'SPSS_INVALID_MEASURELEVEL',
    57: 'SPSS_INVALID_7SUBTYPE',
    58: 'SPSS_INVALID_VARHANDLE',
    59: 'SPSS_INVALID_ENCODING',
    60: 'SPSS_FILES_OPEN',
    70: 'SPSS_INVALID_MRSETDEF',
    71: 'SPSS_INVALID_MRSETNAME',
    72: 'SPSS_DUP_MRSETNAME',
    73: 'SPSS_BAD_EXTENSION',
    74: 'SPSS_INVALID_EXTENDEDSTRING',
    75: 'SPSS_INVALID_ATTRNAME',
    76: 'SPSS_INVALID_ATTRDEF',
    77: 'SPSS_INVALID_MRSETINDEX',
    78: 'SPSS_INVALID_VARSETDEF',
    79: 'SPSS_INVALID_ROLE',
}

# Maps SPSS print-format codes to (symbolic name, human-readable description).
# The symbolic name (element 0) is what the reader uses to decide how a value
# is formatted; the description is informational only.
printTypes = {
    1: ('SPSS_FMT_A', 'Alphanumeric'),
    2: ('SPSS_FMT_AHEX', 'Alphanumeric hexadecimal'),
    3: ('SPSS_FMT_COMMA', 'F Format with commas'),
    4: ('SPSS_FMT_DOLLAR', 'Commas and floating dollar sign'),
    5: ('SPSS_FMT_F', 'Default Numeric Format'),
    6: ('SPSS_FMT_IB', 'Integer binary'),
    7: ('SPSS_FMT_PIBHEX', 'Positive integer binary - hex'),
    8: ('SPSS_FMT_P', 'Packed decimal'),
    9: ('SPSS_FMT_PIB', 'Positive integer binary unsigned'),
    10: ('SPSS_FMT_PK', 'Positive integer binary unsigned'),
    11: ('SPSS_FMT_RB', 'Floating point binary'),
    12: ('SPSS_FMT_RBHEX', 'Floating point binary hex'),
    15: ('SPSS_FMT_Z', 'Zoned decimal'),
    16: ('SPSS_FMT_N', 'N Format- unsigned with leading 0s'),
    17: ('SPSS_FMT_E', 'E Format- with explicit power of 10'),
    20: ('SPSS_FMT_DATE', 'Date format dd-mmm-yyyy'),
    21: ('SPSS_FMT_TIME', 'Time format hh:mm:ss.s'),
    22: ('SPSS_FMT_DATE_TIME', 'Date and Time'),
    23: ('SPSS_FMT_ADATE', 'Date format dd-mmm-yyyy'),
    24: ('SPSS_FMT_JDATE', 'Julian date - yyyyddd'),
    25: ('SPSS_FMT_DTIME', 'Date-time dd hh:mm:ss.s'),
    26: ('SPSS_FMT_WKDAY', 'Day of the week'),
    27: ('SPSS_FMT_MONTH', 'Month'),
    28: ('SPSS_FMT_MOYR', 'mmm yyyy'),
    29: ('SPSS_FMT_QYR', 'q Q yyyy'),
    30: ('SPSS_FMT_WKYR', 'ww WK yyyy'),
    31: ('SPSS_FMT_PCT', 'Percent - F followed by %'),
    32: ('SPSS_FMT_DOT', 'Like COMMA, switching dot for comma'),
    33: ('SPSS_FMT_CCA', 'User Programmable currency format'),
    34: ('SPSS_FMT_CCB', 'User Programmable currency format'),
    35: ('SPSS_FMT_CCC', 'User Programmable currency format'),
    36: ('SPSS_FMT_CCD', 'User Programmable currency format'),
    37: ('SPSS_FMT_CCE', 'User Programmable currency format'),
    38: ('SPSS_FMT_EDATE','Date in dd/mm/yyyy style'),
    39: ('SPSS_FMT_SDATE', 'Date in yyyy/mm/dd style'),
}

class SavReader(object):
    """ Read Spss system files (.sav)

    Parameters:
    -savFileName: the file name of the spss data file
    -returnHeader: Boolean that indicates whether the first record should
        be a list of variable names (default is True)
    -recodeSysmisTo: indicates to which value missing values should
        be recoded (default = ""),
    -selectVars: indicates which variables in the file should be selected.
        The variables should be specified as a list or a tuple of
        valid variable names. If None is specified, all variables
        in the file are used (default = None)
    -verbose: Boolean that indicates whether information about the spss data file
        (e.g., number of cases, variable names, file size) should be printed on
        the screen (default = True).
    -rawMode: Boolean that indicates whether values should get SPSS-style formatting,
        and whether date variables (if present) should be converted to ISO-dates. If True,
        the program does not format any values, which increases processing speed.
        (default = False)
    -interfaceEncoding: indicates the mode in which text communicated to or from the
        I/O Module will be. Valid values are 'UTF-8' or 'CODEPAGE' (default = 'CODEPAGE')

    Typical use:
    savFileName = "d:/someFile.sav"
    with SavReader(savFileName) as sav:
        header = next(sav)
        for line in sav:
            process(line)
    """

    def __init__(self, savFileName, returnHeader=True, recodeSysmisTo="",
                 verbose=True, selectVars=None, rawMode=False, interfaceEncoding="CODEPAGE"):
        """ Constructor. Initializes all vars that can be recycled, loads the
        I/O module, opens the file and reads its dictionary information.
        Raises Exception when the file does not exist, the interface encoding
        is invalid or selectVars is misspecified. """

        self.savFileName = savFileName
        self.returnHeader = returnHeader
        self.recodeSysmisTo = recodeSysmisTo
        self.verbose = verbose
        self.selectVars = selectVars
        self.rawMode = rawMode

        self.gregorianEpoch = datetime.datetime(1582, 10, 14, 0, 0, 0)
        # Values at or below the largest negative double are treated as
        # system-missing (see formatRecord).
        self.CUT_OFF = -1 * sys.float_info[0]

        # ctypes output slots (and pointers to them) that are reused across
        # calls into the I/O module.
        self.numVars = ctypes.c_int()
        self.numVarsPtr = ctypes.byref(self.numVars)
        self.nCases = ctypes.c_long()
        self.numofCasesPtr = ctypes.byref(self.nCases)

        self.printType = ctypes.c_int()
        self.printDec = ctypes.c_int()
        self.printWid = ctypes.c_int()
        self.printTypePtr = ctypes.byref(self.printType)
        self.printDecPtr = ctypes.byref(self.printDec)
        self.printWidPtr = ctypes.byref(self.printWid)

        self.attribNames = ctypes.c_char_p()
        self.attribText = ctypes.c_char_p()
        self.nAttributes = ctypes.c_int()
        self.attribNamesPtr = ctypes.byref(self.attribNames)
        self.attribTextPtr = ctypes.byref(self.attribText)
        self.nAttributesPtr = ctypes.byref(self.nAttributes)

        self.numValue = ctypes.c_double()
        self.numValuePtr = ctypes.byref(self.numValue)
        self.assumedCharWid = 200 # hmmmm...
        self.charValue = ctypes.create_string_buffer(self.assumedCharWid)
        self.charValuePtr = ctypes.byref(self.charValue)
        self.valueSize = ctypes.c_int(self.assumedCharWid)

        self.interfaceEncodingIn = interfaceEncoding
        self.retcode, self.spssio, self.fh, self.numVars_, \
          self.nCases_, self.varNames, self.varTypes, self.printTypesFile, \
          self.printTypeLabels, self.varWids = self._readBasicSavFileInfo()
        self.interfaceEncoding, self.encoding = self.getInterfaceEncoding(self.spssio)
        self.header = self.getHeader(self.selectVars)

    def __enter__(self):
        """ This function returns the record generator for the opened spss
        data file (see readSavFile)."""
        return self.readSavFile(self.returnHeader, self.recodeSysmisTo,
                                self.selectVars, self.rawMode, self.encoding)

    def __exit__(self, type, value, tb):
        """ This function closes the spss data file. Exceptions raised inside
        the 'with' block are not suppressed."""
        self.close()

    def close(self):
        """ This function closes the spss data file and drops the reference
        to the I/O module. Calling it more than once is harmless, so it can
        also be used when the reader was created without a 'with' statement."""
        spssio = getattr(self, "spssio", None)
        if spssio is not None:
            spssio.spssCloseRead(self.fh)
            del self.spssio

    def _readBasicSavFileInfo(self):
        """ This function reads and returns some basic information of the open
        spss data file. It returns the following variables:
        retcode: the return code (0 means OK); the code of opening the file,
            or, if that succeeded, the first non-zero code returned while
            reading the print formats
        spssio: the spss i/o C module, loaded with ctypes
        fh: the file handle
        numVars: the number of variables in the spss data file
        nCases: the number of cases (records) in the spss data file
        varNames: a list of the var names in the spss data file
        varTypes: a dictionary with var names as keys and var types as values
        printTypesFile: a dictionary with var names as keys and print types as values
        printTypeLabels: a dictionary with var names as keys and print type labels as values
        varWids: a dictionary with var names as keys and (print) widths as values
        NOT FOR GENERAL USE; see getSavFileInfo
        """
        self.retcode, self.spssio, self.fh = self.loadSavFile(self.savFileName,
                                                              self.interfaceEncodingIn)
        # Start from the open status so that a failed open is never masked by
        # later calls (and so that a file without variables is handled too).
        retcode = self.retcode
        numVars = self.getNumberofVariables(self.fh, self.spssio)[1]
        nCases = self.getNumberofCases(self.fh, self.spssio)[1]
        varNames, varTypes_ = self.getVarInfo(self.fh, self.spssio)
        self.fileEncoding = self.getFileEncoding(self.fh)[1]
        self.fileCodePage = self.getFileCodePage(self.fh)[1]

        varTypes, printTypesFile, varWids, printDecs, \
                  printWids = {}, {}, {}, {}, {}
        for varName, varType in zip(varNames, varTypes_):
            varTypes[varName] = varType
            fmtRetcode, printType, printDec, printWid = \
                     self.getVarPrintFormat(self.fh, self.spssio, varName)
            if retcode == 0:
                retcode = fmtRetcode
            printTypesFile[varName] = printType
            varWids[varName] = printWid
            printDecs[varName] = printDec
            printWids[varName] = printWid

        printTypeLabels = dict((varName, printTypes[printType][0])
                               for varName, printType in printTypesFile.items())

        # Short format names for the report, e.g. 'SPSS_FMT_DATE_TIME' -> 'DATE_TIME'.
        fmts = dict((varName, printTypeLabels[varName].replace("SPSS_FMT_", "", 1))
                    for varName in varNames)
        if self.verbose:
            print(self.getFileReport(self.savFileName, varNames, varTypes, fmts,
                                     printDecs, printWids, nCases))

        return retcode, self.spssio, self.fh, numVars, nCases, varNames, \
               varTypes, printTypesFile, printTypeLabels, varWids

    def getSavFileInfo(self):
        """ This function returns some basic information of the open
        spss data file. Returns numVars, nCases, varNames, varTypes, printTypesFile,
        printTypeLabels, varWids. Suitable for use without context manager ('with' statement)
        See also _readBasicSavFileInfo method."""
        return self.numVars_, self.nCases_, self.varNames, self.varTypes, self.printTypesFile, \
          self.printTypeLabels, self.varWids

    def loadSavFile(self, savFileName, interfaceEncoding):
        """ This function loads the spss I/O file (.dll or .so file) and opens
        the spss data file for reading. Returns (retcode, spssio, fh), where fh
        is the handle that spssOpenRead assigned to the file.
        Raises NotImplementedError on unsupported platforms and Exception when
        the I/O module cannot be loaded or the file does not exist."""
        platform = sys.platform.lower()
        if platform.startswith("win"):
            try:
                os.environ["PATH"] += ";" + os.path.abspath(".")
                spssio = ctypes.windll.spssio32
                self.libc = ctypes.cdll.msvcrt
            except WindowsError as e:
                msg = "Cannot find spssio32.dll in '%s'.\n" % os.path.abspath(".") + \
                      "Py file and Dll should live in the same directory [%s]." % e
                raise Exception(msg)
        elif platform.startswith("linux"):
            # add library search path to LD_LIBRARY_PATH environment variable
            # Type this in the terminal **before** running the program:
            # LD_LIBRARY_PATH=/path/of/additional/sofiles
            # export LD_LIBRARY_PATH
            # also need libirc.so from intel-icc8-libs_8.0-1_i386 (?) but this is broken.
            # or perhaps intel-icc9-libs-9.0-025.i386 (?)
            path = os.path.abspath(".")
            os.environ["PATH"] += ":" + path
            # presumably pre-loaded so the I/O module can resolve its ICU
            # dependencies -- TODO confirm which of these are really needed
            ctypes.CDLL("libicuuc.so.32.0")
            ctypes.CDLL("libicudata.so.32")
            ctypes.CDLL("libicu.so.32.0") # ??
            spssio = ctypes.CDLL("%s/libspssdio.so.1" % path)
            self.libc = ctypes.CDLL("libc.so.6")
        else:
            msg = "Your platform ('%s') is not supported" % platform
            raise NotImplementedError(msg)

        self.setInterfaceEncoding(spssio, interfaceEncoding)

        if not os.path.exists(savFileName):
            raise Exception("File '%s' does not exist!" % savFileName)

        fileName = savFileName
        if not isinstance(fileName, bytes):
            # c_char_p needs a byte string (unicode file names, Python 3 str)
            fileName = fileName.encode(sys.getfilesystemencoding())
        # spssOpenRead writes the file handle into the int it is given; that
        # value (not a C FILE pointer) is what all other calls expect.
        handle = ctypes.c_int()
        retcode = spssio.spssOpenRead(ctypes.c_char_p(fileName), ctypes.byref(handle))
        return retcode, spssio, handle.value

    def getNumberofVariables(self, fh, spssio):
        """ This function reports the number of variables present in a data file.
        Returns (retcode, number of variables)."""
        retcode = spssio.spssGetNumberofVariables(fh, self.numVarsPtr)
        return retcode, self.numVars.value

    def getVarNameAndType(self, fh, spssio, iVar):
        """ Get variable name and type. The variable type code is an integer
        in the range 0-32767, 0 indicating a numeric variable and a positive
        value indicating a string variable of that size.
        Returns (variable name, variable type); the return code of the I/O
        module call is not checked."""
        varNameBuff = ctypes.create_string_buffer(65)
        varType = ctypes.c_int()
        spssio.spssGetVarInfo(fh, iVar, ctypes.byref(varNameBuff), ctypes.byref(varType))
        return varNameBuff.value, varType.value

    def getVarInfo(self, fh, spssio):
        """ This function gets the names and types of all the variables
        present in a data file. Returns (list of names, list of types), both
        in file order."""
        spssio.spssGetNumberofVariables(fh, self.numVarsPtr)
        varNames, varTypes = [], []
        for iVar in range(self.numVars.value):
            varName, varType = self.getVarNameAndType(fh, spssio, iVar)
            varNames.append(varName)
            varTypes.append(varType)
        return varNames, varTypes

    def getNumberofCases(self, fh, spssio):
        """ This function reports the number of cases present in a data file.
        Returns (retcode, number of cases)."""
        retcode = spssio.spssGetNumberofCases(fh, self.numofCasesPtr)
        return retcode, self.nCases.value

    def getVarPrintFormat(self, fh, spssio, variable):
        """ This function reports the print format of a variable. Returns
        (retcode, format type, number of decimal places, field width). """
        self.varName = ctypes.c_char_p(variable)
        retcode = spssio.spssGetVarPrintFormat(fh,
                                self.varName,
                                self.printTypePtr,
                                self.printDecPtr,
                                self.printWidPtr)
        return retcode, self.printType.value, self.printDec.value, \
               self.printWid.value

    def getSystemSysmisVal(self, spssio):
        """ This function returns the IBM SPSS Statistics system-missing
        value for the host system. Function not currently used."""
        # The C function returns a double; declaring it as a float made the
        # (very large negative) value overflow to -Inf.
        spssio.spssSysmisVal.restype = ctypes.c_double
        return spssio.spssSysmisVal()

    def formatValue(self, fh, spssio, variable, value, printTypeLabel,
                    varWid, recodeSysmisTo):
        """ This function formats date fields to ISO dates (yyyy-mm-dd), plus
        some other date/time formats. The SPSS N format is formatted to a
        character value with leading zeroes. Values with any other print
        format are returned unchanged. fh, spssio and variable are accepted
        for interface compatibility but are not used."""
        # NOTE(review): WKDAY and MONTH values are converted like date values
        # (seconds since the Gregorian epoch); SPSS may store them as 1-7 and
        # 1-12 instead -- verify against the SPSS documentation.
        supportedDates = {'SPSS_FMT_DATE':     '%Y-%m-%d',
                          'SPSS_FMT_JDATE':    '%Y-%m-%d',
                          'SPSS_FMT_EDATE':    '%Y-%m-%d',
                          'SPSS_FMT_SDATE':    '%Y-%m-%d',
                          'SPSS_FMT_DATE_TIME':'%Y-%m-%d %H:%M:%S',
                          'SPSS_FMT_ADATE':    '%Y-%m-%d',
                          'SPSS_FMT_WKDAY':    '%A',
                          'SPSS_FMT_MONTH':    '%B',
                          'SPSS_FMT_MOYR':     '%B %Y',
                          'SPSS_FMT_WKYR':     '%W WK %Y'}
        if printTypeLabel in supportedDates:
            fmt = supportedDates[printTypeLabel]
            return self.spss2strDate(value, fmt, recodeSysmisTo)
        if printTypeLabel == 'SPSS_FMT_N':
            if isinstance(value, float):
                if value <= -sys.float_info.max:
                    return recodeSysmisTo
                if value.is_integer():
                    # values arrive as doubles; drop the '.0' before padding
                    value = int(value)
            return str(value).zfill(varWid)
        return value

    def spss2strDate(self, spssDateValue, fmt, recodeSysmisTo):
        """ This function converts internal SPSS dates (number of seconds
        since midnight, Oct 14, 1582 (the beginning of the Gregorian calendar))
        to a human-readable format. Values that cannot be converted (e.g.
        system-missing or non-numeric values) yield recodeSysmisTo."""
        try:
            theDate = self.gregorianEpoch + datetime.timedelta(seconds=spssDateValue)
            return theDate.strftime(fmt)
        except (TypeError, ValueError, OverflowError):
            return recodeSysmisTo

    def encodeStringValues(self, record, encoding):
        """ This function re-encodes the byte string values in a record from
        the encoding of the SPSS data file to the given encoding. If a value
        cannot be represented in that encoding, UTF-8 is used instead.
        Non-string values are passed through. Returns a list."""
        encodedRecord = []
        for value in record:
            if isinstance(value, bytes):
                text = value.decode(self.fileEncoding, "replace")
                try:
                    value = text.encode(encoding)
                except UnicodeEncodeError:
                    value = text.encode("UTF-8")
            encodedRecord.append(value)
        return encodedRecord

    def formatRecord(self, record, recodeSysmisTo):
        """ This function formats the values in a record according to the
        formats given in the SPSS file dictionary. System-missing values are
        replaced by recodeSysmisTo, whatever their print format. The record
        must contain the variables of self.header, in that order.
        Returns a list."""
        formattedRecord = []
        # self.header (not self.varNames) matches the record when only a
        # selection of the variables is read.
        for rawValue, varName in zip(record, self.header):
            if isinstance(rawValue, (int, float)) and rawValue <= self.CUT_OFF:
                formattedRecord.append(recodeSysmisTo)
                continue
            printTypeLabel = self.printTypeLabels[varName]
            if printTypeLabel == 'SPSS_FMT_F':
                value = rawValue
            else:
                value = self.formatValue(self.fh, self.spssio, varName, rawValue,
                                         printTypeLabel, self.varWids[varName],
                                         recodeSysmisTo)
            formattedRecord.append(value)
        return formattedRecord

    def getFileEncoding(self, fh):
        """ This function obtains the encoding applicable to a file.
        The encoding is returned as an IANA encoding name, such as
        ISO-8859-1. Returns (retcode, encoding name)."""
        self.pszEncoding = ctypes.create_string_buffer(20) # is 20 enough??
        self.pszEncodingPtr = ctypes.byref(self.pszEncoding)
        retcode = self.spssio.spssGetFileEncoding(fh, self.pszEncodingPtr)
        return retcode, self.pszEncoding.value

    def getFileCodePage(self, fh):
        """ This function provides the Windows code page
        number of the encoding applicable to a file.
        Returns (retcode, code page number)."""
        self.nCodePage = ctypes.c_int()
        self.nCodePagePtr = ctypes.byref(self.nCodePage)
        retcode = self.spssio.spssGetFileCodePage(fh, self.nCodePagePtr)
        return retcode, self.nCodePage.value

    def setInterfaceEncoding(self, spssio, interfaceEncoding):
        """ This function sets the current interface encoding ('UTF-8' or
        'CODEPAGE', case-insensitive). Returns the return code of the I/O
        module; raises Exception for any other value."""
        self.icodes = {"UTF-8": 0, "CODEPAGE": 1}
        interfaceEncoding = interfaceEncoding.upper()
        if interfaceEncoding not in self.icodes:
            msg = "Invalid interface encoding ('%s'), valid values are 'UTF-8' or 'CODEPAGE'" % \
                  interfaceEncoding
            raise Exception(msg)
        retcode = spssio.spssSetInterfaceEncoding(ctypes.c_int(self.icodes[interfaceEncoding]))
        return retcode

    def getInterfaceEncoding(self, spssio):
        """ This function returns the current interface encoding
        ('UTF-8' or 'CODEPAGE') and the specific current codepage (e.g. cp1252).
        Requires that setInterfaceEncoding has been called before."""
        swapped = dict((code, name) for name, code in self.icodes.items())
        interfaceEncoding = swapped[spssio.spssGetInterfaceEncoding()]
        if interfaceEncoding == "CODEPAGE":
            encoding = locale.getpreferredencoding()
        else:
            encoding = "UTF-8"
        return interfaceEncoding, encoding

    def getFileReport(self, savFileName, varNames, varTypes, fmts, printDecs,
                      printWids, nCases):
        """ This function returns (as one string) a report about basic file
        characteristics: file size, dimensions, encoding, locale and the
        name, format and type of each variable. """
        nBytes = os.path.getsize(savFileName)
        kb = float(nBytes) / 2**10
        mb = float(nBytes) / 2**20
        (fileSize, label) = (mb, "MB") if mb > 1 else (kb, "kB")
        loc, cp = locale.getlocale()
        report = [os.linesep + "*" * 70,
                  "*File '%s' (%5.2f %s) has %s columns (variables) and %s rows (%s values)" % \
                  (savFileName, fileSize, label, len(varNames), nCases, len(varNames) * nCases),
                  "*The file encoding is: %s (Code Page: %s)" % (self.fileEncoding, self.fileCodePage),
                  "*Your computer's locale is: %s (Code page: %s)" % (loc, cp),
                  "*The file contains the following variables:"]
        for cnt, varName in enumerate(varNames):
            label = "string" if varTypes[varName] > 0 else "numerical"
            report.append("%03d. %s (%s%d.%d - %s)" % (cnt+1, varName, fmts[varName],
                                                      printWids[varName], printDecs[varName], label))
        report.append("*" * 70)
        return os.linesep.join(report)

    def conversionFormatCtoPy(self, varNames, varTypes):
        """ This function generates a struct format string for the conversion
        between C and Python values. SPSS data files are assumed to have either
        8-byte doubles/floats or n-byte chars[]/strings, where n is always
        8 bytes or a multiple thereof."""
        if sys.byteorder == "little":
            endianness = "<"
        elif sys.byteorder == "big":
            endianness = ">"
        else:
            endianness = "@"
        parts = [endianness]
        for v in varNames:
            if varTypes[v] == 0:
                parts.append("d")
            else:
                # string widths are rounded up to the next multiple of 8
                parts.append("%ds" % int(math.ceil(varTypes[v] / 8.0) * 8))
        return "".join(parts)

    def getCaseBuffer(self):
        """ This function returns a buffer and a pointer to that buffer. A whole
        case will be read into this buffer."""
        self.caseSize = ctypes.c_long()
        self.caseSizePtr = ctypes.byref(self.caseSize)
        self.retcode = self.spssio.spssGetCaseSize(self.fh, self.caseSizePtr)
        self.caseBuffer = ctypes.create_string_buffer(self.caseSize.value)
        self.caseBufferPtr = ctypes.byref(self.caseBuffer)
        return self.caseBuffer, self.caseBufferPtr

    def getHeader(self, selectVars):
        """ This function returns the names of the variables that are read:
        all variables if selectVars is None, else the selected variables in
        file order. As a side effect it stores self.selector, which picks the
        selected values out of a full record. Raises Exception if selectVars
        is empty, is not a list/tuple, or contains unknown variable names."""
        misspecified = "Variable names list misspecified. " + \
                       "Must be 'None' or a list or tuple of existing variables"
        if selectVars is None:
            return self.varNames
        if not isinstance(selectVars, (list, tuple)):
            raise Exception(misspecified)
        diff = set(selectVars).difference(set(self.varNames))
        if diff:
            msg = "Variable names misspecified ('%s')" % ", ".join(sorted(diff))
            raise Exception(msg)
        varPos = [pos for pos, v in enumerate(self.varNames) if v in selectVars]
        if not varPos:
            # itemgetter() needs at least one position
            raise Exception(misspecified)
        self.selector = operator.itemgetter(*varPos)
        header = self.selector(self.varNames)
        # itemgetter returns a bare item (not a tuple) for a single position
        header = [header] if not isinstance(header, tuple) else header
        return header

    def readSavFile(self, returnHeader, recodeSysmisTo, selectVars, rawMode, encoding):
        """ This is the main function of this class. It is a generator, which
        returns one record of the spss data file at a time, optionally
        preceded by the header. Records that cannot be read are reported and
        skipped. Raises Exception (on first iteration) if the file was not
        opened or read successfully during initialization. """
        if self.retcode != 0:
            try:
                reason = retcodes[self.retcode]
            except KeyError:
                reason = "Unknown error code (%d)" % self.retcode
            raise Exception("Error reading '%s': %s" % (self.savFileName, reason))

        if returnHeader:
            yield self.header

        containsStringvars = any(self.varTypes[varName] > 0 for varName in self.header)
        self.caseBuffer, self.caseBufferPtr = self.getCaseBuffer()
        structFmt = self.conversionFormatCtoPy(self.varNames, self.varTypes)
        # avoiding dots inside the loops
        # http://wiki.python.org/moin/PythonSpeed/PerformanceTips#Avoiding_dots...
        unpack = struct.unpack
        wholeCaseIn = self.spssio.spssWholeCaseIn
        write = sys.stdout.write
        write("Pct progress ... ")
        for case in range(self.nCases_):
            retcode = wholeCaseIn(self.fh, self.caseBufferPtr)
            if retcode != 0:
                print("WARNING: Record %s is faulty" % (case + 1))
                continue
            record = unpack(structFmt, self.caseBuffer.raw)
            if selectVars is not None:
                record = self.selector(record)
                record = [record] if not isinstance(record, tuple) else record
            if containsStringvars:
                record = self.encodeStringValues(record, encoding)
            if not rawMode:
                record = self.formatRecord(record, recodeSysmisTo)
            pctProgress = (float(case) / self.nCases_) * 100
            if pctProgress % 5 == 0:
                write("%2.1f%%... " % pctProgress)
            yield record

def calculateFrequency(sav):
    """ This function returns a frequency count for each variable in
    the spss data file.

    sav: an iterable of records whose first record is the list of variable
        names (e.g. a SavReader opened with returnHeader=True).
    Returns a dictionary {varName: {value: count}}; empty string values are
    counted under "(missing)". Variables only get an entry once at least one
    data record has been seen."""
    freqs = {}
    varNames = []
    for lino, line in enumerate(sav):
        if lino == 0:
            varNames = line
            continue
        # pair names with values positionally instead of searching the
        # header for every single cell
        for varName, value in zip(varNames, line):
            if value == "":
                value = "(missing)"
            counts = freqs.setdefault(varName, {})
            counts[value] = counts.get(value, 0) + 1
    return freqs

if __name__ == "__main__":
    # Demonstration of the module; requires the SPSS sample file below and
    # the SPSS I/O module to be available.

    help(SavReader)

    import csv
    import tempfile

    ## ----- Get some basic file info
    savFileName = r"C:\Program Files\IBM\SPSS\Statistics\19\Samples\English\Employee data.sav"
    numVars, nCases, varNames, varTypes, printTypesFile, printTypeLabels, varWids = \
             SavReader(savFileName).getSavFileInfo()


    ## ----- Typical use
    with SavReader(savFileName, selectVars=['id'], recodeSysmisTo=999) as sav:
        header = next(sav)
        for line in sav:
            pass # do stuff

    ## ----- Convert file to .csv
    # The csv file is written to the temp directory, because the directory
    # of the sample file is usually not writable.
    csvFileName = os.path.join(tempfile.gettempdir(), "Employee data.csv")
    with SavReader(savFileName, selectVars=None, verbose=True,
                   rawMode=False, interfaceEncoding="UTF-8") as sav:
        with open(csvFileName, "wb") as f:
            writer = csv.writer(f)
            for line in sav:
                writer.writerow(line)
            print("Done! Csv file written: %s" % f.name)

    ## ----- Run frequency counts
    def main(savFileName=savFileName):
        with SavReader(savFileName) as sav:
            freqs = calculateFrequency(sav)
        for var, values in freqs.items():
            print("\n\n %s %s %s" % (10 * "*", var.upper(), 10 * "*"))
            for val, freq in values.items():
                print("%s -- %s" % (val, freq))
    #main(savFileName)

Diff to Previous Revision

--- revision 2 2011-04-14 10:09:55
+++ revision 3 2011-07-25 18:59:49
@@ -9,19 +9,27 @@
 
# TO DO: make this work under Linux. When I tried the .so file,
 
# I had a hard time finding all the necessary dependencies.
 
# The .so file versions that are needed are rather old.
-# NOTE: This is rather slow in processing huge files.
+# NOTE: If you downloaded this previously, use the current version as it
+#       is *MUCH* faster!!
 
# ANY FEEDBACK ON THIS CODE IS WELCOME: "@".join(["fomcl", "yahoo.com"])
 
 
from __future__ import with_statement # only Python 2.5
-import sys, os, ctypes, datetime
+import sys
+import os
+import ctypes
+import struct
+import operator
+import math
+import locale
+import datetime
 
try:
     
import psyco
     psyco
.full()
 
except ImportError:
-    pass
+    print "NOTE. Psyco module not found. Install this module to increase program performance"
 
 __author__  
=  'Albert-Jan Roskam'
-__version__ =  '1.0.0'
+__version__ =  '2.0.0'
 
 retcodes
=    {0: 'SPSS_OK',
               
1: 'SPSS_EXC_LEN64',
@@ -131,10 +139,16 @@
     
-selectVars: indicates which variables in the file should be selected.
         
The variables should be specified as a list or a tuple of
         valid variable names
. If None is specified, all variables
-        in the file are used (default is None)
+        in the file are used (default = None)
     
-verbose: Boolean that indicates whether information about the spss data file
         
(e.g., number of cases, variable names, file size) should be printed on
-        the screen (default: True).            
+        the screen (default = True).
+    -rawMode: Boolean that indicates whether values should get SPSS-style formatting,
+        and whether date variables (if present) should be converted to ISO-dates. If True.
+        the program does not format any values, which increases processing speed.
+        (default = False)
+    -interfaceEncoding: indicates the mode in which text communicated to or from the
+        I/O Module will be. Valid values are 'UTF-8' or 'CODEPAGE' (default = 'CODEPAGE')
 
     
Typical use:
     savFileName
= "d:/someFile.sav"
@@ -145,21 +159,22 @@
     
"""
 
     def __init__(self, savFileName, returnHeader=True, recodeSysmisTo="",
-                 selectVars=None, verbose=True):
+                 verbose=True, selectVars=None, rawMode=False, interfaceEncoding="
CODEPAGE"):
         """
Constructor. Initializes all vars that can be recycled """
 
         self.savFileName = savFileName
         self.returnHeader = returnHeader
         self.recodeSysmisTo = recodeSysmisTo
+        self.verbose = verbose
         self.selectVars = selectVars
-        self.verbose = verbose
+        self.rawMode = rawMode
                                           
-        self.CUT_OFF = 10**-10 # values below cut-off are considered sysmis    
         self.gregorianEpoch = datetime.datetime(1582, 10, 14, 0, 0, 0)
+        #self.CUT_OFF = self.getSystemSysmisVal(self.spssio)
+        self.CUT_OFF = -1 * sys.float_info[0]
 
         self.numVars = ctypes.c_int()
         self.numVarsPtr = ctypes.byref(self.numVars)
-
         self.nCases = ctypes.c_long()
         self.numofCasesPtr = ctypes.byref(self.nCases)
         
@@ -179,33 +194,36 @@
 
         self.numValue = ctypes.c_double()
         self.numValuePtr = ctypes.byref(self.numValue)
-        self.assumedCharWid = 200
+        self.assumedCharWid = 200 # hmmmm...
         self.charValue = ctypes.create_string_buffer(self.assumedCharWid)
         self.charValuePtr = ctypes.byref(self.charValue)
         self.valueSize = ctypes.c_int(self.assumedCharWid)
 
-        self.retcode, self.spssio, self.fh, self.varHandles, self.numVars_, \
+        self.interfaceEncodingIn = interfaceEncoding        
+        self.retcode, self.spssio, self.fh, self.numVars_, \
           self.nCases_, self.varNames, self.varTypes, self.printTypesFile, \
-          self.printTypeLabels, self.varWids = self.readBasicSavFileInfo()
+          self.printTypeLabels, self.varWids = self._readBasicSavFileInfo()
+        self.interfaceEncoding, self.encoding = self.getInterfaceEncoding(self.spssio)
+        self.header = self.getHeader(self.selectVars)
 
     def __enter__(self):
         """
This function opens the spss data file."""
         return self.readSavFile(self.returnHeader, self.recodeSysmisTo,
-                                self.selectVars)
+                                self.selectVars, self.rawMode, self.encoding)
 
     def __exit__(self, type, value, tb):
         """
This function closes the spss data file."""
         if type is not None:
             pass # Exception occurred
         self.spssio.spssCloseRead(self.fh)
+        del self.spssio
         
-    def readBasicSavFileInfo(self):
+    def _readBasicSavFileInfo(self):
         """
This function reads and returns some basic information of the open
         spss data file
. It returns the following variables:
         retcode
: the return code (0 means OK)
         spssio
: the spss i/o C module, opened with ctypes.windll.spssio32
         fh
: the file handle
-        varHandles: a dictionary with var names as keys and var handles as values
         numVars
: the number of variables in the spss data file
         nCases
: the number of cases (records) in the spss data file
         varNames
: a list of the var names  in the spss data file
@@ -213,23 +231,24 @@
         printTypesFile
: a dictionary with var names as keys and print types as values
         printTypeLabels
: a dictionary with var names as keys and print type labels as values
         varWids
: : a dictionary with var names as keys and var widths as values
+        NOT FOR GENERAL USE; see getSavFileInfo
         
"""
-        self.retcode, self.spssio, self.fh = self.loadSavFile(self.savFileName)
+        self.retcode, self.spssio, self.fh = self.loadSavFile(self.savFileName,
+                                                              self.interfaceEncodingIn)
         numVars = self.getNumberofVariables(self.fh, self.spssio)[1]
         nCases = self.getNumberofCases(self.fh, self.spssio)[1]
         varNames, varTypes_ = self.getVarInfo(self.fh, self.spssio)
-
-        varTypes, printTypesFile, varWids, varHandles, printDecs, \
-                  printWids = {}, {}, {}, {}, {}, {}
+        self.fileEncoding = self.getFileEncoding(self.fh)[1]
+        self.fileCodePage = self.getFileCodePage(self.fh)[1]
+        
+        varTypes, printTypesFile, varWids, printDecs, \
+                  printWids = {}, {}, {}, {}, {}
         for i, varName in enumerate(varNames):
             varTypes[varName] = varTypes_[i]
             retcode, printType, printDec, printWid = \
-                     self.getVarPrintFormat(self.fh, self.spssio,
-                                            variable=varName)
+                     self.getVarPrintFormat(self.fh, self.spssio, varName)
             printTypesFile[varName] = printType
             varWids[varName] = printWid
-            varHandles[varName] = self.getVarHandle(self.fh, self.spssio,
-                                                    variable=varName)[1]
             printDecs[varName] = printDec
             printWids[varName] = printWid
             
@@ -240,40 +259,62 @@
         fmts = dict([(varName, printTypeLabels[varName].split("
_")[-1])
                      for varName in varNames])
         if self.verbose:
-            self.getFileReport(self.savFileName, varNames, varTypes, fmts,
+            print self.getFileReport(self.savFileName, varNames, varTypes, fmts,
                                printDecs, printWids, nCases)
 
-        return retcode, self.spssio, self.fh, varHandles, numVars, nCases, varNames, \
+        return retcode, self.spssio, self.fh, numVars, nCases, varNames, \
                varTypes, printTypesFile, printTypeLabels, varWids
-    
-    def loadSavFile(self, savFileName):
+
+    def getSavFileInfo(self):
+        """
This function reads and returns some basic information of the open
+        spss data file. Returns numVars, nCases, varNames, varTypes, printTypesFile,
+        printTypeLabels, varWids. Suitable for use without context manager ('with' statement)
+        See also _readBasicSavFileInfo method."""
+        return self.numVars_, self.nCases_, self.varNames, self.varTypes, self.printTypesFile, \
+          self.printTypeLabels, self.varWids
+        
+    def loadSavFile(self, savFileName, interfaceEncoding):
         """
This function loads the spss I/O file (.dll or .so file) and opens
         the spss data file
for reading."""
-        if sys.platform.lower().startswith("
win"):
+        platform = sys.platform.lower()
+        if platform.startswith("
win"):
             try:
                 os.environ["
PATH"] += ";" + os.path.abspath(".")
-                ctypes.cdll.LoadLibrary("
spssio32.dll")
                 spssio = ctypes.windll.spssio32
-                libc = ctypes.cdll.msvcrt
+                self.libc = ctypes.cdll.msvcrt
+                fopen = self.libc._fdopen # libc.fopen() won't work on windows
             except WindowsError, e:
                 msg = "
Cannot find spssio32.dll in '%s'.\n" % os.path.abspath(".") + \
                       "
Py file and Dll should live in the same directory [%s]." % e
                 raise Exception, msg
-                
-    ##    elif sys.platform.lower().startswith("
linux"):
-    ##        os.environ["
PATH"] += ":" + os.path.abspath(".")
-    ##        ctypes.cdll.LoadLibrary("
libspssdio.so.1")
-    ##        spssio = ctypes.CDLL("
libspssdio.so")
-    ##        ctypes.cdll.LoadLibrary("
libc.so.6")
-    ##        libc = ctypes.CDLL("
libc.so.6")
-
+        elif platform.startswith("
linux"):
+            # add library search path to LD_LIBRARY_PATH environment variable
+            # Type this in the terminal **before** running the program:
+            # LD_LIBRARY_PATH=/path/of/additional/sofiles
+            # export LD_LIBRARY_PATH
+            # also need libirc.so from intel-icc8-libs_8.0-1_i386 (?) but this is broken.
+            # or perhaps intel-icc9-libs-9.0-025.i386 (?)
+            path = os.path.abspath("
.")
+            os.environ["
PATH"] += ":" + path
+            libicuuc = ctypes.CDLL("
libicuuc.so.32.0")
+            libicudata = ctypes.CDLL("
libicudata.so.32")
+            libicu32 = ctypes.CDLL("
libicu.so.32.0") # ??
+            spssio = ctypes.CDLL("
%s/libspssdio.so.1" % path)
+            self.libc = ctypes.CDLL("
libc.so.6")
+            fopen = self.libc.fopen
+        else:
+            msg = "
Your platform ('%s') is not supported" % platform
+            raise NotImplementedError, msg
+
+        self.setInterfaceEncoding(spssio, interfaceEncoding)
+        
         if os.path.exists(self.savFileName):
-            fh = libc._fdopen(self.savFileName, "
rb") # fopen() on linux
+            fh = fopen(self.savFileName, "
rb")
             fhPtr = ctypes.byref(ctypes.c_int(fh))
             retcode = spssio.spssOpenRead(ctypes.c_char_p(self.savFileName), fhPtr)
             return retcode, spssio, fh
         else:
-            raise Exception, "
File '%s' does not exist!" % fn
+            raise Exception, "
File '%s' does not exist!" % self.savFileName
             
     def getNumberofVariables(self, fh, spssio):
         """
This function reports the number of variables present in a data file."""
@@ -282,7 +323,7 @@
 
     def getVarNameAndType(self, fh, spssio, iVar):
         """
Get variable name and type. The variable type code is an integer
-        in the range 0–32767, 0 indicating a numeric variable and a positive
+        in the range 0-32767, 0 indicating a numeric variable and a positive
         value indicating a
string variable of that size."""
         varNameBuff = ctypes.create_string_buffer(65)
         varNamePtr = ctypes.byref(varNameBuff)
@@ -306,39 +347,6 @@
         """
This function reports the number of cases present in a data file"""
         retcode = spssio.spssGetNumberofCases(fh, self.numofCasesPtr)
         return retcode, self.nCases.value
-
-    def getVarHandle(self, fh, spssio, variable):
-        """
This function returns a handle for a variable, which can then be
-        used to read values of the variable."""
-        self.varName = ctypes.c_char_p(variable)
-        self.varHandle = ctypes.c_double()
-        self.varHandlePtr = ctypes.byref(self.varHandle)
-        retcode = spssio.spssGetVarHandle(fh, self.varName, self.varHandlePtr)
-        return retcode, self.varHandle
-
-    def getVarAttributes(self, fh, spssio, variable):
-        """
This function returns all the attributes for a single variable."""
-        retcode = spssio.spssGetVarAttributes(fh,
-                                              self.varName,
-                                              self.attribNamesPtr,
-                                              self.attribTextPtr,
-                                              self.nAttributesPtr)
-        return retcode, self.attribNames.value, self.attribText.value, \
-               self.nAttributes.value
-
-    def getValueNumeric(self, fh, c_func, varHandle):
-        """
This function gets the value of a numeric variable for the current
-        case, which is the case read by the most recent call to
-        spssReadCaseRecord."""
-        retcode = c_func(fh, varHandle, self.numValuePtr)
-        return retcode, self.numValue.value
-
-    def getValueChar(self, fh, c_func, varHandle):
-        """
This function gets the value of a string variable for the current
-        case, which is the case read by the most recent call to
-        spssReadCaseRecord."""
-        retcode = c_func(fh, varHandle, self.charValuePtr, self.valueSize)
-        return retcode, self.charValue.value
 
     def getVarPrintFormat(self, fh, spssio, variable):
         """
This function reports the print format of a variable. Format
@@ -352,8 +360,12 @@
         
return retcode, self.printType.value, self.printDec.value, \
               
self.printWid.value
 
-    ##def getSystemSysmisVal(spssio):
-    ##    return spssio.spssSysmisVal()
+    def getSystemSysmisVal(self, spssio):
+        """This function returns the IBM SPSS Statistics system-missing
+        value for the host system."""

+        # returns Inf. Function not currently used.
+        spssio.spssSysmisVal.restype = ctypes.c_float
+        return spssio.spssSysmisVal()
 
     
def formatValue(self, fh, spssio, variable, value, printTypeLabel,
                     varWid
, recodeSysmisTo):
@@ -371,7 +383,6 @@
                           
'SPSS_FMT_MONTH':    '%B',
                           
'SPSS_FMT_MOYR':     '%B %Y',
                           
'SPSS_FMT_WKYR':     '%W WK %Y'}
-        value = recodeSysmisTo if value < self.CUT_OFF else value
         
if printTypeLabel in supportedDates:
             fmt
= supportedDates[printTypeLabel]
             
return self.spss2strDate(value, fmt, recodeSysmisTo)
@@ -395,6 +406,68 @@
         
except OverflowError:
             
return recodeSysmisTo
 
+    def encodeStringValues(self, record, encoding):
+        """ This function encodes string values in a record in the encoding
+        of the SPSS data file. """

+        encodedRecord = []
+        for value in record:
+            if isinstance(value, str):
+                try:
+                    value = value.decode(self.fileEncoding, "replace").encode(encoding)
+                except UnicodeEncodeError:
+                    value = value.decode(self.fileEncoding, "replace").encode("UTF-8")
+            encodedRecord.append(value)
+        return encodedRecord
+
+    def formatRecord(self, record, recodeSysmisTo):
+        """ This function formats the values in a record according to the
+        formats given in the SPSS file dictionary."""

+        formattedRecord = []
+        for rawValue, varName in zip(record, self.varNames):
+            value = recodeSysmisTo if rawValue <= self.CUT_OFF else rawValue
+            if self.printTypeLabels[varName] != 'SPSS_FMT_F':
+                value = self.formatValue(self.fh, self.spssio, varName, rawValue,
+                                         self.printTypeLabels[varName],
+                                         self.varWids[varName], recodeSysmisTo)
+            formattedRecord.append(value)
+        return formattedRecord
+    
+    def getFileEncoding(self, fh):
+        """This function obtains the encoding applicable to a file.
+        The encoding is returned as an IANA encoding name, such as
+        ISO-8859-1. """

+        self.pszEncoding = ctypes.create_string_buffer(20) # is 20 enough??
+        self.pszEncodingPtr = ctypes.byref(self.pszEncoding)
+        retcode = self.spssio.spssGetFileEncoding(self.fh, self.pszEncodingPtr)
+        return retcode, self.pszEncoding.value
+
+    def getFileCodePage(self, fh):
+        """This function provides the Windows code page
+        number of the encoding applicable to a file."""

+        self.nCodePage = ctypes.c_int()
+        self.nCodePagePtr = ctypes.byref(self.nCodePage)
+        retcode = self.spssio.spssGetFileCodePage(self.fh, self.nCodePagePtr)
+        return retcode, self.nCodePage.value
+
+    def setInterfaceEncoding(self, spssio, interfaceEncoding):
+        """This function sets the current interface encoding."""
+        self.icodes = {"UTF-8": 0, "CODEPAGE": 1}
+        interfaceEncoding = interfaceEncoding.upper()
+        if interfaceEncoding not in self.icodes.keys():
+            msg = "Invalid interface encoding ('%s'), valid values are 'UTF-8' or 'CODEPAGE'" % \
+                  interfaceEncoding
+            raise Exception, msg
+        retcode = spssio.spssSetInterfaceEncoding(ctypes.c_int(self.icodes[interfaceEncoding]))
+        return retcode
+    
+    def getInterfaceEncoding(self, spssio):
+        """This function returns the current interface encoding.
+        ('UTF-8' or 'CODEPAGE') and the specific current codepage (e.g. cp1252)"""

+        swapped = dict(zip(self.icodes.values(), self.icodes.keys()))
+        interfaceEncoding = swapped[spssio.spssGetInterfaceEncoding()]
+        encoding = locale.getpreferredencoding()if interfaceEncoding == "CODEPAGE" else "UTF-8"
+        return interfaceEncoding, encoding
+
     
def getFileReport(self, savFileName, varNames, varTypes, fmts, printDecs,
                       printWids
, nCases):
         
""" This function prints a report about basic file characteristics """
@@ -402,76 +475,114 @@
         kb
= float(bytes) / 2**10
         mb
= float(bytes) / 2**20
         
(fileSize, label) = (mb, "MB") if mb > 1 else (kb, "kB")
-        print "*" * 70
-        print "*File '%s' (%5.2f %s) has %s columns (variables) and %s rows (%s values)" % \
-              (savFileName, fileSize, label, len(varNames), nCases, len(varNames) * nCases)
-        print "*It contains the following variables:"
+        line1 = [os.linesep + "*" * 70]
+        line2 = ["*File '%s' (%5.2f %s) has %s columns (variables) and %s rows (%s values)" % \
+              (savFileName, fileSize, label, len(varNames), nCases, len(varNames) * nCases)]
+        line3 = ["*The file encoding is: %s (Code Page: %s)" % (self.fileEncoding, self.fileCodePage)]
+        loc, cp = locale.getlocale()
+        line4 = ["*Your computer's locale is: %s (Code page: %s)" % (loc, cp)]
+        line5 = ["*The file contains the following variables:"]
+        lines = []
         
for cnt, varName in enumerate(varNames):
             label
= "string" if varTypes[varName] > 0 else "numerical"
-            print "%03d. %s (%s%d.%d - %s)" \
-                  % (cnt+1, varName, fmts[varName], printWids[varName], printDecs[varName], label)
-        print "*" * 70
-
-    def readSavFile(self, returnHeader=True, recodeSysmisTo="", selectVars=None):
+            lines.append("%03d. %s (%s%d.%d - %s)" % (cnt+1, varName, fmts[varName], \
+                                                      printWids[varName], printDecs[varName], label))
+        lineN = ["*" * 70]
+        report = os.linesep.join(line1 + line2 + line3 + line4 + line5 + lines + lineN)
+        return report
+
+    def conversionFormatCtoPy(self, varNames, varTypes):
+        """ This function generates a struct format string for the conversion
+        between C and Python values. SPSS data files are assumed to have either
+        8-byte doubles/floats or n-byte chars[]/strings, where n is always
+        8 bytes or a multiple thereof."""

+        structFmt = ""
+        if sys.byteorder == "little":
+            endianness = "<"
+        elif sys.byteorder == "big":
+            endianness = ">"
+        else:
+            endianness = "@"
+        structFmt += endianness
+        for v in varNames:
+            if varTypes[v] == 0:
+                structFmt += "d"
+            else:
+                fmt = str(int(math.ceil(varTypes[v] / 8.0) * 8))
+                structFmt += fmt + "s"
+        return structFmt
+
+    def getCaseBuffer(self):
+        """ This function returns a buffer and a pointer to that buffer. A whole
+        case will be read into this buffer."""

+        self.caseSize = ctypes.c_long()
+        self.caseSizePtr = ctypes.byref(self.caseSize)
+        self.retcode = self.spssio.spssGetCaseSize(self.fh, self.caseSizePtr)
+        self.caseBuffer = ctypes.create_string_buffer(self.caseSize.value)
+        self.caseBufferPtr = ctypes.byref(self.caseBuffer)
+        return self.caseBuffer, self.caseBufferPtr
+
+    def getHeader(self, selectVars):
+        if selectVars is None:
+            header = self.varNames
+        elif isinstance(selectVars, (list, tuple)):
+            diff = set(selectVars).difference(set(self.varNames))
+            if diff:
+                msg = "Variable names misspecified ('%s')" % ", ".join(diff)
+                raise Exception, msg
+            varPos = [self.varNames.index(v) for v in self.varNames if v in selectVars]
+            self.selector = operator.itemgetter(*varPos)
+            header = self.selector(self.varNames)
+            header = [header] if not isinstance(header, tuple) else header
+        else:
+            raise Exception, "Variable names list misspecified. " + \
+                  "Must be 'None' or a list or tuple of existing variables"    
+        return header
+
+    def readSavFile(self, returnHeader, recodeSysmisTo, selectVars, rawMode, encoding):
         
""" This is the main function of this class. It is a generator, which
         returns one record of the spss data file at a time. """

         
         debug
= False
         
if retcodes[self.retcode] == "SPSS_OK":
-            if not isinstance(selectVars, (list, tuple)) and selectVars is not None:
-                raise Exception, "Variable names list misspecified." + \
-                      "Must be 'None' or a list or tuple of existing variables"                    
-            if selectVars is not None:
-                if set(selectVars).intersection(set(self.varNames)):
-                    self.varNames = selectVars
-                else:
-                    raise Exception, "Variable names list misspecified"
-
             
if returnHeader:
-                yield self.varNames
+               yield self.header
 
             
# avoiding dots inside the loops
             
# http://wiki.python.org/moin/PythonSpeed/PerformanceTips#Avoiding_dots...
-            readCaseRecord = self.spssio.spssReadCaseRecord
-            spssGetValueNumeric = self.spssio.spssGetValueNumeric
-            spssGetValueChar = self.spssio.spssGetValueChar
-            for i in range(self.nCases_):
-                readCaseRecord(self.fh)
-                record = []
-                for varName in self.varNames:
-                    # numerical values
-                    if self.varTypes[varName] == 0:
-                        rawValue = self.getValueNumeric(self.fh, spssGetValueNumeric,
-                                                        self.varHandles[varName])[1]
-                        if self.printTypeLabels[varName] == 'SPSS_FMT_F':
-                            # this is assumed to be the most common format, by far.
-                            # there will be no need to call the (expensive) formatValue function
-                            value = rawValue if rawValue > self.CUT_OFF else recodeSysmisTo
-                        else:
-                            value = self.formatValue(self.fh, self.spssio, varName, rawValue,
-                                                     self.printTypeLabels[varName],
-                                                     self.varWids[varName], recodeSysmisTo)
-                    # string values
-                    else:
-                        value = self.getValueChar(self.fh, spssGetValueChar,
-                                                  self.varHandles[varName])[1].rstrip()
-
-                    record.append(value)
-
-                if debug and i+1 % 100 == 0:        
-                    print "record", i+1, record
-
+            containsStringvars = max([varType for varName, varType in self.varTypes.items()
+                                      if varName in self.header]) > 0
+            self.caseBuffer, self.caseBufferPtr = self.getCaseBuffer()
+            structFmt = self.conversionFormatCtoPy(self.varNames, self.varTypes)
+            unpack = struct.unpack
+            wholeCaseIn = self.spssio.spssWholeCaseIn
+            print "Pct progress ...",
+            for case in range(self.nCases_):
+                retcode = wholeCaseIn(self.fh, self.caseBufferPtr)
+                if retcodes[retcode] != 'SPSS_OK':
+                    print "WARNING: Record %s is faulty" % case+1
+                    continue
+                record = unpack(structFmt, self.caseBuffer.raw)
+                if selectVars is not None:
+                    record = self.selector(record)
+                    record = [record] if not isinstance(record, tuple) else record
+                if containsStringvars:
+                    record = self.encodeStringValues(record, encoding)
+                if not rawMode:
+                    record = self.formatRecord(record, self.recodeSysmisTo)
+                if debug and (case+1) % 10 == 0:        
+                    print "record", case+1, record
+                pctProgress = (float(case) / self.nCases_) * 100
+                if pctProgress % 5 == 0:
+                    print "%2.1f%%... " % pctProgress,
                 
yield record
-        else:
+        else:
             
try:
-                print "Error", retcodes[self.retcode]
+                print "Error", retcodes[retcode]
             
except KeyError:
-                print "Unknown error code (%d)" % self.retcode
+                print "Unknown error code (%d)" % retcode
             
finally:
                 
raise Exception, "You fail!"
-
-
-
 
 
def calculateFrequency(sav):
     
""" This function returns a frequency count for each variable in
@@ -501,31 +612,32 @@
     import contextlib, csv
 
     ## ----- Get some basic file info
-    savFileName = r"
C:\Program Files\SPSS Evaluation\Employee data.sav"
-
+    savFileName = r"
C:\Program Files\IBM\SPSS\Statistics\19\Samples\English\Employee data.sav"
     numVars, nCases, varNames, varTypes, printTypesFile, printTypeLabels, varWids = \
-             SavReader(savFileName).readBasicSavFileInfo()[4:]
+             SavReader(savFileName).getSavFileInfo()
+
 
     ## ----- Typical use    
-    with SavReader(savFileName, recodeSysmisTo=999, selectVars=["
educ"]) as sav:
+    with SavReader(savFileName, selectVars=['id'], recodeSysmisTo=999) as sav:
         header = sav.next()
         for line in sav:
             pass # do stuff
 
     ## ----- Convert file to .csv
-    csvFileName = "
d:/temp/test.csv"
-    with contextlib.nested(SavReader(savFileName), open(csvFileName, "
wb")) as (sav, f):
+    with contextlib.nested(SavReader(savFileName, selectVars=None, verbose=True,
+                                     rawMode=False, interfaceEncoding="
UTF-8"),
+                           open(csvFileName, "
wb")) as (sav, f):
         writer = csv.writer(f)
         for line in sav:
             writer.writerow(line)
         print "
Done! Csv file written: %s" % f.name
 
     ## ----- Run frequency counts
-    def main(savFileName):
-        with SavReader(savFileName, selectVars=["
educ", "gender"]) as sav:
+    def main(savFileName=savFileName):
+        with SavReader(savFileName) as sav:
             freqs = calculateFrequency(sav)
         for var, values in freqs.iteritems():
             print "
\n\n", 10 * "*", var.upper(), 10 * "*"
             for val, freq in values.iteritems():
                 print val, "
--", freq
-    main(savFileName)
+    #main(savFileName)

History