#!/usr/bin/env python
# -*- coding: cp1252 -*-

""" A Python interface to the IBM SPSS Statistics Input Output Module
(Windows: spssio32.dll)"""

# spssio32.dll and documentation can be downloaded here:
# https://www.ibm.com/developerworks/mydeveloperworks/wikis/home/wiki/We70df3195ec8_4f95_9773_42e448fa9029/page/Downloads%20for%20IBM%C2%AE%20SPSS%C2%AE%20Statistics?lang=en

# TO DO: make this work under Linux. When I tried the .so file,
# I had a hard time finding all the necessary dependencies.
# The .so file versions that are needed are rather old.

# NOTE: This is rather slow in processing huge files.

# ANY FEEDBACK ON THIS CODE IS WELCOME: "@".join(["fomcl", "yahoo.com"])

from __future__ import with_statement  # only Python 2.5
import sys
import os
import ctypes
import datetime

try:
    import psyco
    psyco.full()
except ImportError:
    pass

__author__ = 'Albert-Jan Roskam'
__version__ = '1.0.0'

# Return codes of the I/O module functions, mapped to their symbolic names.
retcodes = {
    0: 'SPSS_OK',
    1: 'SPSS_EXC_LEN64',
    2: 'SPSS_EXC_VARLABEL',
    3: 'SPSS_FILE_RERROR',
    4: 'SPSS_EXC_VALLABEL',
    5: 'SPSS_FILE_END',
    6: 'SPSS_NO_VARSETS',
    7: 'SPSS_EMPTY_VARSETS',
    8: 'SPSS_NO_LABELS',
    9: 'SPSS_NO_LABEL',
    10: 'SPSS_NO_CASEWGT',
    11: 'SPSS_NO_DATEINFO',
    12: 'SPSS_NO_MULTRESP',
    13: 'SPSS_EMPTY_MULTRESP',
    14: 'SPSS_NO_DEW',
    15: 'SPSS_EMPTY_DEW',
    16: 'SPSS_SHORTSTR_EXP',
    17: 'SPSS_INVALID_VARTYPE',
    18: 'SPSS_INVALID_MISSFOR',
    19: 'SPSS_INVALID_COMPSW',
    20: 'SPSS_INVALID_PRFOR',
    21: 'SPSS_INVALID_WRFOR',
    22: 'SPSS_INVALID_DATE',
    23: 'SPSS_INVALID_TIME',
    24: 'SPSS_NO_VARIABLES',
    25: 'SPSS_MIXED_TYPES',
    27: 'SPSS_DUP_VALUE',
    28: 'SPSS_INVALID_CASEWGT',
    29: 'SPSS_INCOMPATIBLE_DICT',
    30: 'SPSS_DICT_COMMIT',
    31: 'SPSS_DICT_NOTCOMMIT',
    33: 'SPSS_NO_TYPE2',
    41: 'SPSS_NO_TYPE73',
    45: 'SPSS_INVALID_DATEINFO',
    46: 'SPSS_NO_TYPE999',
    47: 'SPSS_EXC_STRVALUE',
    48: 'SPSS_CANNOT_FREE',
    49: 'SPSS_BUFFER_SHORT',
    50: 'SPSS_INVALID_CASE',
    51: 'SPSS_INTERNAL_VLABS',
    52: 'SPSS_INCOMPAT_APPEND',
    53: 'SPSS_INTERNAL_D_A',
    54: 'SPSS_FILE_BADTEMP',
    55: 'SPSS_DEW_NOFIRST',
    56: 'SPSS_INVALID_MEASURELEVEL',
    57: 'SPSS_INVALID_7SUBTYPE',
    58: 'SPSS_INVALID_VARHANDLE',
    59: 'SPSS_INVALID_ENCODING',
    60: 'SPSS_FILES_OPEN',
    70: 'SPSS_INVALID_MRSETDEF',
    71: 'SPSS_INVALID_MRSETNAME',
    72: 'SPSS_DUP_MRSETNAME',
    73: 'SPSS_BAD_EXTENSION',
    74: 'SPSS_INVALID_EXTENDEDSTRING',
    75: 'SPSS_INVALID_ATTRNAME',
    76: 'SPSS_INVALID_ATTRDEF',
    77: 'SPSS_INVALID_MRSETINDEX',
    78: 'SPSS_INVALID_VARSETDEF',
    79: 'SPSS_INVALID_ROLE'}

# Print format codes, mapped to (symbolic name, description).
printTypes = {
    1: ('SPSS_FMT_A', 'Alphanumeric'),
    2: ('SPSS_FMT_AHEX', 'Alphanumeric hexadecimal'),
    3: ('SPSS_FMT_COMMA', 'F Format with commas'),
    4: ('SPSS_FMT_DOLLAR', 'Commas and floating dollar sign'),
    5: ('SPSS_FMT_F', 'Default Numeric Format'),
    6: ('SPSS_FMT_IB', 'Integer binary'),
    7: ('SPSS_FMT_PIBHEX', 'Positive integer binary - hex'),
    8: ('SPSS_FMT_P', 'Packed decimal'),
    9: ('SPSS_FMT_PIB', 'Positive integer binary unsigned'),
    10: ('SPSS_FMT_PK', 'Positive integer binary unsigned'),
    11: ('SPSS_FMT_RB', 'Floating point binary'),
    12: ('SPSS_FMT_RBHEX', 'Floating point binary hex'),
    15: ('SPSS_FMT_Z', 'Zoned decimal'),
    16: ('SPSS_FMT_N', 'N Format- unsigned with leading 0s'),
    17: ('SPSS_FMT_E', 'E Format- with explicit power of 10'),
    20: ('SPSS_FMT_DATE', 'Date format dd-mmm-yyyy'),
    21: ('SPSS_FMT_TIME', 'Time format hh:mm:ss.s'),
    22: ('SPSS_FMT_DATE_TIME', 'Date and Time'),
    23: ('SPSS_FMT_ADATE', 'Date format dd-mmm-yyyy'),
    24: ('SPSS_FMT_JDATE', 'Julian date - yyyyddd'),
    25: ('SPSS_FMT_DTIME', 'Date-time dd hh:mm:ss.s'),
    26: ('SPSS_FMT_WKDAY', 'Day of the week'),
    27: ('SPSS_FMT_MONTH', 'Month'),
    28: ('SPSS_FMT_MOYR', 'mmm yyyy'),
    29: ('SPSS_FMT_QYR', 'q Q yyyy'),
    30: ('SPSS_FMT_WKYR', 'ww WK yyyy'),
    31: ('SPSS_FMT_PCT', 'Percent - F followed by %'),
    32: ('SPSS_FMT_DOT', 'Like COMMA, switching dot for comma'),
    33: ('SPSS_FMT_CCA', 'User Programmable currency format'),
    34: ('SPSS_FMT_CCB', 'User Programmable currency format'),
    35: ('SPSS_FMT_CCC', 'User Programmable currency format'),
    36: ('SPSS_FMT_CCD', 'User Programmable currency format'),
    37: ('SPSS_FMT_CCE', 'User Programmable currency format'),
    38: ('SPSS_FMT_EDATE','Date in dd/mm/yyyy style'),
    39: ('SPSS_FMT_SDATE', 'Date in yyyy/mm/dd style')}


class SavReader(object):
    """ Read Spss system files (.sav)

    Parameters:
    -savFileName: the file name of the spss data file
    -returnHeader: Boolean that indicates whether the first record should
        be a list of variable names (default is True)
    -recodeSysmisTo: indicates to which value missing values should
        be recoded (default = ""),
    -selectVars: indicates which variables in the file should be selected.
        The variables should be specified as a list or a tuple of
        valid variable names. If None is specified, all variables
        in the file are used (default is None)
    -verbose: Boolean that indicates whether information about the spss data file
        (e.g., number of cases, variable names, file size) should be printed on
        the screen (default: True).

    Note that the object bound by the 'with' statement is the record
    generator returned by readSavFile, not the SavReader instance itself.

    Typical use:
    savFileName = "d:/someFile.sav"
    with SavReader(savFileName) as sav:
        header = sav.next()
        for line in sav:
            process(line)
    """

    # strftime patterns for the date-like print formats that formatValue
    # converts. (The original table listed SPSS_FMT_WKDAY twice; the entry
    # kept here, '%A', is the one that was effective.)
    _supportedDates = {
        'SPSS_FMT_DATE': '%Y-%m-%d',
        'SPSS_FMT_JDATE': '%Y-%m-%d',
        'SPSS_FMT_EDATE': '%Y-%m-%d',
        'SPSS_FMT_SDATE': '%Y-%m-%d',
        'SPSS_FMT_DATE_TIME': '%Y-%m-%d %H:%M:%S',
        'SPSS_FMT_ADATE': '%Y-%m-%d',
        'SPSS_FMT_WKDAY': '%A',
        'SPSS_FMT_MONTH': '%B',
        'SPSS_FMT_MOYR': '%B %Y',
        'SPSS_FMT_WKYR': '%W WK %Y'}

    def __init__(self, savFileName, returnHeader=True, recodeSysmisTo="",
                 selectVars=None, verbose=True):
        """ Constructor. Initializes all vars that can be recycled, loads
        the I/O library, opens the file and reads its basic dictionary
        information (see readBasicSavFileInfo). """

        self.savFileName = savFileName
        self.returnHeader = returnHeader
        self.recodeSysmisTo = recodeSysmisTo
        self.selectVars = selectVars
        self.verbose = verbose

        # The system-missing value is stored as the most negative double
        # (-DBL_MAX). Values at or below this cut-off are considered sysmis.
        # (A small positive cut-off would wrongly flag zero and all negative
        # values as missing.)
        self.CUT_OFF = -1.7976931348623157e+308
        self.gregorianEpoch = datetime.datetime(1582, 10, 14, 0, 0, 0)

        # ctypes output buffers, allocated once and reused for every call
        self.numVars = ctypes.c_int()
        self.numVarsPtr = ctypes.byref(self.numVars)
        self.nCases = ctypes.c_long()
        self.numofCasesPtr = ctypes.byref(self.nCases)

        self.printType = ctypes.c_int()
        self.printDec = ctypes.c_int()
        self.printWid = ctypes.c_int()
        self.printTypePtr = ctypes.byref(self.printType)
        self.printDecPtr = ctypes.byref(self.printDec)
        self.printWidPtr = ctypes.byref(self.printWid)

        self.attribNames = ctypes.c_char_p()
        self.attribText = ctypes.c_char_p()
        self.nAttributes = ctypes.c_int()
        self.attribNamesPtr = ctypes.byref(self.attribNames)
        self.attribTextPtr = ctypes.byref(self.attribText)
        self.nAttributesPtr = ctypes.byref(self.nAttributes)

        self.numValue = ctypes.c_double()
        self.numValuePtr = ctypes.byref(self.numValue)

        self.assumedCharWid = 200
        self.charValue = ctypes.create_string_buffer(self.assumedCharWid)
        self.charValuePtr = ctypes.byref(self.charValue)
        self.valueSize = ctypes.c_int(self.assumedCharWid)

        self.retcode, self.spssio, self.fh, self.varHandles, self.numVars_, \
            self.nCases_, self.varNames, self.varTypes, self.printTypesFile, \
            self.printTypeLabels, self.varWids = self.readBasicSavFileInfo()

    def __enter__(self):
        """ This function returns the record generator for the (already
        opened) spss data file."""
        return self.readSavFile(self.returnHeader, self.recodeSysmisTo,
                                self.selectVars)

    def __exit__(self, type, value, tb):
        """ This function closes the spss data file. Exceptions raised in
        the 'with' body are not suppressed."""
        self.spssio.spssCloseRead(self.fh)

    def readBasicSavFileInfo(self):
        """ This function reads and returns some basic information of the open
        spss data file. It returns the following variables:
        retcode: the return code (0 means OK); this is the first non-zero
            code returned while opening the file or reading print formats
        spssio: the spss i/o C module, opened with ctypes.windll.spssio32
        fh: the file handle
        varHandles: a dictionary with var names as keys and var handles as values
        numVars: the number of variables in the spss data file
        nCases: the number of cases (records) in the spss data file
        varNames: a list of the var names in the spss data file
        varTypes: a dictionary with var names as keys and var types as values
        printTypesFile: a dictionary with var names as keys and print types as values
        printTypeLabels: a dictionary with var names as keys and print type labels as values
        varWids: a dictionary with var names as keys and var widths as values
        """

        self.retcode, self.spssio, self.fh = self.loadSavFile(self.savFileName)
        # Start from the 'open' return code so that a failure to open the
        # file is not hidden, and so that a file without variables still
        # yields a defined return code.
        retcode = self.retcode

        numVars = self.getNumberofVariables(self.fh, self.spssio)[1]
        nCases = self.getNumberofCases(self.fh, self.spssio)[1]
        varNames, varTypes_ = self.getVarInfo(self.fh, self.spssio)

        varTypes, printTypesFile, varWids, varHandles, printDecs, \
            printWids = {}, {}, {}, {}, {}, {}
        for i, varName in enumerate(varNames):
            varTypes[varName] = varTypes_[i]
            fmtRetcode, printType, printDec, printWid = \
                self.getVarPrintFormat(self.fh, self.spssio, variable=varName)
            if retcode == 0:
                retcode = fmtRetcode  # remember the first failure only
            printTypesFile[varName] = printType
            varWids[varName] = printWid
            varHandles[varName] = self.getVarHandle(self.fh, self.spssio,
                                                    variable=varName)[1]
            printDecs[varName] = printDec
            printWids[varName] = printWid

        # Format codes missing from the printTypes table get a placeholder
        # label instead of aborting with a KeyError.
        unknownFormat = ('SPSS_FMT_UNKNOWN', 'Unknown format')
        printTypeLabels = dict(
            [(varName, printTypes.get(printType, unknownFormat)[0])
             for varName, printType in printTypesFile.items()])

        # e.g. 'SPSS_FMT_DATE_TIME' --> 'TIME'
        fmts = dict([(varName, printTypeLabels[varName].split("_")[-1])
                     for varName in varNames])
        if self.verbose:
            self.getFileReport(self.savFileName, varNames, varTypes, fmts,
                               printDecs, printWids, nCases)

        return retcode, self.spssio, self.fh, varHandles, numVars, nCases, \
            varNames, varTypes, printTypesFile, printTypeLabels, varWids

    def loadSavFile(self, savFileName):
        """ This function loads the spss I/O file (.dll or .so file) and opens
        the spss data file for reading.

        Returns a tuple (retcode, spssio, fh), where fh is the file handle
        that spssOpenRead stored in its output argument.
        Raises an Exception if the platform is not supported, if the library
        cannot be loaded, or if the data file does not exist."""
        if sys.platform.lower().startswith("win"):
            try:
                os.environ["PATH"] += ";" + os.path.abspath(".")
                ctypes.cdll.LoadLibrary("spssio32.dll")
                spssio = ctypes.windll.spssio32
            except WindowsError:
                e = sys.exc_info()[1]  # syntax-neutral way to get the error
                msg = "Cannot find spssio32.dll in '%s'.\n" % os.path.abspath(".") + \
                      "Py file and Dll should live in the same directory [%s]." % e
                raise Exception(msg)
        else:
            ## elif sys.platform.lower().startswith("linux"):
            ##     os.environ["PATH"] += ":" + os.path.abspath(".")
            ##     ctypes.cdll.LoadLibrary("libspssdio.so.1")
            ##     spssio = ctypes.CDLL("libspssdio.so")
            raise Exception("Platform '%s' is not supported (Windows only)" %
                            sys.platform)

        if not os.path.exists(savFileName):
            raise Exception("File '%s' does not exist!" % savFileName)

        # spssOpenRead writes the file handle into its second argument, so
        # keep a reference to the c_int and return the value stored there.
        handle = ctypes.c_int()
        retcode = spssio.spssOpenRead(ctypes.c_char_p(savFileName),
                                      ctypes.byref(handle))
        return retcode, spssio, handle.value

    def getNumberofVariables(self, fh, spssio):
        """ This function reports the number of variables present in a data
        file. Returns (retcode, number of variables)."""
        retcode = spssio.spssGetNumberofVariables(fh, self.numVarsPtr)
        return retcode, self.numVars.value

    def getVarNameAndType(self, fh, spssio, iVar):
        """ Get variable name and type of the variable with index iVar.
        The variable type code is an integer in the range 0-32767,
        0 indicating a numeric variable and a positive value indicating
        a string variable of that size. Returns (name, type code)."""
        varNameBuff = ctypes.create_string_buffer(65)
        varType = ctypes.c_int()
        spssio.spssGetVarInfo(fh, iVar, ctypes.byref(varNameBuff),
                              ctypes.byref(varType))
        return varNameBuff.value, varType.value

    def getVarInfo(self, fh, spssio):
        """ This function gets the names and types of all the variables
        present in a data file. Returns (list of names, list of type codes),
        both in file order."""
        spssio.spssGetNumberofVariables(fh, self.numVarsPtr)
        varNames, varTypes = [], []
        for iVar in range(self.numVars.value):
            varName, varType = self.getVarNameAndType(fh, spssio, iVar)
            varNames.append(varName)
            varTypes.append(varType)
        return varNames, varTypes

    def getNumberofCases(self, fh, spssio):
        """ This function reports the number of cases present in a data file.
        Returns (retcode, number of cases)."""
        retcode = spssio.spssGetNumberofCases(fh, self.numofCasesPtr)
        return retcode, self.nCases.value

    def getVarHandle(self, fh, spssio, variable):
        """This function returns a handle for a variable, which can then be
        used to read values of the variable. Returns (retcode, handle), where
        handle is a ctypes.c_double that is unique to this call."""
        self.varName = ctypes.c_char_p(variable)
        self.varHandle = ctypes.c_double()
        self.varHandlePtr = ctypes.byref(self.varHandle)
        retcode = spssio.spssGetVarHandle(fh, self.varName, self.varHandlePtr)
        return retcode, self.varHandle

    def getVarAttributes(self, fh, spssio, variable):
        """This function returns all the attributes for a single variable,
        as (retcode, attribute names, attribute text, number of attributes)."""
        # Use the requested variable rather than whatever name an earlier
        # call happened to leave behind in self.varName.
        self.varName = ctypes.c_char_p(variable)
        # NOTE(review): the output buffers are plain char pointers; confirm
        # against the I/O module documentation whether arrays of strings
        # are expected here.
        retcode = spssio.spssGetVarAttributes(fh, self.varName,
                                              self.attribNamesPtr,
                                              self.attribTextPtr,
                                              self.nAttributesPtr)
        return retcode, self.attribNames.value, self.attribText.value, \
            self.nAttributes.value

    def getValueNumeric(self, fh, c_func, varHandle):
        """ This function gets the value of a numeric variable for the current
        case, which is the case read by the most recent call to
        spssReadCaseRecord. c_func is the spssGetValueNumeric function.
        Returns (retcode, value)."""
        retcode = c_func(fh, varHandle, self.numValuePtr)
        return retcode, self.numValue.value

    def getValueChar(self, fh, c_func, varHandle):
        """This function gets the value of a string variable for the current
        case, which is the case read by the most recent call to
        spssReadCaseRecord. c_func is the spssGetValueChar function.
        Returns (retcode, value)."""
        # NOTE(review): the shared buffer holds self.assumedCharWid bytes;
        # presumably longer string values do not fit -- confirm.
        retcode = c_func(fh, varHandle, self.charValuePtr, self.valueSize)
        return retcode, self.charValue.value

    def getVarPrintFormat(self, fh, spssio, variable):
        """ This function reports the print format of a variable. Format
        type, number of decimal places, and field width are returned, as
        (retcode, type, decimals, width). """
        self.varName = ctypes.c_char_p(variable)
        retcode = spssio.spssGetVarPrintFormat(fh, self.varName,
                                               self.printTypePtr,
                                               self.printDecPtr,
                                               self.printWidPtr)
        return retcode, self.printType.value, self.printDec.value, \
            self.printWid.value

    ##def getSystemSysmisVal(spssio):
    ##    return spssio.spssSysmisVal()

    def formatValue(self, fh, spssio, variable, value, printTypeLabel,
                    varWid, recodeSysmisTo):
        """ This function formats date fields to ISO dates (yyyy-mm-dd), plus
        some other date/time formats. The SPSS N format is formatted to a
        character value with leading zeroes. System-missing values are
        returned as recodeSysmisTo; all other values are returned as is.
        (fh, spssio and variable are not used.)"""
        if value <= self.CUT_OFF:
            # sysmis: never push the recoded value through the formatters
            return recodeSysmisTo
        if printTypeLabel in self._supportedDates:
            fmt = self._supportedDates[printTypeLabel]
            return self.spss2strDate(value, fmt, recodeSysmisTo)
        if printTypeLabel == 'SPSS_FMT_N':
            # values arrive as floats; drop the '.0' of whole numbers so
            # that 12.0 with width 5 becomes '00012' rather than '012.0'
            try:
                if value == int(value):
                    value = int(value)
            except (ValueError, OverflowError):
                pass  # nan/inf: keep the float representation
            return str(value).zfill(varWid)
        return value

    def spss2strDate(self, spssDateValue, fmt, recodeSysmisTo):
        """ This function converts internal SPSS dates (number of seconds
        since midnight, Oct 14, 1582 (the beginning of the Gregorian calendar))
        to a human-readable format. Values that cannot be converted are
        returned as recodeSysmisTo. """
        try:
            theDate = self.gregorianEpoch + \
                      datetime.timedelta(seconds=spssDateValue)
            return theDate.strftime(fmt)
        except (TypeError, ValueError, OverflowError):
            return recodeSysmisTo

    def getFileReport(self, savFileName, varNames, varTypes,
                      fmts, printDecs, printWids, nCases):
        """ This function prints a report about basic file characteristics:
        file size, number of rows and columns, and the name, print format
        and type (string/numerical) of each variable. """
        nBytes = os.path.getsize(savFileName)
        kb = float(nBytes) / 2**10
        mb = float(nBytes) / 2**20
        if mb > 1:
            fileSize, unit = mb, "MB"
        else:
            fileSize, unit = kb, "kB"
        print("*" * 70)
        print("*File '%s' (%5.2f %s) has %s columns (variables) and %s rows (%s values)" %
              (savFileName, fileSize, unit, len(varNames), nCases,
               len(varNames) * nCases))
        print("*It contains the following variables:")
        for cnt, varName in enumerate(varNames):
            kind = "string" if varTypes[varName] > 0 else "numerical"
            print("%03d. %s (%s%d.%d - %s)" %
                  (cnt + 1, varName, fmts[varName], printWids[varName],
                   printDecs[varName], kind))
        print("*" * 70)

    def readSavFile(self, returnHeader=True, recodeSysmisTo="", selectVars=None):
        """ This is the main function of this class. It is a generator, which
        returns one record of the spss data file at a time. If returnHeader
        is true, the first item is the list of variable names.
        Raises an Exception if the file could not be opened/read, or if
        selectVars is not None or a non-empty list/tuple of variable names
        that all exist in the file. """

        debug = False
        if self.retcode != 0:
            error = retcodes.get(self.retcode,
                                 "Unknown error code (%d)" % self.retcode)
            raise Exception("Error reading file '%s': %s" %
                            (self.savFileName, error))

        if selectVars is not None:
            if not isinstance(selectVars, (list, tuple)):
                raise Exception("Variable names list misspecified. " +
                                "Must be 'None' or a list or tuple of existing variables")
            # every requested name must exist; one unknown name would
            # otherwise surface as a KeyError halfway through the file
            unknown = [v for v in selectVars if v not in self.varTypes]
            if unknown or not selectVars:
                raise Exception("Variable names list misspecified")
            self.varNames = selectVars
        if returnHeader:
            yield self.varNames

        # avoiding dots inside the loops
        # http://wiki.python.org/moin/PythonSpeed/PerformanceTips#Avoiding_dots...
        readCaseRecord = self.spssio.spssReadCaseRecord
        spssGetValueNumeric = self.spssio.spssGetValueNumeric
        spssGetValueChar = self.spssio.spssGetValueChar
        cutOff = self.CUT_OFF

        for i in range(self.nCases_):
            readCaseRecord(self.fh)
            record = []
            for varName in self.varNames:
                if self.varTypes[varName] == 0:
                    # numerical values
                    rawValue = self.getValueNumeric(self.fh, spssGetValueNumeric,
                                                    self.varHandles[varName])[1]
                    if self.printTypeLabels[varName] == 'SPSS_FMT_F':
                        # this is assumed to be the most common format, by far.
                        # there will be no need to call the (expensive)
                        # formatValue function
                        if rawValue <= cutOff:
                            value = recodeSysmisTo
                        else:
                            value = rawValue
                    else:
                        value = self.formatValue(self.fh, self.spssio, varName,
                                                 rawValue,
                                                 self.printTypeLabels[varName],
                                                 self.varWids[varName],
                                                 recodeSysmisTo)
                else:
                    # string values
                    value = self.getValueChar(self.fh, spssGetValueChar,
                                              self.varHandles[varName])[1].rstrip()
                record.append(value)
            if debug and (i + 1) % 100 == 0:
                print("record %s %s" % (i + 1, record))
            yield record


def calculateFrequency(sav):
    """ This function returns a frequency count for each variable in the spss
    data file. sav must yield the header (list of variable names) first,
    followed by the records. Empty strings are counted as "(missing)".
    Returns a dictionary {varName: {value: count}}. """
    freqs = {}
    varNames = None
    for lino, line in enumerate(sav):
        if lino == 0:
            varNames = line
            continue
        for varName, value in zip(varNames, line):
            if value == "":
                value = "(missing)"
            counts = freqs.setdefault(varName, {})
            counts[value] = counts.get(value, 0) + 1
    return freqs


if __name__ == "__main__":

    help(SavReader)

    import contextlib, csv

    ## ----- Get some basic file info
    savFileName = r"C:\Program Files\SPSS Evaluation\Employee data.sav"
    numVars, nCases, varNames, varTypes, printTypesFile, printTypeLabels, varWids = \
        SavReader(savFileName).readBasicSavFileInfo()[4:]

    ## ----- Typical use
    with SavReader(savFileName, recodeSysmisTo=999, selectVars=["educ"]) as sav:
        header = sav.next()
        for line in sav:
            pass  # do stuff

    ## ----- Convert file to .csv
    csvFileName = "d:/temp/test.csv"
    with contextlib.nested(SavReader(savFileName), open(csvFileName, "wb")) as (sav, f):
        writer = csv.writer(f)
        for line in sav:
            writer.writerow(line)
        print("Done! Csv file written: %s" % f.name)

    ## ----- Run frequency counts
    def main(savFileName):
        with SavReader(savFileName, selectVars=["educ", "gender"]) as sav:
            freqs = calculateFrequency(sav)
        for var, values in freqs.items():
            print("\n\n%s %s %s" % (10 * "*", var.upper(), 10 * "*"))
            for val, freq in values.items():
                print("%s -- %s" % (val, freq))

    main(savFileName)