#!/usr/bin/env python
# -*- coding: cp1252 -*-
""" A Python interface to the IBM SPSS Statistics Input Output Module
(Windows: spssio32.dll)"""
# spssio32.dll and documentation can be downloaded here:
# https://www.ibm.com/developerworks/mydeveloperworks/wikis/home/wiki/We70df3195ec8_4f95_9773_42e448fa9029/page/Downloads%20for%20IBM%C2%AE%20SPSS%C2%AE%20Statistics?lang=en
# TO DO: make this work under Linux. When I tried the .so file,
# I had a hard time finding all the necessary dependencies.
# The .so file versions that are needed are rather old.
# NOTE: This is rather slow in processing huge files.
# ANY FEEDBACK ON THIS CODE IS WELCOME: "@".join(["fomcl", "yahoo.com"])
from __future__ import with_statement # only Python 2.5
import sys, os, ctypes, datetime
try:
import psyco
psyco.full()
except ImportError:
pass
__author__ = 'Albert-Jan Roskam'
__version__ = '1.0.0'

# Maps the integer return codes of the SPSS I/O module to their symbolic
# names.  Code 0 ('SPSS_OK') is the only value treated as success by the
# reader below.
# NOTE(review): several of the names under keys 1-15 (e.g. SPSS_FILE_END,
# SPSS_NO_LABELS) look like *warning* codes, which the I/O module's header
# presumably defines as negative numbers -- confirm the signs against the
# header that ships with the library.
retcodes = {
    0: 'SPSS_OK',
    1: 'SPSS_EXC_LEN64',
    2: 'SPSS_EXC_VARLABEL',
    3: 'SPSS_FILE_RERROR',
    4: 'SPSS_EXC_VALLABEL',
    5: 'SPSS_FILE_END',
    6: 'SPSS_NO_VARSETS',
    7: 'SPSS_EMPTY_VARSETS',
    8: 'SPSS_NO_LABELS',
    9: 'SPSS_NO_LABEL',
    10: 'SPSS_NO_CASEWGT',
    11: 'SPSS_NO_DATEINFO',
    12: 'SPSS_NO_MULTRESP',
    13: 'SPSS_EMPTY_MULTRESP',
    14: 'SPSS_NO_DEW',
    15: 'SPSS_EMPTY_DEW',
    16: 'SPSS_SHORTSTR_EXP',
    17: 'SPSS_INVALID_VARTYPE',
    18: 'SPSS_INVALID_MISSFOR',
    19: 'SPSS_INVALID_COMPSW',
    20: 'SPSS_INVALID_PRFOR',
    21: 'SPSS_INVALID_WRFOR',
    22: 'SPSS_INVALID_DATE',
    23: 'SPSS_INVALID_TIME',
    24: 'SPSS_NO_VARIABLES',
    25: 'SPSS_MIXED_TYPES',
    27: 'SPSS_DUP_VALUE',
    28: 'SPSS_INVALID_CASEWGT',
    29: 'SPSS_INCOMPATIBLE_DICT',
    30: 'SPSS_DICT_COMMIT',
    31: 'SPSS_DICT_NOTCOMMIT',
    33: 'SPSS_NO_TYPE2',
    41: 'SPSS_NO_TYPE73',
    45: 'SPSS_INVALID_DATEINFO',
    46: 'SPSS_NO_TYPE999',
    47: 'SPSS_EXC_STRVALUE',
    48: 'SPSS_CANNOT_FREE',
    49: 'SPSS_BUFFER_SHORT',
    50: 'SPSS_INVALID_CASE',
    51: 'SPSS_INTERNAL_VLABS',
    52: 'SPSS_INCOMPAT_APPEND',
    53: 'SPSS_INTERNAL_D_A',
    54: 'SPSS_FILE_BADTEMP',
    55: 'SPSS_DEW_NOFIRST',
    56: 'SPSS_INVALID_MEASURELEVEL',
    57: 'SPSS_INVALID_7SUBTYPE',
    58: 'SPSS_INVALID_VARHANDLE',
    59: 'SPSS_INVALID_ENCODING',
    60: 'SPSS_FILES_OPEN',
    70: 'SPSS_INVALID_MRSETDEF',
    71: 'SPSS_INVALID_MRSETNAME',
    72: 'SPSS_DUP_MRSETNAME',
    73: 'SPSS_BAD_EXTENSION',
    74: 'SPSS_INVALID_EXTENDEDSTRING',
    75: 'SPSS_INVALID_ATTRNAME',
    76: 'SPSS_INVALID_ATTRDEF',
    77: 'SPSS_INVALID_MRSETINDEX',
    78: 'SPSS_INVALID_VARSETDEF',
    79: 'SPSS_INVALID_ROLE',
}
# Maps the numeric print-format codes reported for a variable to a pair
# (symbolic format name, human-readable description).  The reader uses the
# symbolic name to decide how a numeric value is rendered.
printTypes = {
    1: ('SPSS_FMT_A', 'Alphanumeric'),
    2: ('SPSS_FMT_AHEX', 'Alphanumeric hexadecimal'),
    3: ('SPSS_FMT_COMMA', 'F Format with commas'),
    4: ('SPSS_FMT_DOLLAR', 'Commas and floating dollar sign'),
    5: ('SPSS_FMT_F', 'Default Numeric Format'),
    6: ('SPSS_FMT_IB', 'Integer binary'),
    7: ('SPSS_FMT_PIBHEX', 'Positive integer binary - hex'),
    8: ('SPSS_FMT_P', 'Packed decimal'),
    9: ('SPSS_FMT_PIB', 'Positive integer binary unsigned'),
    10: ('SPSS_FMT_PK', 'Positive integer binary unsigned'),
    11: ('SPSS_FMT_RB', 'Floating point binary'),
    12: ('SPSS_FMT_RBHEX', 'Floating point binary hex'),
    15: ('SPSS_FMT_Z', 'Zoned decimal'),
    16: ('SPSS_FMT_N', 'N Format- unsigned with leading 0s'),
    17: ('SPSS_FMT_E', 'E Format- with explicit power of 10'),
    20: ('SPSS_FMT_DATE', 'Date format dd-mmm-yyyy'),
    21: ('SPSS_FMT_TIME', 'Time format hh:mm:ss.s'),
    22: ('SPSS_FMT_DATE_TIME', 'Date and Time'),
    23: ('SPSS_FMT_ADATE', 'Date format dd-mmm-yyyy'),
    24: ('SPSS_FMT_JDATE', 'Julian date - yyyyddd'),
    25: ('SPSS_FMT_DTIME', 'Date-time dd hh:mm:ss.s'),
    26: ('SPSS_FMT_WKDAY', 'Day of the week'),
    27: ('SPSS_FMT_MONTH', 'Month'),
    28: ('SPSS_FMT_MOYR', 'mmm yyyy'),
    29: ('SPSS_FMT_QYR', 'q Q yyyy'),
    30: ('SPSS_FMT_WKYR', 'ww WK yyyy'),
    31: ('SPSS_FMT_PCT', 'Percent - F followed by %'),
    32: ('SPSS_FMT_DOT', 'Like COMMA, switching dot for comma'),
    33: ('SPSS_FMT_CCA', 'User Programmable currency format'),
    34: ('SPSS_FMT_CCB', 'User Programmable currency format'),
    35: ('SPSS_FMT_CCC', 'User Programmable currency format'),
    36: ('SPSS_FMT_CCD', 'User Programmable currency format'),
    37: ('SPSS_FMT_CCE', 'User Programmable currency format'),
    38: ('SPSS_FMT_EDATE', 'Date in dd/mm/yyyy style'),
    39: ('SPSS_FMT_SDATE', 'Date in yyyy/mm/dd style'),
}
class SavReader(object):
    """ Read Spss system files (.sav)

    Parameters:
    -savFileName: the file name of the spss data file
    -returnHeader: Boolean that indicates whether the first record should
        be a list of variable names (default is True)
    -recodeSysmisTo: indicates to which value system-missing values should
        be recoded (default = "")
    -selectVars: indicates which variables in the file should be selected.
        The variables should be specified as a list or a tuple of
        valid variable names. If None is specified, all variables
        in the file are used (default is None)
    -verbose: Boolean that indicates whether information about the spss
        data file (e.g., number of cases, variable names, file size) should
        be printed on the screen (default: True).

    The data file is opened by the constructor and closed by __exit__, so
    the class is meant to be used as a context manager.

    Typical use:
    savFileName = "d:/someFile.sav"
    with SavReader(savFileName) as sav:
        header = sav.next()
        for line in sav:
            process(line)
    """

    # Numeric values at or below this threshold are reported as
    # system-missing.  It is the most negative finite IEEE-754 double.
    # NOTE(review): this is the value spssSysmisVal() is expected to return
    # on IEEE hosts -- confirm against the I/O module in use.
    SYSMIS = -1.7976931348623157e308

    # strftime patterns used for the date-like print formats.
    # NOTE(review): WKDAY and MONTH values are presumably small ordinals
    # (day/month numbers) rather than seconds since the epoch; if so the
    # patterns below do not render them meaningfully -- verify.
    supportedDates = {'SPSS_FMT_DATE': '%Y-%m-%d',
                      'SPSS_FMT_JDATE': '%Y-%m-%d',
                      'SPSS_FMT_EDATE': '%Y-%m-%d',
                      'SPSS_FMT_SDATE': '%Y-%m-%d',
                      'SPSS_FMT_DATE_TIME': '%Y-%m-%d %H:%M:%S',
                      'SPSS_FMT_ADATE': '%Y-%m-%d',
                      'SPSS_FMT_WKDAY': '%A',
                      'SPSS_FMT_MONTH': '%B',
                      'SPSS_FMT_MOYR': '%B %Y',
                      'SPSS_FMT_WKYR': '%W WK %Y'}

    def __init__(self, savFileName, returnHeader=True, recodeSysmisTo="",
                 selectVars=None, verbose=True):
        """ Constructor. Stores the options, allocates the ctypes buffers
        that are recycled for every call into the I/O module, and opens
        the data file (see readBasicSavFileInfo). """
        self.savFileName = savFileName
        self.returnHeader = returnHeader
        self.recodeSysmisTo = recodeSysmisTo
        self.selectVars = selectVars
        self.verbose = verbose
        # values at or below the cut-off are considered sysmis
        self.CUT_OFF = self.SYSMIS
        self.gregorianEpoch = datetime.datetime(1582, 10, 14, 0, 0, 0)

        # Output slots handed (by reference) to the I/O module.
        self.numVars = ctypes.c_int()
        self.numVarsPtr = ctypes.byref(self.numVars)
        self.nCases = ctypes.c_long()
        self.numofCasesPtr = ctypes.byref(self.nCases)
        self.printType = ctypes.c_int()
        self.printDec = ctypes.c_int()
        self.printWid = ctypes.c_int()
        self.printTypePtr = ctypes.byref(self.printType)
        self.printDecPtr = ctypes.byref(self.printDec)
        self.printWidPtr = ctypes.byref(self.printWid)
        self.attribNames = ctypes.c_char_p()
        self.attribText = ctypes.c_char_p()
        self.nAttributes = ctypes.c_int()
        self.attribNamesPtr = ctypes.byref(self.attribNames)
        self.attribTextPtr = ctypes.byref(self.attribText)
        self.nAttributesPtr = ctypes.byref(self.nAttributes)
        self.numValue = ctypes.c_double()
        self.numValuePtr = ctypes.byref(self.numValue)
        self.assumedCharWid = 200
        self.charValue = ctypes.create_string_buffer(self.assumedCharWid)
        self.charValuePtr = ctypes.byref(self.charValue)
        self.valueSize = ctypes.c_int(self.assumedCharWid)

        (self.retcode, self.spssio, self.fh, self.varHandles, self.numVars_,
         self.nCases_, self.varNames, self.varTypes, self.printTypesFile,
         self.printTypeLabels, self.varWids) = self.readBasicSavFileInfo()

        # A variable's type code is its string length (0 for numeric); grow
        # the shared string buffer when the default would be too small to
        # hold the longest string plus its terminator.
        longest = max([0] + list(self.varTypes.values()))
        if longest >= self.assumedCharWid:
            self.assumedCharWid = longest + 1
            self.charValue = ctypes.create_string_buffer(self.assumedCharWid)
            self.charValuePtr = ctypes.byref(self.charValue)
            self.valueSize = ctypes.c_int(self.assumedCharWid)

    def __enter__(self):
        """ Returns the record generator (see readSavFile) for the data
        file that the constructor opened."""
        return self.readSavFile(self.returnHeader, self.recodeSysmisTo,
                                self.selectVars)

    def __exit__(self, type, value, tb):
        """ This function closes the spss data file. Exceptions raised
        inside the with-block are not suppressed."""
        self.spssio.spssCloseRead(self.fh)

    def readBasicSavFileInfo(self):
        """ This function opens the spss data file and returns some basic
        information about it, as a tuple of:
        retcode: the return code of opening the file, or the first non-zero
            code returned while reading print formats (0 means OK)
        spssio: the spss i/o C module, opened with ctypes.windll.spssio32
        fh: the file handle
        varHandles: a dictionary with var names as keys and var handles as values
        numVars: the number of variables in the spss data file
        nCases: the number of cases (records) in the spss data file
        varNames: a list of the var names in the spss data file
        varTypes: a dictionary with var names as keys and var types as values
        printTypesFile: a dictionary with var names as keys and print types as values
        printTypeLabels: a dictionary with var names as keys and print type labels as values
        varWids: a dictionary with var names as keys and print widths as values
        If the file could not be opened, the dictionary/list items are empty
        and the counts are 0; readSavFile reports the error.
        """
        self.retcode, self.spssio, self.fh = self.loadSavFile(self.savFileName)
        retcode = self.retcode
        varNames, numVars, nCases = [], 0, 0
        varTypes, printTypesFile, printTypeLabels = {}, {}, {}
        varWids, varHandles, printDecs, printWids = {}, {}, {}, {}

        if retcode == 0:
            numVars = self.getNumberofVariables(self.fh, self.spssio)[1]
            nCases = self.getNumberofCases(self.fh, self.spssio)[1]
            varNames, varTypeList = self.getVarInfo(self.fh, self.spssio)
            for varName, varType in zip(varNames, varTypeList):
                varTypes[varName] = varType
                rc, printType, printDec, printWid = \
                    self.getVarPrintFormat(self.fh, self.spssio,
                                           variable=varName)
                if rc != 0 and retcode == 0:
                    retcode = rc  # remember the first failure
                printTypesFile[varName] = printType
                printTypeLabels[varName] = printTypes[printType][0]
                varWids[varName] = printWid
                printDecs[varName] = printDec
                printWids[varName] = printWid
                varHandles[varName] = self.getVarHandle(self.fh, self.spssio,
                                                        variable=varName)[1]
            if self.verbose:
                # e.g. 'SPSS_FMT_DATE_TIME' --> 'TIME', 'SPSS_FMT_F' --> 'F'
                fmts = dict([(varName, printTypeLabels[varName].split("_")[-1])
                             for varName in varNames])
                self.getFileReport(self.savFileName, varNames, varTypes, fmts,
                                   printDecs, printWids, nCases)

        return (retcode, self.spssio, self.fh, varHandles, numVars, nCases,
                varNames, varTypes, printTypesFile, printTypeLabels, varWids)

    def loadSavFile(self, savFileName):
        """ This function loads the spss I/O library (spssio32.dll) and opens
        the spss data file for reading.

        Returns a tuple (retcode, spssio, fh), where fh is the file handle
        that spssOpenRead stored for the opened file.
        Raises Exception if the file does not exist, if the platform is not
        Windows, or if the dll cannot be loaded."""
        if not os.path.exists(savFileName):
            raise Exception("File '%s' does not exist!" % savFileName)
        if not sys.platform.lower().startswith("win"):
            raise Exception("Platform '%s' is not supported: "
                            "spssio32.dll is needed (Windows only)."
                            % sys.platform)
        try:
            # the dll is looked up in the current working directory
            os.environ["PATH"] += ";" + os.path.abspath(".")
            ctypes.cdll.LoadLibrary("spssio32.dll")
            spssio = ctypes.windll.spssio32
        except OSError:  # WindowsError is a subclass of OSError
            e = sys.exc_info()[1]
            msg = "Cannot find spssio32.dll in '%s'.\n" % os.path.abspath(".") + \
                  "Py file and Dll should live in the same directory [%s]." % e
            raise Exception(msg)
        # spssOpenRead writes the handle into the integer it is given; that
        # value (not a C runtime FILE pointer) identifies the file in every
        # later call.
        handle = ctypes.c_int()
        retcode = spssio.spssOpenRead(ctypes.c_char_p(savFileName),
                                      ctypes.byref(handle))
        return retcode, spssio, handle.value

    def getNumberofVariables(self, fh, spssio):
        """ This function reports the number of variables present in a data
        file. Returns (retcode, number of variables)."""
        retcode = spssio.spssGetNumberofVariables(fh, self.numVarsPtr)
        return retcode, self.numVars.value

    def getVarNameAndType(self, fh, spssio, iVar):
        """ Get name and type of the variable at (zero-based) position iVar.
        The variable type code is an integer in the range 0-32767, 0
        indicating a numeric variable and a positive value indicating a
        string variable of that size. Returns (name, type)."""
        varNameBuff = ctypes.create_string_buffer(65)
        varType = ctypes.c_int()
        spssio.spssGetVarInfo(fh, iVar, ctypes.byref(varNameBuff),
                              ctypes.byref(varType))
        return varNameBuff.value, varType.value

    def getVarInfo(self, fh, spssio):
        """ This function gets the names and types of all the variables
        present in a data file. Returns two parallel lists
        (varNames, varTypes)."""
        spssio.spssGetNumberofVariables(fh, self.numVarsPtr)
        varNames, varTypes = [], []
        for iVar in range(self.numVars.value):
            varName, varType = self.getVarNameAndType(fh, spssio, iVar)
            varNames.append(varName)
            varTypes.append(varType)
        return varNames, varTypes

    def getNumberofCases(self, fh, spssio):
        """ This function reports the number of cases present in a data
        file. Returns (retcode, number of cases)."""
        retcode = spssio.spssGetNumberofCases(fh, self.numofCasesPtr)
        return retcode, self.nCases.value

    def getVarHandle(self, fh, spssio, variable):
        """This function returns a handle for a variable, which can then be
        used to read values of the variable. Returns (retcode, handle); the
        handle is a ctypes.c_double instance."""
        self.varName = ctypes.c_char_p(variable)
        # a fresh object per call: the caller keeps one handle per variable
        self.varHandle = ctypes.c_double()
        self.varHandlePtr = ctypes.byref(self.varHandle)
        retcode = spssio.spssGetVarHandle(fh, self.varName, self.varHandlePtr)
        return retcode, self.varHandle

    def getVarAttributes(self, fh, spssio, variable):
        """This function returns all the attributes for a single variable,
        as (retcode, attribute names, attribute text, number of attributes).
        NOTE(review): names and text are received in plain char pointers;
        presumably the I/O module hands back arrays of strings -- verify
        before relying on more than the first attribute."""
        self.varName = ctypes.c_char_p(variable)
        retcode = spssio.spssGetVarAttributes(fh,
                                              self.varName,
                                              self.attribNamesPtr,
                                              self.attribTextPtr,
                                              self.nAttributesPtr)
        return retcode, self.attribNames.value, self.attribText.value, \
               self.nAttributes.value

    def getValueNumeric(self, fh, c_func, varHandle):
        """ This function gets the value of a numeric variable for the current
        case, which is the case read by the most recent call to
        spssReadCaseRecord. c_func is the I/O module function to call
        (spssGetValueNumeric). Returns (retcode, value)."""
        retcode = c_func(fh, varHandle, self.numValuePtr)
        return retcode, self.numValue.value

    def getValueChar(self, fh, c_func, varHandle):
        """This function gets the value of a string variable for the current
        case, which is the case read by the most recent call to
        spssReadCaseRecord. c_func is the I/O module function to call
        (spssGetValueChar). Returns (retcode, value)."""
        retcode = c_func(fh, varHandle, self.charValuePtr, self.valueSize)
        return retcode, self.charValue.value

    def getVarPrintFormat(self, fh, spssio, variable):
        """ This function reports the print format of a variable. Returns
        (retcode, format type, number of decimal places, field width). """
        self.varName = ctypes.c_char_p(variable)
        retcode = spssio.spssGetVarPrintFormat(fh,
                                               self.varName,
                                               self.printTypePtr,
                                               self.printDecPtr,
                                               self.printWidPtr)
        return retcode, self.printType.value, self.printDec.value, \
               self.printWid.value

    def formatValue(self, fh, spssio, variable, value, printTypeLabel,
                    varWid, recodeSysmisTo):
        """ This function formats date fields to ISO dates (yyyy-mm-dd), plus
        some other date/time formats. The SPSS N format is formatted to a
        character value with leading zeroes. System-missing values are
        returned as recodeSysmisTo, whatever the format; values of all other
        formats are returned unchanged."""
        if value <= self.CUT_OFF:
            # decide this first, so the recode value itself is never
            # mistaken for a date or zero-padded
            return recodeSysmisTo
        fmt = self.supportedDates.get(printTypeLabel)
        if fmt is not None:
            return self.spss2strDate(value, fmt, recodeSysmisTo)
        if printTypeLabel == 'SPSS_FMT_N':
            if value == int(value):
                value = int(value)  # '12', not '12.0', before padding
            return str(value).zfill(varWid)
        return value

    def spss2strDate(self, spssDateValue, fmt, recodeSysmisTo):
        """ This function converts internal SPSS dates (number of seconds
        since midnight, Oct 14, 1582 (the beginning of the Gregorian calendar))
        to a human-readable format. Values that cannot be converted or
        formatted yield recodeSysmisTo. """
        try:
            theDate = self.gregorianEpoch + \
                      datetime.timedelta(seconds=spssDateValue)
            return datetime.datetime.strftime(theDate, fmt)
        except (TypeError, ValueError, OverflowError):
            return recodeSysmisTo

    def getFileReport(self, savFileName, varNames, varTypes, fmts, printDecs,
                      printWids, nCases):
        """ This function prints a report about basic file characteristics:
        file size, number of variables and cases, and the print format of
        every variable. """
        nBytes = os.path.getsize(savFileName)
        kb = float(nBytes) / 2**10
        mb = float(nBytes) / 2**20
        if mb > 1:
            fileSize, unit = mb, "MB"
        else:
            fileSize, unit = kb, "kB"
        print("*" * 70)
        print("*File '%s' (%5.2f %s) has %s columns (variables) and %s rows (%s values)" %
              (savFileName, fileSize, unit, len(varNames), nCases,
               len(varNames) * nCases))
        print("*It contains the following variables:")
        for cnt, varName in enumerate(varNames):
            kind = "string" if varTypes[varName] > 0 else "numerical"
            print("%03d. %s (%s%d.%d - %s)"
                  % (cnt + 1, varName, fmts[varName], printWids[varName],
                     printDecs[varName], kind))
        print("*" * 70)

    def readSavFile(self, returnHeader=True, recodeSysmisTo="", selectVars=None):
        """ This is the main function of this class. It is a generator, which
        returns one record of the spss data file at a time (preceded by the
        list of variable names if returnHeader is true).

        Raises Exception (on the first iteration) if the file could not be
        opened properly, or if selectVars is not None or a non-empty
        list/tuple of variable names that all exist in the file."""
        debug = False
        if self.retcode != 0:
            reason = retcodes.get(self.retcode,
                                  "Unknown error code (%d)" % self.retcode)
            raise Exception("Error reading '%s': %s"
                            % (self.savFileName, reason))
        if selectVars is not None:
            if not isinstance(selectVars, (list, tuple)):
                raise Exception("Variable names list misspecified." +
                                "Must be 'None' or a list or tuple of existing variables")
            # varTypes is keyed by every variable in the file, also after an
            # earlier selection narrowed self.varNames
            unknown = [name for name in selectVars
                       if name not in self.varTypes]
            if unknown or not selectVars:
                raise Exception("Variable names list misspecified")
            self.varNames = selectVars
        if returnHeader:
            yield self.varNames

        # avoiding dots (and repeated dictionary lookups) inside the loops
        # http://wiki.python.org/moin/PythonSpeed/PerformanceTips#Avoiding_dots...
        readCaseRecord = self.spssio.spssReadCaseRecord
        spssGetValueNumeric = self.spssio.spssGetValueNumeric
        spssGetValueChar = self.spssio.spssGetValueChar
        columns = [(varName,
                    self.varHandles[varName],
                    self.varTypes[varName] == 0,
                    self.printTypeLabels[varName],
                    self.varWids[varName]) for varName in self.varNames]
        cutOff = self.CUT_OFF

        for i in range(self.nCases_):
            readCaseRecord(self.fh)
            record = []
            for varName, varHandle, isNumeric, label, varWid in columns:
                if isNumeric:
                    rawValue = self.getValueNumeric(self.fh,
                                                    spssGetValueNumeric,
                                                    varHandle)[1]
                    if label == 'SPSS_FMT_F':
                        # this is assumed to be the most common format, by far.
                        # there will be no need to call the (expensive)
                        # formatValue function
                        value = rawValue if rawValue > cutOff else recodeSysmisTo
                    else:
                        value = self.formatValue(self.fh, self.spssio, varName,
                                                 rawValue, label, varWid,
                                                 recodeSysmisTo)
                else:
                    # string values
                    value = self.getValueChar(self.fh, spssGetValueChar,
                                              varHandle)[1].rstrip()
                record.append(value)
            if debug and (i + 1) % 100 == 0:
                print("record %s %s" % (i + 1, record))
            yield record
def calculateFrequency(sav):
    """ This function returns a frequency count for each variable in
    the spss data file.

    sav: an iterable of records; the first record must be the list of
        variable names, every following record a list of values in the
        same order.
    Returns a dictionary of the form {varName: {value: count}}; empty
    string values are counted under the key "(missing)". The result is
    empty if there are no data records."""
    freqs = {}
    varNames = None
    for line in sav:
        if varNames is None:
            varNames = line  # header record
            continue
        # walk the columns by position: no list.index() search per cell,
        # and repeated names each count their own column
        for position, varName in enumerate(varNames):
            value = line[position]
            if value == "":
                value = "(missing)"
            counts = freqs.setdefault(varName, {})
            counts[value] = counts.get(value, 0) + 1
    return freqs
if __name__ == "__main__":
    # Demonstration of the reader; the paths below are examples and must be
    # adapted to the local machine.
    import csv

    help(SavReader)

    ## ----- Get some basic file info
    savFileName = r"C:\Program Files\SPSS Evaluation\Employee data.sav"
    # The constructor already gathers the basic file info, so read it from
    # the instance instead of opening the file a second time, and make sure
    # the file is closed again.
    reader = SavReader(savFileName)
    try:
        numVars, nCases, varNames, varTypes, printTypesFile, printTypeLabels, varWids = \
            (reader.numVars_, reader.nCases_, reader.varNames, reader.varTypes,
             reader.printTypesFile, reader.printTypeLabels, reader.varWids)
    finally:
        reader.__exit__(None, None, None)

    ## ----- Typical use
    with SavReader(savFileName, recodeSysmisTo=999, selectVars=["educ"]) as sav:
        header = sav.next()
        for line in sav:
            pass # do stuff

    ## ----- Convert file to .csv
    csvFileName = "d:/temp/test.csv"
    # nested with-statements: the data file is closed even if the csv file
    # cannot be opened
    with SavReader(savFileName) as sav:
        with open(csvFileName, "wb") as f:
            writer = csv.writer(f)
            for line in sav:
                writer.writerow(line)
            print("Done! Csv file written: %s" % f.name)

    ## ----- Run frequency counts
    def main(savFileName):
        """ Print the frequency counts of two variables of the data file. """
        with SavReader(savFileName, selectVars=["educ", "gender"]) as sav:
            freqs = calculateFrequency(sav)
        for var, values in freqs.items():
            print("\n\n%s %s %s" % (10 * "*", var.upper(), 10 * "*"))
            for val, freq in values.items():
                print("%s -- %s" % (val, freq))

    main(savFileName)
# Diff to Previous Revision
# --- revision 1 2011-04-12 18:09:54
# +++ revision 2 2011-04-14 10:09:55
# @@ -240,7 +240,7 @@
# fmts = dict([(varName, printTypeLabels[varName].split("_")[-1])
# for varName in varNames])
# if self.verbose:
# - self.getFileReport(savFileName, varNames, varTypes, fmts,
# + self.getFileReport(self.savFileName, varNames, varTypes, fmts,
# printDecs, printWids, nCases)
# return retcode, self.spssio, self.fh, varHandles, numVars, nCases, varNames, \