SPSS can output ASCII .por files that include variable definitions (labels, value labels) and data. It is useful to be able to read these files, but strangely enough I couldn't find any documentation anywhere.
I've reverse engineered the format so that it works for the (fairly complex) files I need to open, so maybe it is of some use for other people interfacing SPSS with python.
This code is placed in the public domain as far as allowed by law; parts (if any) that cannot be released into the public domain are irrevocably licensed to all readers under the Creative Commons license without restrictions ... but if you improve it, it would be great if you could share it. The code uses a base N decoder plucked from the web somewhere (can't find it) that was also public domain.
-- wouter (wouter@vanatteveldt.com)
# Field delimiter used by PorReader to split the portable file into tokens.
DEL = '/'


class PorReader(object):
    """Split a file into DEL-separated tokens, reading it in 1024-char chunks.

    Line breaks inside a token are removed, so a token that wraps over
    several physical lines is returned as one string.
    """

    def __init__(self, file):
        """Create a reader.

        file -- a path (opened with ``open``) or an object with a
                ``read(size)`` method returning text.
        """
        # isinstance also covers str subclasses; type(u"") is ``unicode`` on
        # Python 2 and ``str`` on Python 3, so no Python-2-only name is needed.
        if isinstance(file, (str, type(u""))):
            file = open(file)
        self.file = file
        self.pos = -1      # index in ``buffer`` of the last delimiter consumed
        self.buffer = ""   # current chunk; empty again once EOF was reached

    def consumeOne(self, skip=False):
        """Return the text up to the next delimiter and advance past it.

        At end of file whatever text remains is returned; after that every
        call returns "".  With ``skip=True`` the token is discarded and ""
        is returned.
        """
        p = self.buffer.find(DEL, self.pos + 1)
        parts = []
        while p == -1:
            # No delimiter left in this chunk: keep its tail, fetch the next.
            if not skip:
                parts.append(self.buffer[self.pos + 1:])
            self.buffer = self.file.read(1024)
            self.pos = -1
            if not self.buffer:
                break  # end of file
            p = self.buffer.find(DEL)
        if p != -1:
            if not skip:
                parts.append(self.buffer[self.pos + 1:p])
            self.pos = p
        if skip:
            return ""
        # Drop line breaks in every flavour.  Removing only "\r\n" leaves
        # stray "\n" characters in the token whenever the file object
        # translates newlines (text mode on Windows, Python 3 text files).
        return "".join(parts).replace("\r", "").replace("\n", "")

    def consume(self, n=1):
        """Return the next ``n`` tokens as a list."""
        return [self.consumeOne() for i in range(n)]

    def skip(self, n=1):
        """Discard the next ``n`` tokens."""
        for i in range(n):
            self.consumeOne(skip=True)
# Text the header token must start with; SPSSFile.init rejects the file
# otherwise and parses whatever follows this prefix as a number.
HEAD = 'SPSS for Microsoft Windows Release 15.04'

# Type codes; not referenced by the code visible in this file.
FLOAT = 0
STR = 1
INT = 2
class SPSSVariable(object):
    """Description of one variable: name, label, type and value labels."""

    def __init__(self, name, label=None, numeric=True, decimals=0):
        """Store the definition; ``valuelabels`` and ``index`` start as None."""
        self.name, self.label = name, label
        self.numeric, self.decimals = numeric, decimals
        self.valuelabels = None
        self.index = None

    def __str__(self):
        """Return the name, the quoted label (if any) and a type letter.

        The letter is 'S' for non-numeric variables, 'F' for numeric ones
        with decimals and 'I' for numeric ones without.
        """
        if not self.numeric:
            type_letter = 'S'
        elif self.decimals:
            type_letter = 'F'
        else:
            type_letter = 'I'
        label_part = ''
        if self.label:
            label_part = ' "%s" ' % self.label
        return "%s%s%s" % (self.name, label_part, type_letter)
def splitstring(slen=None, s=None, reader=None):
    """Cut a length-prefixed string off the front of ``s``.

    slen   -- the length, either as a number or as text (converted with
              ``readnum``); or a ``(length, text)`` pair when ``s`` is
              omitted; or None to take both from ``reader.consume(2)``.
    s      -- text that starts with the string to extract.
    reader -- optional token source (object with ``consume``/``consumeOne``)
              used to fetch more text when ``s`` is shorter than ``slen``.

    Returns ``(string, rest)`` where ``string`` is the first ``slen``
    characters and ``rest`` is whatever followed it.

    Raises ValueError if ``s`` is too short and no reader was given.
    """
    if slen is None:
        slen = reader.consume(2)
    if s is None:
        slen, s = slen
    # isinstance also accepts unicode text (Python 2) and str subclasses,
    # which the previous exact ``type(...) == str`` comparison rejected.
    if isinstance(slen, (str, type(u""))):
        slen = readnum(slen)
    while slen > len(s):
        if not reader:
            raise ValueError(
                "String of length %r expected but only %d characters "
                "available and no reader to fetch more" % (slen, len(s)))
        # The string itself contained the delimiter, which the tokenizer
        # swallowed: put it back and append the following token.
        s += "/" + reader.consumeOne()
    return s[:slen], s[slen:]
class SPSSFile(object):
    """In-memory contents of a .por file.

    Attributes:
      variables -- list of variable objects in file order
      vardict   -- the same objects keyed by variable name
      data      -- list of row tuples, one value per variable
    """

    def __init__(self, file):
        """Parse ``file`` (a path or an open file object) completely."""
        self.variables = []
        self.vardict = {}
        self.data = []
        self.init(file)

    def addvar(self, var):
        """Register ``var`` and set its ``index`` to its column position."""
        var.index = len(self.variables)
        self.variables.append(var)
        self.vardict[var.name] = var

    def getvar(self, varname):
        """Return the variable named ``varname`` (KeyError if unknown)."""
        return self.vardict[varname]

    def get(self, var, row):
        """Return the value of ``var`` (a variable or its name) in ``row``."""
        # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
        if isinstance(var, (str, type(u""))):
            var = self.vardict[var]
        return row[var.index]

    def init(self, file):
        """Read variable definitions, value labels and data from ``file``.

        ``file`` is a path or an open file object.  A file opened here from
        a path is closed again when parsing ends, whether it succeeded or
        not; a file object passed in by the caller is left open.
        """
        opened = None
        if isinstance(file, (str, type(u""))):
            opened = file = open(file)
        try:
            self._parse(PorReader(file))
        finally:
            if opened is not None:
                opened.close()

    def _parse(self, r):
        """Check the header, then dispatch on record tags until the end."""
        r.skip(5)
        h = r.consumeOne()
        if not h.startswith(HEAD):
            raise Exception("Cannot read .por")
        # The number following the header text is parsed only so that a
        # malformed value is still reported; the result itself is unused.
        readnum(h[len(HEAD):])
        r.skip(1)
        keep = r.consumeOne()
        while True:
            action = keep[0]
            if action == '7':
                keep = self._read_variable(r, keep)
            elif action == 'D':
                keep = self._read_value_labels(r, keep)
            elif action == 'F':
                self._read_data(r, keep[1:])
                return
            elif action == 'Z':
                print("Done!")
                return
            else:
                # Without this branch an unknown tag left ``keep``
                # unchanged and the loop never terminated.
                raise ValueError(
                    "Unsupported record tag %r in .por file" % action)

    def _read_variable(self, r, keep):
        """Read one '7' (variable) record; return the leftover text."""
        data = r.consume(8)
        # Keep reading until the token holding the label length (the one
        # starting with 'C') is second to last, i.e. the label text is last.
        while data[-2][0] != 'C':
            data += r.consume()
        decimals = readnum(data[4])
        numeric = keep[1:] == '0'
        name, dummy = splitstring(data[:2])
        labellen, label = data[-2:]
        label, keep = splitstring(labellen[1:], label, r)
        self.addvar(SPSSVariable(name, label, numeric, decimals))
        return keep

    def _read_value_labels(self, r, keep):
        """Read one 'D' (value labels) record; return the leftover text."""
        numvars = readnum(keep[1:])
        varnames = []
        keep = r.consumeOne()
        for i in range(numvars):
            name, keep = splitstring(keep, r.consumeOne(), reader=r)
            varnames.append(name)
        numlabels = readnum(keep)
        keep = r.consumeOne()
        labels = {}
        # The type of the first listed variable decides how values are read.
        numeric = self.getvar(varnames[0]).numeric
        for i in range(numlabels):
            if numeric:
                val = readnum(keep)
                name, keep = splitstring(reader=r)
            else:
                val, keep = splitstring(keep, r.consumeOne(), reader=r)
                name, keep = splitstring(keep, r.consumeOne(), reader=r)
            labels[val] = name
        # All listed variables share the same dict object.
        for varname in varnames:
            self.getvar(varname).valuelabels = labels
        return keep

    def _read_data(self, r, keep):
        """Read the rows of an 'F' (data) record into ``self.data``.

        Stops at a value starting with 'Z'; a row that is incomplete at
        that point is dropped.  Raises ValueError when the input ends
        before such a value is seen.
        """
        if not self.variables:
            # Nothing to read per row; looping would never terminate.
            return
        while True:
            row = []
            for var in self.variables:
                if not keep:
                    keep = r.consumeOne()
                    if not keep and not r.buffer:
                        # The reader's buffer is empty only after EOF.
                        # Without this check a truncated file produced an
                        # endless stream of zero-filled rows.
                        raise ValueError(
                            "Unexpected end of file in .por data section")
                if keep.startswith("Z"):
                    return
                if var.numeric:
                    if keep.startswith("*."):
                        # "*." marks a missing value.
                        row.append(None)
                        keep = keep[2:]
                    else:
                        try:
                            row.append(readnum(keep))
                        except Exception:
                            print(row)
                            print("Exception on %s" % var)
                            raise  # bare raise keeps the original traceback
                        keep = ""
                else:
                    # Pass the reader so a string value that itself contains
                    # the delimiter can be completed from following tokens.
                    x, keep = splitstring(keep, r.consumeOne(), reader=r)
                    row.append(x)
            self.data.append(tuple(row))
def _codec(str_in, base_from=36, base_to=10):
"""
Base36 Encoder/Decoder
by Mike Crute (mcrute@gmail.com) on August 26, 2008
This code has been placed in the public domain.
"""
ASCII = { "0": 48, "9": 57, "A": 65, "Z": 90 }
# There are 8 characters between 9 and A
from_digits = [chr(x) for x in range(ASCII["0"], ASCII["9"] + 8 + base_from)
if (x >= ASCII["0"] and x <= ASCII["9"]) or
(x >= ASCII["A"] and x <= ASCII["Z"])][:base_from]
to_digits = [chr(x) for x in range(ASCII["0"], ASCII["9"] + 8 + base_to)
if (x >= ASCII["0"] and x <= ASCII["9"]) or
(x >= ASCII["A"] and x <= ASCII["Z"])][:base_to]
x = long(0)
for digit in str(str_in).upper():
x = x * len(from_digits) + from_digits.index(digit)
result = ""
# This is going to assemble our number in reverse order
# so we'll have to fix it before we return it
while x > 0:
result += to_digits[x % len(to_digits)]
x /= len(to_digits)
return result[::-1]
# The thirty digit symbols of a base-30 number: 0-9 followed by A-T.
_BASE30_DIGITS = "0123456789ABCDEFGHIJKLMNOPQRST"


def decode(s):
    """Return the non-negative integer written in base 30 in ``s``.

    Leading zeros are ignored, letters may be in either case, and an empty
    or all-zero string gives 0.  Raises ValueError for any other character.
    """
    s = s.lstrip("0")
    if not s:
        return 0
    try:
        # int() on its own would also accept signs, surrounding whitespace
        # and underscores, so check the characters explicitly first.
        for ch in s.upper():
            if ch not in _BASE30_DIGITS:
                raise ValueError("invalid base-30 digit %r" % ch)
        return int(s, 30)
    except ValueError as e:
        raise ValueError("Cannot decode %r: %s" % (s, e))


def _read_mantissa(s):
    """Parse base-30 ``digits`` or ``digits.digits`` (no sign, no exponent)."""
    if "." in s:
        i, d = s.split(".")
    else:
        i, d = s, None
    result = decode(i)
    if d:
        for j, digit in enumerate(d):
            result += decode(digit) / 30. ** (j + 1)
    return result


def readnum(s):
    """Parse a base-30 number as written in a .por file.

    Accepted form: optional leading "-", digits with an optional "."
    fraction, then optionally "+" or "-" followed by exponent digits.
    The value is ``mantissa * 30**exponent`` for "+" and
    ``mantissa / 30**exponent`` for "-".

    Returns an int when there is neither a fraction nor a negative
    exponent, otherwise a float.  Raises ValueError on malformed input.
    """
    neg = s.startswith("-")
    if neg:
        s = s[1:]
    if "+" in s:
        mantissa, exp = s.split("+")
        # The mantissa used to be parsed and then discarded, so "2+1"
        # evaluated to 30 rather than 60.
        result = _read_mantissa(mantissa) * 30 ** decode(exp)
    elif "-" in s:
        mantissa, exp = s.split("-")
        result = _read_mantissa(mantissa) / float(30 ** decode(exp))
    else:
        result = _read_mantissa(s)
    return result * (-1 if neg else 1)
if __name__ == '__main__':
    # Command-line use: parse the file named by the first argument and
    # print the number of variables and the number of data rows.
    import sys
    if len(sys.argv) < 2:
        sys.exit("usage: %s FILE.por" % sys.argv[0])
    fn = sys.argv[1]
    f = SPSSFile(fn)
    # Formatting into one string prints the same text on Python 2 and 3.
    print("%d %d" % (len(f.variables), len(f.data)))
|
Issues: 1) Initially I thought that the format was '/' separated, but it turns out that the separation depends on the kind of field you expect, e.g. strings are generally <string length>/<string> with the next field starting immediately after the string without an extra '/'. This necessitated using a 'keep' variable to carry leftover text from one field to the next. It would be better to refactor with a character-based token reader rather than a '/'-splitter. 2) This reads the whole file into memory. For most uses, it would probably be better to read the variable definitions and have an iterator for the rows. 3) I ignore some things I don't use, such as decimal places, user-defined missing values, etc. 4) This code might not work on all files as I don't have the specs; I was glad I got it to work on my files :-)
Mail me if this is of use, I might have made an improved version in the meantime.
Hi,
I just added a program to read spss .sav files. Check out http://code.activestate.com/recipes/577650-python-reader-for-spss-sav-files/
Albert-Jan Roskam