Welcome, guest | Sign In | My Account | Store | Cart

SPSS can output ASCII .por files that include variable definitions (labels, value labels) and data. It is useful to be able to read these files, but strangely enough I couldn't find any documentation anywhere.

I've reverse engineered the format so that it works for the (fairly complex) files I need to open, so maybe it is of some use for other people interfacing SPSS with python.

This code is placed in the public domain as far as allowed by law; any parts that cannot be released into the public domain are irrevocably licensed to all readers under the Creative Commons license without restrictions ... but if you improve it, it would be great if you could share it. The code uses a base-N decoder plucked from the web somewhere (can't find it) that was also public domain.

-- wouter (wouter@vanatteveldt.com)

Python, 209 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
DEL = '/'  # field delimiter looked for by PorReader


class PorReader(object):
    """Read '/'-delimited fields from a file, fetching it in 1024-char chunks.

    ``file`` is the underlying file object, ``buffer`` the chunk currently
    being scanned (empty before the first read and again once the file is
    exhausted) and ``pos`` the index in ``buffer`` of the delimiter that
    ended the previous field (-1 when a fresh chunk has just been loaded).
    """

    def __init__(self, file):
        """Wrap *file*, which is an open file-like object or a path to open."""
        # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so the
        # check accepts every text type (and subclasses) on both versions.
        if isinstance(file, (str, type(u""))):
            file = open(file)
        self.file = file
        self.pos = -1
        self.buffer = ""

    def close(self):
        """Close the underlying file object."""
        self.file.close()

    def consumeOne(self, skip=False):
        """Advance past the next delimiter and return the field before it.

        Line breaks inside the field are removed.  When the file runs out
        before a delimiter is found, whatever was left is returned (an empty
        string once nothing is left at all).

        With ``skip=True`` the field is discarded and None is returned.
        """
        pieces = []
        end = self.buffer.find(DEL, self.pos + 1)
        while end == -1:
            # No delimiter in the rest of this chunk: keep the tail and refill.
            if not skip:
                pieces.append(self.buffer[self.pos + 1:])
            self.buffer = self.file.read(1024)
            self.pos = -1
            if not self.buffer:
                # End of file; end stays -1, so the slice below is empty.
                break
            end = self.buffer.find(DEL)
        if not skip:
            pieces.append(self.buffer[self.pos + 1:end])
        self.pos = end
        if skip:
            return None
        # Strip CR and LF individually: a file read with newline translation
        # delivers bare "\n", which a plain "\r\n" replacement would leave
        # embedded in the field.
        return "".join(pieces).replace("\r", "").replace("\n", "")

    def consume(self, n=1):
        """Return the next *n* fields as a list."""
        return [self.consumeOne() for _ in range(n)]

    def skip(self, n=1):
        """Discard the next *n* fields."""
        for _ in range(n):
            self.consumeOne(skip=True)

# Prefix that the sixth '/'-delimited field of a file must start with:
# SPSSFile.init raises "Cannot read .por" otherwise, and parses whatever
# follows this prefix in that field as a number.
HEAD = 'SPSS for Microsoft Windows Release 15.04'

# Type codes; not referenced by any other code shown in this file.
FLOAT, STR, INT = 0,1,2

class SPSSVariable(object):
    """Metadata for a single variable: name, label, type and decimals.

    ``valuelabels`` and ``index`` start out as None and are meant to be
    assigned after construction.
    """

    def __init__(self, name, label=None, numeric=True, decimals=0):
        self.name, self.label = name, label
        self.numeric, self.decimals = numeric, decimals
        # Not known at construction time; filled in from outside.
        self.index = None
        self.valuelabels = None

    def __str__(self):
        """Return ``name [ "label" ]`` followed by a one-letter type code.

        The code is 'S' for non-numeric variables, 'F' for numeric variables
        with a non-zero number of decimals and 'I' for other numeric ones.
        """
        if not self.numeric:
            kind = 'S'
        elif self.decimals:
            kind = 'F'
        else:
            kind = 'I'
        quoted = ' "%s" ' % self.label if self.label else ''
        return "%s%s%s" % (self.name, quoted, kind)

def splitstring(slen=None, s=None, reader=None):
    """Cut a length-prefixed string off the front of *s*.

    Accepted call shapes:

    * ``splitstring(slen, s)`` -- *slen* is the length, either as a number or
      as encoded text that ``readnum`` understands.
    * ``splitstring((slen, s))`` -- both packed in one two-item sequence.
    * ``splitstring(reader=r)`` -- both are taken from the next two fields
      of *reader*.

    When *s* is shorter than the requested length, the string itself must
    have contained the field delimiter; more fields are then pulled from
    *reader* and re-joined with "/" until enough text is available.

    Returns ``(string, rest)`` where *rest* is whatever followed the string
    in the last field that was used.

    Raises ValueError (a subclass of the Exception raised previously) when
    *s* is too short and no reader was given to fetch more text from.
    """
    if slen is None:
        slen = reader.consume(2)
    if s is None:
        slen, s = slen
    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3; checking
    # both makes sure an encoded length is always converted, whichever text
    # type the file object produced.
    if isinstance(slen, (str, type(u""))):
        slen = readnum(slen)
    while slen > len(s):
        if not reader:
            raise ValueError(
                "String of length %s expected but only %d characters "
                "available and no reader to continue from: %r"
                % (slen, len(s), s))
        s += "/" + reader.consumeOne()
    return s[:slen], s[slen:]

class SPSSFile(object):
    """In-memory contents of an SPSS portable (.por) file.

    The file is parsed completely by the constructor.  Afterwards:

    * ``variables`` -- the variable objects, in the order they were read
    * ``vardict``   -- the same objects keyed by variable name
    * ``data``      -- list of row tuples with one value per variable;
      numeric cells hold numbers, or None where the file has the ``*.``
      marker, and the remaining cells hold strings

    Only the record tags handled in ``init`` ('7', 'D', 'F' and 'Z') are
    understood; anything else raises instead of being skipped.
    """

    def __init__(self, file):
        """Parse *file*, a path or an open file object (see PorReader)."""
        self.variables = []
        self.vardict = {}
        self.data = []
        self.init(file)

    def addvar(self, var):
        """Register *var*, assigning it the next column index."""
        var.index = len(self.variables)
        self.variables.append(var)
        self.vardict[var.name] = var

    def getvar(self, varname):
        """Return the variable called *varname* (KeyError if unknown)."""
        return self.vardict[varname]

    def get(self, var, row):
        """Return the cell of *row* belonging to *var* (a variable or its name)."""
        # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
        if isinstance(var, (str, type(u""))):
            var = self.vardict[var]
        return row[var.index]

    def init(self, file):
        """Read header, variables, value labels and data from *file*.

        Raises Exception when the header does not match HEAD, when an
        unsupported record tag is met, or when the file ends prematurely.
        """
        r = PorReader(file)
        try:
            self._parse(r)
        finally:
            # Close the file only if PorReader opened it itself (i.e. a path
            # was passed in); a caller-supplied file object stays open.
            if r.file is not file:
                r.file.close()

    def _parse(self, r):
        """Walk the records of the file behind reader *r*."""
        r.skip(5)
        h = r.consumeOne()
        if not h.startswith(HEAD):
            raise Exception("Cannot read .por")
        # The number after the header text is not needed (presumably a
        # variable count -- unconfirmed); it is still parsed so that a
        # malformed header keeps failing here.
        readnum(h[len(HEAD):])
        r.skip(1)
        keep = r.consumeOne()
        while True:
            if not keep:
                raise Exception(
                    "Missing record tag (truncated or unsupported .por file)")
            action = keep[0]
            if action == '7':
                keep = self._read_variable(r, keep)
            elif action == 'D':  # value labels
                keep = self._read_value_labels(r, keep)
            elif action == 'F':  # data; runs until the 'Z' end marker
                self._read_data(r, keep[1:])
                return
            elif action == 'Z':  # end marker without a data section
                print("Done!")
                return
            else:
                # Nothing below would ever advance past an unknown tag, so
                # carrying on would loop forever.
                raise Exception(
                    "Unsupported record type %r in .por file" % action)

    def _read_variable(self, r, keep):
        """Parse a '7' record, register the variable, return leftover text."""
        data = r.consume(8)
        # Keep swallowing fields until the last-but-one starts with 'C'; that
        # field and the one after it provide the label length and label text.
        while data[-2][:1] != 'C':
            if not r.buffer:  # PorReader's buffer is empty once the file is exhausted
                raise Exception(
                    "Unexpected end of .por file in variable record")
            data += r.consume()
        decimals = readnum(data[4])
        numeric = keep[1:] == '0'
        name, dummy = splitstring(data[:2])
        labellen, label = data[-2:]
        label, keep = splitstring(labellen[1:], label, r)
        self.addvar(SPSSVariable(name, label, numeric, decimals))
        return keep

    def _read_value_labels(self, r, keep):
        """Parse a 'D' record, attach the labels, return leftover text."""
        numvars = readnum(keep[1:])
        varnames = []
        keep = r.consumeOne()
        for i in range(numvars):
            name, keep = splitstring(keep, r.consumeOne(), reader=r)
            varnames.append(name)
        numlabels = readnum(keep)
        keep = r.consumeOne()
        labels = {}
        # The first listed variable decides how every value is decoded.
        numeric = self.getvar(varnames[0]).numeric
        for i in range(numlabels):
            if numeric:
                val = readnum(keep)
                name, keep = splitstring(reader=r)
            else:
                val, keep = splitstring(keep, r.consumeOne(), reader=r)
                name, keep = splitstring(keep, r.consumeOne(), reader=r)
            labels[val] = name
        # All listed variables share one and the same dict object.
        for varname in varnames:
            self.getvar(varname).valuelabels = labels
        return keep

    def _read_data(self, r, keep):
        """Append rows to ``self.data`` until a field starting with 'Z'."""
        if not self.variables:
            # Without columns no field would ever be consumed and the loop
            # below could not terminate.
            return
        while True:
            row = []
            for var in self.variables:
                if not keep:
                    keep = r.consumeOne()
                    if not keep and not r.buffer:
                        # File exhausted without the 'Z' marker; going on
                        # would produce rows of zeros forever.
                        raise Exception(
                            "Unexpected end of .por file in data section")
                if keep.startswith("Z"):
                    # A partially filled row is dropped, as before.
                    return
                if var.numeric:
                    if keep.startswith("*."):
                        row.append(None)
                        keep = keep[2:]
                    else:
                        try:
                            row.append(readnum(keep))
                        except Exception:
                            print(row)
                            print("Exception on %s" % var)
                            raise  # bare raise keeps the original traceback
                        keep = ""
                else:
                    # Pass the reader along so that string values containing
                    # the '/' delimiter can be completed from later fields.
                    x, keep = splitstring(keep, r.consumeOne(), reader=r)
                    row.append(x)
            self.data.append(tuple(row))

def _codec(str_in, base_from=36, base_to=10):
    """
    Base36 Encoder/Decoder
    by Mike Crute (mcrute@gmail.com) on August 26, 2008
    This code has been placed in the public domain.
    """
    ASCII = { "0": 48, "9": 57, "A": 65, "Z": 90 }
    # There are 8 characters between 9 and A
    from_digits = [chr(x) for x in range(ASCII["0"], ASCII["9"] + 8 + base_from)
                            if (x >= ASCII["0"] and x <= ASCII["9"]) or
                               (x >= ASCII["A"] and x <= ASCII["Z"])][:base_from]
    to_digits = [chr(x) for x in range(ASCII["0"], ASCII["9"] + 8 + base_to)
                            if (x >= ASCII["0"] and x <= ASCII["9"]) or
                               (x >= ASCII["A"] and x <= ASCII["Z"])][:base_to]
    x = long(0)
    for digit in str(str_in).upper():
        x = x * len(from_digits) + from_digits.index(digit)
    result = ""
    # This is going to assemble our number in reverse order
    # so we'll have to fix it before we return it
    while x > 0:
        result += to_digits[x % len(to_digits)]
        x /= len(to_digits)
    return result[::-1]

def decode(s):
    """Return the non-negative integer written in base 30 in *s*.

    Digits are 0-9 followed by A-T (case-insensitive).  Leading zeros are
    ignored and an empty or all-zero string yields 0.

    Raises ValueError, with the offending text in the message, when *s*
    contains anything that is not a base-30 digit.
    """
    s = s.lstrip("0")
    if not s:
        return 0
    try:
        # int() alone would also tolerate signs, surrounding whitespace and
        # underscores; reject those up front to stay strict about digits.
        if not s.isalnum():
            raise ValueError("invalid base-30 digits")
        return int(s, 30)
    except ValueError as e:
        raise ValueError("Cannot decode %r: %s" % (s, e))


def _read_mantissa(text):
    """Parse ``digits[.digits]`` in base 30 (int without a fraction, else float)."""
    if "." in text:
        whole, fraction = text.split(".")
    else:
        whole, fraction = text, None
    value = decode(whole)
    if fraction:
        for j, digit in enumerate(fraction):
            value += decode(digit) / 30. ** (j + 1)
    return value


def readnum(s):
    """Parse a base-30 number as written in a .por file.

    Accepted form: an optional leading "-", base-30 digits with an optional
    "." fraction, and optionally an exponent introduced by "+" or "-".  The
    value is ``mantissa * 30 ** exponent`` (or divided by that power for a
    "-" exponent).

    Returns an int where the arithmetic allows it and a float otherwise.
    Raises ValueError for text that does not fit this form.
    """
    neg = s.startswith("-")
    if neg:
        s = s[1:]
    if "+" in s:
        mantissa, exp = s.split("+")
        # The mantissa must scale the power of 30; returning the bare power
        # would make "2+2" come out as 900 instead of 1800.
        result = _read_mantissa(mantissa) * 30 ** decode(exp)
    elif "-" in s:
        mantissa, exp = s.split("-")
        result = _read_mantissa(mantissa) / float(30 ** decode(exp))
    else:
        result = _read_mantissa(s)
    return result * (-1 if neg else 1)



if __name__ == '__main__':
    # Command-line check: parse the given .por file and print the number of
    # variables and the number of data rows, separated by a space.
    import sys
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: %s FILE.por" % sys.argv[0])
    fn = sys.argv[1]
    f = SPSSFile(fn)
    # One pre-formatted string prints identically under the Python 2 print
    # statement and the Python 3 print function.
    print("%d %d" % (len(f.variables), len(f.data)))

Issues: 1) Initially I thought that the format was '/'-separated, but it turns out that the separation depends on the variable you expect, e.g. strings are generally <string length>/<string> with the next field starting immediately after the string without an extra '/'. This necessitated using a 'keep' variable in between fields. It would be better to refactor with a character-based token reader rather than a '/'-splitter. 2) This reads the whole file into memory. For most uses, it would probably be better to read the variable definitions and have an iterator for the rows. 3) I ignore some things I don't use, such as decimal places, user-defined missing values, etc. 4) This code might not work on all files as I don't have the specs; I was glad I got it to work on my files :-)

Mail me if this is of use, I might have made an improved version in the meantime.

1 comment

Albert-Jan Roskam 12 years, 11 months ago  # | flag

Hi,

I just added a program to read spss .sav files. Check out http://code.activestate.com/recipes/577650-python-reader-for-spss-sav-files/

Albert-Jan Roskam