Python reader for SPSS .por files « Python recipes

SPSS can output ASCII .por files that include variable definitions (labels, value labels) and data. These files are useful to be able to read, but strangely enough I couldn't find any documentation anywhere.

I've reverse engineered the format so that it works for the (fairly complex) files I need to open, so maybe it is of some use for other people interfacing SPSS with python.

This code is placed in the public domain as far as allowed by law; parts (if any) that cannot be released in the public domain are irrevocably licensed to all readers under the Creative Commons license without restrictions ... but if you improve it, it would be great if you could share it. The code uses a base N decoder plucked from the web somewhere (can't find it) that was also public domain.

-- wouter (wouter@vanatteveldt.com)

      # Field delimiter that PorReader.consumeOne searches for in its buffer.
      DEL = '/'
class PorReader(object):
    """Incremental reader that yields the delimiter-terminated fields of a stream.

    The stream is read in 1024-character chunks; each call to ``consumeOne``
    returns the text between the previous delimiter and the next one.

    Attributes:
        file: the open file-like object being read.
        delimiter: the separator searched for in the buffer.
        buffer: the chunk currently being scanned.
        pos: index in ``buffer`` of the last delimiter consumed (-1 if none).
    """

    def __init__(self, file, delimiter=None):
        """Create a reader.

        Args:
            file: an object with a ``read(n)`` method, or anything else, which
                is then passed to ``open()`` as a path.
            delimiter: field separator; defaults to the module-level ``DEL``.
                It should be a single character, since a longer separator
                that straddles two chunks would not be found.
        """
        # Duck-typing instead of a (str, unicode) type check: works on
        # Python 2 and 3 and also accepts str subclasses / path objects.
        if not hasattr(file, "read"):
            file = open(file)
        self.file = file
        self.delimiter = DEL if delimiter is None else delimiter
        self.pos = -1
        self.buffer = ""

    def consumeOne(self, skip=False):
        """Advance past the next delimiter.

        Args:
            skip: when true, the field text is not collected.

        Returns:
            The field text with all line breaks removed, or ``None`` when
            ``skip`` is true.  At end of input the remaining text (possibly
            the empty string) is returned.
        """
        pieces = []
        p = self.buffer.find(self.delimiter, self.pos + 1)
        while p == -1:
            # No delimiter left in this chunk: keep its tail and refill.
            if not skip:
                pieces.append(self.buffer[self.pos + 1:])
            self.buffer = self.file.read(1024)
            self.pos = -1
            if not self.buffer:
                break  # end of input; p stays -1
            p = self.buffer.find(self.delimiter)
        if skip:
            self.pos = p
            return None
        pieces.append(self.buffer[self.pos + 1:p])
        self.pos = p
        # Line breaks only wrap the stream and are not part of a field.  Strip
        # "\r" and "\n" individually so that files opened with newline
        # translation (or using bare "\n") are handled like "\r\n" files.
        return "".join(pieces).replace("\r", "").replace("\n", "")

    def consume(self, n=1):
        """Return the next ``n`` fields as a list."""
        return [self.consumeOne() for i in range(n)]

    def skip(self, n=1):
        """Discard the next ``n`` fields."""
        for i in range(n):
            self.consumeOne(skip=True)

# Prefix that SPSSFile.init requires the sixth '/'-delimited field of the file
# to start with; files whose header differs are rejected with
# "Cannot read .por".
HEAD = 'SPSS for Microsoft Windows Release 15.04'

# Numeric tags for value kinds; not referenced by any code visible in this file.
FLOAT, STR, INT = 0,1,2

class SPSSVariable(object):
    """Definition of a single variable (column).

    Attributes:
        name: variable name.
        label: optional descriptive label.
        numeric: true for numeric variables, false for string variables.
        decimals: number of decimals recorded for the variable.
        valuelabels: initialised to ``None``; filled in from outside.
        index: initialised to ``None``; filled in from outside.
    """

    def __init__(self, name, label=None, numeric=True, decimals=0):
        self.name, self.label = name, label
        self.numeric, self.decimals = numeric, decimals
        self.valuelabels = self.index = None

    def __str__(self):
        """Return ``name``, the quoted label (if any) and a type letter.

        The letter is 'S' for strings, 'F' for numerics with decimals and
        'I' for numerics without.
        """
        if not self.numeric:
            kind = 'S'
        elif self.decimals:
            kind = 'F'
        else:
            kind = 'I'
        quoted = ' "%s" ' % self.label if self.label else ''
        return "%s%s%s" % (self.name, quoted, kind)

def splitstring(slen=None, s=None, reader=None):
    """Cut a length-prefixed string off the front of ``s``.

    Args:
        slen: the string length, either as a number or as an encoded ``str``
            (decoded with ``readnum``).  If ``s`` is None, ``slen`` must be a
            ``(length, text)`` pair.  If ``slen`` is None, that pair is read
            from ``reader``.
        s: the text that starts with the string.
        reader: optional reader with ``consume``/``consumeOne``; used to fetch
            more fields when the string itself contains '/' and was therefore
            split by the reader.

    Returns:
        ``(string, rest)`` where ``rest`` is whatever followed the string.

    Raises:
        ValueError: if ``s`` is shorter than ``slen`` and no reader was given.
    """
    if slen is None:
        slen = reader.consume(2)
    if s is None:
        slen, s = slen
    if isinstance(slen, str):
        slen = readnum(slen)
    while slen > len(s):
        if not reader:
            raise ValueError(
                "String of length %s expected, but only %d characters are "
                "available and no reader was given to fetch more: %r"
                % (slen, len(s), s))
        # The reader split on a '/' that belongs to the string: restore it.
        s += "/" + reader.consumeOne()
    return s[:slen], s[slen:]

class SPSSFile(object):
    """Contents of an SPSS portable (.por) file, fully loaded on construction.

    Attributes:
        variables: list of variable objects in the order they were added.
        vardict: the same objects keyed by their ``name``.
        data: list of row tuples, one entry per variable; missing numeric
            values are stored as ``None``.
    """

    # (str, unicode) on Python 2, (str, str) on Python 3.
    _string_types = (type(""), type(u""))

    def __init__(self, file):
        """Parse ``file`` (a path or an open file object)."""
        self.variables = []
        self.vardict = {}
        self.data = []
        self.init(file)

    def addvar(self, var):
        """Register ``var`` and assign it the next column index."""
        var.index = len(self.variables)
        self.variables.append(var)
        self.vardict[var.name] = var

    def getvar(self, varname):
        """Return the variable called ``varname`` (KeyError if unknown)."""
        return self.vardict[varname]

    def get(self, var, row):
        """Return the value in ``row`` for ``var`` (a variable or its name)."""
        if isinstance(var, self._string_types):
            var = self.vardict[var]
        return row[var.index]

    def init(self, file):
        """Read the header and all records from ``file`` into this object.

        Raises:
            Exception: if the header does not start with ``HEAD``.
            ValueError: on a record tag this reader does not handle, or when
                the data section is malformed or truncated.
        """
        r = PorReader(file)
        r.skip(5)
        h = r.consumeOne()
        if not h.startswith(HEAD):
            raise Exception("Cannot read .por")
        # The number after the header is decoded (so a malformed value still
        # raises) but its value is not needed.
        readnum(h[len(HEAD):])
        r.skip(1)
        keep = r.consumeOne()
        while True:
            # The first character of the pending text is the record tag.
            action = keep[0]
            if action == '7':  # variable definition
                keep = self._read_variable(r, keep)
            elif action == 'D':  # value labels
                keep = self._read_value_labels(r, keep)
            elif action == 'F':  # data
                self._read_data(r, keep[1:])
                return
            elif action == 'Z':  # end of file
                print("Done!")
                return
            else:
                # Without this branch the loop would spin forever, because
                # nothing consumes the unrecognised record.
                raise ValueError(
                    "Unsupported record tag %r in .por file (near %r)"
                    % (action, keep[:40]))

    def _read_variable(self, r, keep):
        """Parse one '7' record, add the variable, return the leftover text."""
        data = r.consume(8)
        # Keep reading fields until the second-to-last one is the 'C' (label)
        # field.  NOTE(review): a variable without a label would make this
        # scan run into the following records -- confirm against real files.
        while data[-2][0] != 'C':
            data += r.consume()
        decimals = readnum(data[4])
        numeric = keep[1:] == '0'
        name, _ = splitstring(data[:2])
        labellen, label = data[-2:]
        label, keep = splitstring(labellen[1:], label, r)
        self.addvar(SPSSVariable(name, label, numeric, decimals))
        return keep

    def _read_value_labels(self, r, keep):
        """Parse one 'D' record, attach the labels, return the leftover text."""
        numvars = readnum(keep[1:])
        varnames = []
        keep = r.consumeOne()
        for i in range(numvars):
            name, keep = splitstring(keep, r.consumeOne(), reader=r)
            varnames.append(name)
        numlabels = readnum(keep)
        keep = r.consumeOne()
        labels = {}
        # All variables of one record share the label set; the type of the
        # first one decides how the values are decoded.
        numeric = self.getvar(varnames[0]).numeric
        for i in range(numlabels):
            if numeric:
                val = readnum(keep)
                name, keep = splitstring(reader=r)
            else:
                val, keep = splitstring(keep, r.consumeOne(), reader=r)
                name, keep = splitstring(keep, r.consumeOne(), reader=r)
            labels[val] = name
        for varname in varnames:
            self.getvar(varname).valuelabels = labels
        return keep

    def _read_data(self, r, keep):
        """Parse the 'F' record: append rows to ``self.data`` until 'Z'."""
        if not self.variables:
            # With no columns the row loop would never consume any input.
            raise ValueError("Data record found but no variables are defined")
        while True:
            row = []
            for var in self.variables:
                if not keep:
                    keep = r.consumeOne()
                    if not keep and not r.buffer:
                        # The reader's buffer is only empty at end of input;
                        # without this check a truncated file loops forever.
                        raise ValueError(
                            "Unexpected end of file in data record")
                if keep.startswith("Z"):
                    return
                if var.numeric:
                    if keep.startswith("*."):  # missing value
                        row.append(None)
                        keep = keep[2:]
                    else:
                        try:
                            row.append(readnum(keep))
                        except Exception:
                            print(row)
                            print("Exception on %s" % var)
                            raise  # bare raise keeps the original traceback
                        keep = ""
                else:
                    # Pass the reader so that values containing '/' are
                    # reassembled, as is done for value labels.
                    x, keep = splitstring(keep, r.consumeOne(), reader=r)
                    row.append(x)
            self.data.append(tuple(row))

def _codec(str_in, base_from=36, base_to=10):
    """
    Base36 Encoder/Decoder
    by Mike Crute (mcrute@gmail.com) on August 26, 2008
    This code has been placed in the public domain.
    """
    ASCII = { "0": 48, "9": 57, "A": 65, "Z": 90 }
    # There are 8 characters between 9 and A
    from_digits = [chr(x) for x in range(ASCII["0"], ASCII["9"] + 8 + base_from)
                            if (x >= ASCII["0"] and x <= ASCII["9"]) or
                               (x >= ASCII["A"] and x <= ASCII["Z"])][:base_from]
    to_digits = [chr(x) for x in range(ASCII["0"], ASCII["9"] + 8 + base_to)
                            if (x >= ASCII["0"] and x <= ASCII["9"]) or
                               (x >= ASCII["A"] and x <= ASCII["Z"])][:base_to]
    x = long(0)
    for digit in str(str_in).upper():
        x = x * len(from_digits) + from_digits.index(digit)
    result = ""
    # This is going to assemble our number in reverse order
    # so we'll have to fix it before we return it
    while x > 0:
        result += to_digits[x % len(to_digits)]
        x /= len(to_digits)
    return result[::-1]

def decode(s):
    """Decode a string of base-30 digits into an int.

    Leading zeros are ignored; an empty or all-zero string decodes to 0.

    Raises:
        ValueError: if ``s`` cannot be converted; the message includes the
            offending text.
    """
    s = s.lstrip("0")
    if not s:
        return 0
    try:
        return int(_codec(s, 30, 10))
    except ValueError as e:  # "as" form is valid on Python 2.6+ and 3
        raise ValueError("Cannot decode %r: %s" % (s, e))


def readnum(s):
    """Parse a base-30 number field.

    Accepted form: an optional leading "-", base-30 integer digits, an
    optional "." followed by fraction digits, and an optional exponent
    written as "+" or "-" followed by base-30 digits.  The value is
    ``mantissa * 30 ** exponent``.

    Returns:
        An int when the field has neither a fraction nor a negative
        exponent, otherwise a float.

    Raises:
        ValueError: if a part of the field cannot be decoded.
    """
    neg = s.startswith("-")
    if neg:
        s = s[1:]

    # Split off the exponent first so the mantissa may carry a fraction.
    exp = 0
    if "+" in s:
        s, exp_digits = s.split("+")
        exp = decode(exp_digits)
    elif "-" in s:
        s, exp_digits = s.split("-")
        exp = -decode(exp_digits)

    if "." in s:
        i, d = s.split(".")
    else:
        i, d = s, None
    result = decode(i)
    if d:
        for j, digit in enumerate(d):
            result += decode(digit) / 30.**(j+1)

    # Scale the mantissa; the previous code dropped it and returned only the
    # power of 30.
    if exp > 0:
        result *= 30 ** exp
    elif exp < 0:
        result /= 30. ** (-exp)
    return -result if neg else result



if __name__ == '__main__':
    # Command-line smoke test: parse the .por file named by the first
    # argument and print the number of variables and the number of rows.
    import sys
    if len(sys.argv) < 2:
        sys.exit("usage: %s FILE.por" % sys.argv[0])
    fn = sys.argv[1]
    f = SPSSFile(fn)
    # Single pre-formatted argument prints identically on Python 2 and 3.
    print("%d %d" % (len(f.variables), len(f.data)))

      

Issues: 1) Initially I thought that the format was '/' separated, but it turns out that separation depends on the variable you expect, e.g. strings are generally <string length>/<string> with the next field starting immediately after the string without an extra '/'. This necessitated using a 'keep' variable in between fields. It would be better to refactor with a character-based token reader rather than a '/'-splitter. 2) This reads the whole file into memory. For most uses, it would probably be better to read the variable definitions and have an iterator for the rows. 3) I ignore some things I don't use, such as decimal places, user-defined missing values, etc. 4) This code might not work on all files as I don't have the specs; I was glad I got it to work on my files :-)

Mail me if this is of use, I might have made an improved version in the meantime.

Tags: por, portable, reader, spp, value_labels

1 comment

Albert-Jan Roskam 13 years ago # | flag

Hi,

I just added a program to read spss .sav files. Check out http://code.activestate.com/recipes/577650-python-reader-for-spss-sav-files/

Albert-Jan Roskam

◄	Python recipes (4591)	►
◄	wouter's recipes (1)	►

Python reader for SPSS .por files (Python recipe) by wouter
ActiveState Code (http://code.activestate.com/recipes/576809/)

1 comment

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

Python reader for SPSS .por files (Python recipe) by wouter ActiveState Code (http://code.activestate.com/recipes/576809/)

1 comment

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

Python reader for SPSS .por files (Python recipe) by wouter
ActiveState Code (http://code.activestate.com/recipes/576809/)