"""Minimal reader for SPSS portable (.por) files.

The stream is treated as a sequence of fields terminated by '/'.  Numbers
are written in base 30 (digits 0-9 and A-T) with an optional fraction and
an optional exponent.  Strings are written as a length field followed by
the raw characters, so a string may itself contain '/' characters.

The code runs unchanged on Python 2 and Python 3.
"""

# Field terminator used throughout the portable format.
DEL = '/'

# Types accepted wherever "a string" is expected (file names, variable
# names, textual length fields).
try:
    _STRING_TYPES = (str, unicode)  # Python 2
except NameError:
    _STRING_TYPES = (str,)  # Python 3


class PorReader(object):
    """Tokenizer that splits a file-like object into '/'-terminated fields.

    Attributes:
        file: the underlying object; only ``read(n)`` is required.
        buffer: the chunk most recently read from ``file``.
        pos: index in ``buffer`` of the last delimiter consumed (-1 if none).
        eof: True once ``file.read`` has returned an empty chunk.
    """

    def __init__(self, file):
        """Wrap *file*, which is either a path (opened here) or a file object."""
        # Remember whether we opened the file so close() only closes handles
        # that this reader owns.
        self._owns_file = isinstance(file, _STRING_TYPES)
        if self._owns_file:
            file = open(file)
        self.file = file
        self.pos = -1
        self.buffer = ""
        self.eof = False

    def close(self):
        """Close the underlying file if this reader opened it."""
        if self._owns_file:
            self.file.close()

    def consumeOne(self, skip=False):
        """Read up to the next delimiter and return the field before it.

        Line breaks inside the field are removed.  When *skip* is true the
        field is discarded and "" is returned.  At end of input whatever was
        accumulated (possibly "") is returned and ``eof`` is set.
        """
        pieces = []
        p = self.buffer.find(DEL, self.pos + 1)
        while p == -1:
            # No delimiter in the rest of this chunk: keep the tail and refill.
            if not skip:
                pieces.append(self.buffer[self.pos + 1:])
            self.buffer = self.file.read(1024)
            self.pos = -1
            if not self.buffer:
                self.eof = True
                break
            p = self.buffer.find(DEL)
        if p != -1:
            if not skip:
                pieces.append(self.buffer[self.pos + 1:p])
            self.pos = p
        if skip:
            return ""
        # Strip CR and LF individually: depending on platform and open mode
        # the line breaks may arrive as "\r\n" or as a bare "\n".
        return "".join(pieces).replace("\r", "").replace("\n", "")

    def consume(self, n=1):
        """Return the next *n* fields as a list."""
        return [self.consumeOne() for _ in range(n)]

    def skip(self, n=1):
        """Discard the next *n* fields."""
        for _ in range(n):
            self.consumeOne(skip=True)


# Expected start of the header field checked by SPSSFile.init.
HEAD = 'SPSS for Microsoft Windows Release 15.04'

FLOAT, STR, INT = 0, 1, 2


class SPSSVariable(object):
    """Description of one variable (column) of an SPSS file.

    Attributes:
        name: variable name.
        label: optional variable label.
        numeric: True for numeric variables, False for string variables.
        decimals: number of decimals recorded for the variable.
        valuelabels: dict mapping value -> label, or None.
        index: position of the variable in a data row (set by addvar).
    """

    def __init__(self, name, label=None, numeric=True, decimals=0):
        self.name = name
        self.label = label
        self.numeric = numeric
        self.decimals = decimals
        self.valuelabels = None
        self.index = None

    def __str__(self):
        """Return 'name "label" T' with T = S (string), I (int) or F (float)."""
        if not self.numeric:
            t = 'S'
        elif self.decimals:
            t = 'F'
        else:
            t = 'I'
        label = ' "%s" ' % self.label if self.label else ''
        return "%s%s%s" % (self.name, label, t)


def splitstring(slen=None, s=None, reader=None):
    """Split a length-prefixed string off the front of *s*.

    Args:
        slen: the length, as an int or as a base-30 string.  If *s* is None,
            *slen* must be a ``(length, text)`` pair.  If *slen* is None, the
            pair is read from *reader*.
        s: the text that starts with the string.
        reader: optional PorReader used to fetch more input when the string
            contains '/' and was therefore split by the tokenizer.

    Returns:
        ``(string, rest)`` where *rest* is whatever followed the string in
        the last field that was read.

    Raises:
        Exception: if the text is shorter than *slen* and no more input can
            be read.
    """
    if slen is None:
        slen = reader.consume(2)
    if s is None:
        slen, s = slen
    if isinstance(slen, _STRING_TYPES):
        slen = readnum(slen)
    while slen > len(s):
        if not reader or getattr(reader, "eof", False):
            raise Exception("String of length %s is truncated: %r" % (slen, s))
        # The tokenizer cut the string at an embedded '/'; glue it back.
        s += "/" + reader.consumeOne()
    return s[:slen], s[slen:]


class SPSSFile(object):
    """In-memory contents of an SPSS portable file.

    Attributes:
        variables: list of SPSSVariable in file order.
        vardict: dict mapping variable name -> SPSSVariable.
        data: list of row tuples; numeric cells are numbers or None
            (missing), string cells are strings.
    """

    def __init__(self, file):
        """Parse *file* (a path or a file-like object)."""
        self.variables = []
        self.vardict = {}
        self.data = []
        self.init(file)

    def addvar(self, var):
        """Register *var* and assign it the next column index."""
        var.index = len(self.variables)
        self.variables.append(var)
        self.vardict[var.name] = var

    def getvar(self, varname):
        """Return the variable called *varname* (KeyError if unknown)."""
        return self.vardict[varname]

    def get(self, var, row):
        """Return the cell of *row* for *var* (a name or an SPSSVariable)."""
        if isinstance(var, _STRING_TYPES):
            var = self.vardict[var]
        return row[var.index]

    def init(self, file):
        """Read variables, value labels and data from *file*.

        Raises:
            Exception: if the header is not recognised or the stream is
                malformed or truncated.
        """
        r = PorReader(file)
        try:
            self._parse(r)
        finally:
            # Releases the handle only when PorReader opened it from a path.
            r.close()

    def _parse(self, r):
        """Check the header, then dispatch on each record's one-letter tag."""
        r.skip(5)
        h = r.consumeOne()
        if not h.startswith(HEAD):
            raise Exception("Cannot read .por")
        # The header is followed by a number; parse it so a malformed header
        # is rejected, but the value itself is not needed.
        readnum(h[len(HEAD):])
        r.skip(1)
        keep = r.consumeOne()
        while True:
            action = keep[:1]
            if action == '7':
                keep = self._read_variable(r, keep)
            elif action == 'D':
                keep = self._read_value_labels(r, keep)
            elif action == 'F':
                self._read_data(r, keep[1:])
                return
            elif action == 'Z':
                print("Done!")
                return
            elif not action:
                raise Exception("Cannot read .por: unexpected end of file")
            else:
                # Without this the loop would spin forever on a record type
                # that none of the branches consumes.
                raise Exception(
                    "Cannot read .por: unexpected record tag %r" % keep)

    def _read_variable(self, r, keep):
        """Parse a '7' (variable) record; return the text after its label."""
        data = r.consume(8)
        # Keep reading until the field that starts the label ('C' + length)
        # is second to last, i.e. the label text itself is the last field.
        while not data[-2].startswith('C'):
            if r.eof:
                raise Exception(
                    "Cannot read .por: variable record without label")
            data += r.consume()
        decimals = readnum(data[4])
        numeric = keep[1:] == '0'
        name, _ = splitstring(data[:2])
        labellen, label = data[-2:]
        label, keep = splitstring(labellen[1:], label, r)
        self.addvar(SPSSVariable(name, label, numeric, decimals))
        return keep

    def _read_value_labels(self, r, keep):
        """Parse a 'D' (value labels) record; return the text following it."""
        numvars = readnum(keep[1:])
        varnames = []
        keep = r.consumeOne()
        for _ in range(numvars):
            name, keep = splitstring(keep, r.consumeOne(), reader=r)
            varnames.append(name)
        numlabels = readnum(keep)
        keep = r.consumeOne()
        labels = {}
        # The first variable decides whether the values are numbers or strings.
        numeric = self.getvar(varnames[0]).numeric
        for _ in range(numlabels):
            if numeric:
                val = readnum(keep)
                name, keep = splitstring(reader=r)
            else:
                val, keep = splitstring(keep, r.consumeOne(), reader=r)
                name, keep = splitstring(keep, r.consumeOne(), reader=r)
            labels[val] = name
        # All listed variables share the same dict object.
        for varname in varnames:
            self.getvar(varname).valuelabels = labels
        return keep

    def _read_data(self, r, keep):
        """Parse the data section into self.data.

        Reading stops at a field starting with 'Z' or at end of input; a
        partially read row is discarded.
        """
        if not self.variables:
            # With no columns the row loop below would never consume input.
            return
        while True:
            row = []
            for var in self.variables:
                if not keep:
                    keep = r.consumeOne()
                    if not keep and r.eof:
                        # Truncated file: stop instead of producing rows of
                        # zeros forever.
                        return
                if keep.startswith("Z"):
                    return
                if var.numeric:
                    if keep.startswith("*."):
                        # "*." marks a missing value and has no terminator of
                        # its own; the next value follows in the same field.
                        row.append(None)
                        keep = keep[2:]
                    else:
                        try:
                            row.append(readnum(keep))
                        except Exception:
                            print(row)
                            print("Exception on %s" % var)
                            raise
                        keep = ""
                else:
                    # keep holds the length; pass the reader so that values
                    # containing '/' can be reassembled.
                    x, keep = splitstring(keep, r.consumeOne(), reader=r)
                    row.append(x)
            self.data.append(tuple(row))


# Digits for bases up to 36, in increasing value.
_DIGITS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"


def _codec(str_in, base_from=36, base_to=10):
    """Convert the digit string *str_in* from one base to another.

    Based on the Base36 Encoder/Decoder by Mike Crute (mcrute@gmail.com),
    August 26, 2008, which was placed in the public domain.

    Input is case-insensitive.  Returns the digits as a string ("0" for
    zero).  Raises ValueError if a character is not a digit of *base_from*.
    """
    from_digits = _DIGITS[:base_from]
    to_digits = _DIGITS[:base_to]

    x = 0
    for digit in str(str_in).upper():
        x = x * len(from_digits) + from_digits.index(digit)

    # Digits come out least significant first and are reversed at the end.
    result = ""
    while x > 0:
        result += to_digits[x % len(to_digits)]
        x //= len(to_digits)
    return result[::-1] or to_digits[:1]


def decode(s):
    """Return the non-negative integer written in base 30 as *s*.

    Raises:
        ValueError: if *s* contains a character that is not a base-30 digit.
    """
    s = s.lstrip("0")
    if not s:
        return 0
    try:
        return int(_codec(s, 30, 10))
    except ValueError as e:
        raise ValueError("Cannot decode %r: %s" % (s, e))


def readnum(s):
    """Parse a base-30 number of the form ``[-]int[.frac][(+|-)exp]``.

    The value is the mantissa multiplied by 30 to the power of the exponent.
    Surrounding whitespace is ignored.  An int is returned when the number
    has neither a fraction nor a negative exponent, otherwise a float.

    Raises:
        ValueError: if the text is not a valid number.
    """
    s = s.strip()
    neg = s.startswith("-")
    if neg:
        s = s[1:]

    exp = 0
    if "+" in s:
        s, e = s.split("+")
        exp = decode(e)
    elif "-" in s:
        s, e = s.split("-")
        exp = -decode(e)

    if "." in s:
        i, d = s.split(".")
    else:
        i, d = s, None
    result = decode(i)
    if d:
        for j, digit in enumerate(d):
            result += decode(digit) / 30. ** (j + 1)

    # Scale the mantissa; integer arithmetic is kept where possible so that
    # counts and lengths stay usable with range() and slicing.
    if exp > 0:
        result *= 30 ** exp
    elif exp < 0:
        result /= 30. ** -exp
    return -result if neg else result


if __name__ == '__main__':
    import sys
    fn = sys.argv[1]
    f = SPSSFile(fn)
    print("%d %d" % (len(f.variables), len(f.data)))