This fork adds the ability to extract only a selected subset of fields, chosen by name. With huge files this might be necessary on some machines.
Also, assuming that null in a dbf file means zero might be a mistake, so the fork adds an argument, nullreplace, as a way to choose what to replace null with. Null is sometimes used to mean a missing value. This change is decoupled from the selective names feature.
import struct, datetime, decimal, itertools
from collections import namedtuple
# Descriptor for one column of a DBF record.
#   name/typ/size/deci -- values taken from the field descriptor in the header
#   fmt, fmtsiz        -- struct format for the raw column and its byte size
#   keep               -- True when the column was requested by the caller
#   seekme             -- byte offset of the column from the start of a record
FI = namedtuple('FieldInfo', ('name', 'typ', 'size', 'deci',
                              'fmt', 'fmtsiz', 'keep', 'seekme'))


def _dbf_text(raw):
    """Return raw file bytes as the native str type.

    On Python 2 the bytes read from the file already are str and are
    returned untouched.  On Python 3 they are decoded as latin-1, which maps
    every byte to exactly one character, so sizes and offsets are preserved.
    """
    if isinstance(raw, str):
        return raw
    return raw.decode('latin-1')


def dbfreader(f, names, nullreplace=None):
    """Returns an iterator over records in a Xbase DBF file.

    The first row returned contains the field names.  The second row
    contains field specs: (type, size, decimal places).  Subsequent rows
    contain the data records.  If a record is marked as deleted, it is
    skipped.

    f must be a seekable file opened for binary reads.

    names is the collection of field names to extract (tested with ``in``,
    so pass a list, tuple or set rather than a single string).  Only those
    columns are read from each record; the rows keep the file's column order.

    The value of nullreplace is returned for fields of type 'N' that are
    empty once NUL bytes and leading blanks are removed.

    Conversions applied per field type:
        N -- int, or decimal.Decimal when the field has decimal places
        D -- datetime.date parsed from YYYYMMDD
        L -- 'T', 'F' or '?'
        F -- float
        anything else -- the raw, unstripped text of the field
    """
    # See DBF format spec at:
    #     http://www.pgts.com.au/download/public/xbase.htm#DBF_STRUCT
    numrec, lenheader = struct.unpack('<xxxxLH22x', f.read(32))
    numfields = (lenheader - 33) // 32

    # The deletion flag is modelled as an always-kept pseudo field at
    # offset 0; it is dropped again before rows are yielded.
    fields = [FI('DeletionFlag', 'C', 1, 0,
                 '1s', struct.calcsize('1s'), True, 0)]
    for _ in range(numfields):
        name, typ, size, deci = struct.unpack('<11sc4xBB14x', f.read(32))
        name = _dbf_text(name).replace('\0', '')    # eliminate NULs from string
        typ = _dbf_text(typ)
        fmt = str(size) + 's'
        prev = fields[-1]
        fields.append(FI(name, typ, size, deci, fmt, struct.calcsize(fmt),
                         name in names, prev.seekme + prev.size))

    selfields = [field for field in fields if field.keep]
    # Total bytes per record, including columns that were not selected.
    reclen = fields[-1].seekme + fields[-1].fmtsiz

    yield [field.name for field in selfields[1:]]
    yield [tuple(field[1:4]) for field in selfields[1:]]

    terminator = f.read(1)
    assert terminator == b'\r', 'missing DBF header terminator'

    # itertools.repeat keeps the loop lazy on both Python 2 and Python 3.
    for _ in itertools.repeat(None, numrec):
        refaddr = f.tell()
        record = []
        for field in selfields:
            f.seek(refaddr + field.seekme)
            raw = struct.unpack(field.fmt, f.read(field.fmtsiz))[0]
            record.append(_dbf_text(raw))

        # Re-synchronise on the start of the next record *before* deciding
        # whether to skip this one.  Otherwise a deleted record whose last
        # column was not selected leaves the file position mid-record and
        # every following record is read from the wrong offset.
        f.seek(refaddr + reclen)

        if record[0] != ' ':
            continue                                # deleted record

        result = []
        for sf, value in zip(selfields, record):
            if sf.name == 'DeletionFlag':
                continue
            if sf.typ == 'N':
                value = value.replace('\0', '').lstrip()
                if value == '':
                    value = nullreplace
                elif sf.deci:
                    value = decimal.Decimal(value)
                else:
                    value = int(value)
            elif sf.typ == 'D':
                y, m, d = int(value[:4]), int(value[4:6]), int(value[6:8])
                value = datetime.date(y, m, d)
            elif sf.typ == 'L':
                if value in 'YyTt':
                    value = 'T'
                elif value in 'NnFf':
                    value = 'F'
                else:
                    value = '?'
            elif sf.typ == 'F':
                value = float(value)
            result.append(value)
        yield result
def _dbf_bytes(text):
    """Return text as a byte string ready to be written to the file.

    Byte strings (str on Python 2) pass through untouched; text is encoded
    as latin-1 so that one character becomes exactly one byte.
    """
    if isinstance(text, bytes):
        return text
    return text.encode('latin-1')


def dbfwriter(f, fieldnames, fieldspecs, records, nullreplace=None):
    """Write records to f as an Xbase DBF file.  Returns None.

    File f should be open for writing in a binary mode.

    Fieldnames should be no longer than ten characters and not include \\x00.
    Fieldspecs are in the form (type, size, deci) where
        type is one of:
            C for ascii character data
            M for ascii character memo data (real memo fields not supported)
            D for datetime objects
            N for ints or decimal objects
            L for logical values 'T', 'F', or '?'
        size is the field width
        deci is the number of decimal places in the provided decimal object
    Records can be an iterable over the records (sequences of field values).
    An iterable without a length (e.g. a generator) is read into a list
    first, because the record count has to be written in the header.

    The value of nullreplace is compared with values of type N and, if
    equal, replaced with '\\0' in the output.

    Raises AssertionError when a formatted value does not have exactly the
    width declared for its field.
    """
    # The header stores the record count, so it must be known up front.
    try:
        numrec = len(records)
    except TypeError:
        records = list(records)
        numrec = len(records)

    # header info
    ver = 3
    now = datetime.datetime.now()
    yr, mon, day = now.year - 1900, now.month, now.day
    numfields = len(fieldspecs)
    lenheader = numfields * 32 + 33
    lenrecord = sum(field[1] for field in fieldspecs) + 1   # +1: deletion flag
    hdr = struct.pack('<BBBBLHH20x', ver, yr, mon, day,
                      numrec, lenheader, lenrecord)
    f.write(hdr)

    # field specs
    for name, (typ, size, deci) in zip(fieldnames, fieldspecs):
        name = _dbf_bytes(name).ljust(11, b'\x00')
        fld = struct.pack('<11sc4xBB14x', name, _dbf_bytes(typ), size, deci)
        f.write(fld)

    # terminator
    f.write(b'\r')

    # records
    for record in records:
        f.write(b' ')                       # deletion flag
        for (typ, size, deci), value in zip(fieldspecs, record):
            if typ == 'N':
                if value != nullreplace:
                    value = str(value).rjust(size, ' ')
                else:
                    value = '\0'.rjust(size, ' ')
            elif typ == 'D':
                value = value.strftime('%Y%m%d')
            elif typ == 'L':
                value = str(value)[0].upper()
            else:
                # Byte strings are written as-is; str() on them would
                # produce a "b'...'" repr on Python 3.
                if not isinstance(value, bytes):
                    value = str(value)
                value = _dbf_bytes(value)[:size].ljust(size, b' ')
            value = _dbf_bytes(value)
            assert len(value) == size
            f.write(value)

    # End of file
    f.write(b'\x1A')
|
The dbf file format is old, precise, commonplace, and widely supported by everything from calendar software in PDAs, to contact managers, to Excel and Access. It can be a good way to get Python to interoperate with pre-existing, non-Python apps.