Welcome, guest | Sign In | My Account | Store | Cart

This fork lets the caller extract only a selected subset of field names. With huge files this might be necessary on some machines.

Also, assuming that a null in a dbf file means zero might be a mistake, so the fork adds a nullreplace argument as a way to choose what to replace null with. Null is sometimes used to mean a missing value. This change is independent of the selective field names feature.

Python, 138 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import struct, datetime, decimal, itertools
from collections import namedtuple

# Per-field layout record used by dbfreader:
#   name, typ, size, deci -- values from the field descriptor in the header
#   fmt, fmtsiz           -- struct format for the raw field and its byte size
#   keep                  -- True when the field was requested by the caller
#   seekme                -- byte offset of the field from the record start
FI = namedtuple('FieldInfo', ('name', 'typ', 'size', 'deci',
                              'fmt', 'fmtsiz', 'keep', 'seekme'))


def _dbf_text(raw):
    """Return raw as the native str type.

    On Python 2 the byte strings read from the file already are str and
    are returned unchanged. On Python 3 they are decoded as Latin-1,
    which maps every byte to exactly one character and never fails.
    """
    if isinstance(raw, str):
        return raw
    return raw.decode('latin-1')


def dbfreader(f, names, nullreplace=None):
    """Returns an iterator over records in a Xbase DBF file.

    The first row returned contains the field names. The second row
    contains field specs: (type, size, decimal places). Subsequent rows
    contain the data records. If a record is marked as deleted, it is
    skipped. Only the fields whose name is in names appear in any row.

    Field values are converted by type:
        N  int, or decimal.Decimal when the field has decimal places;
           a blank (spaces/NUL only) value becomes nullreplace
        D  datetime.date, or None when the field is blank
        L  'T', 'F' or '?'
        F  float
        anything else is returned as the raw text, padding included

    names is the field names to extract. The value of nullreplace is
    used with data of type 'N' as a replacement for a null value.

    File should be opened for binary reads and must support seek/tell.

    Raises ValueError if the header is not followed by the '\\r'
    terminator byte; struct.error if the file is truncated.

    """
    # See DBF format spec at:
    # http://www.pgts.com.au/download/public/xbase.htm#DBF_STRUCT

    numrec, lenheader = struct.unpack('<xxxxLH22x', f.read(32))
    numfields = (lenheader - 33) // 32

    # Pseudo-field for the one-byte deletion flag that starts every record.
    # It is always read (keep=True) but never reported to the caller.
    fields = [FI('DeletionFlag', 'C', 1, 0,
                 '1s', struct.calcsize('1s'), True, 0)]

    for _ in range(numfields):
        name, typ, size, deci = struct.unpack('<11sc4xBB14x', f.read(32))
        name = _dbf_text(name).replace('\0', '')    # eliminate NULs from string
        typ = _dbf_text(typ)
        fmt = str(size) + 's'
        prev = fields[-1]
        fields.append(FI(name, typ, size, deci, fmt, struct.calcsize(fmt),
                         name in names, prev.seekme + prev.size))

    selfields = [field for field in fields if field.keep]
    datafields = selfields[1:]              # everything but the deletion flag
    yield [field.name for field in datafields]
    yield [tuple(field[1:4]) for field in datafields]

    terminator = f.read(1)
    if terminator != b'\r':
        raise ValueError('DBF header terminator not found: got %r'
                         % (terminator,))

    # Full on-disk record length, including the fields that are not selected.
    reclen = fields[-1].seekme + fields[-1].size

    for _ in itertools.repeat(None, numrec):
        refaddr = f.tell()
        record = []
        for field in selfields:
            f.seek(refaddr + field.seekme)
            record.append(struct.unpack(field.fmt, f.read(field.fmtsiz))[0])

        # Always step to the start of the next record, whether or not this
        # one is kept; otherwise a deleted record would leave the position
        # wherever the last selected field happened to end.
        f.seek(refaddr + reclen)

        if record[0] != b' ':
            continue                        # deleted record

        result = []
        for sf, raw in zip(datafields, record[1:]):
            value = _dbf_text(raw)
            if sf.typ == 'N':
                value = value.replace('\0', '').lstrip()
                if value == '':
                    value = nullreplace
                elif sf.deci:
                    value = decimal.Decimal(value)
                else:
                    value = int(value)
            elif sf.typ == 'D':
                if value.strip(' \0') == '':
                    value = None            # blank date: no value stored
                else:
                    y, m, d = int(value[:4]), int(value[4:6]), int(value[6:8])
                    value = datetime.date(y, m, d)
            elif sf.typ == 'L':
                if value in 'YyTt':
                    value = 'T'
                elif value in 'NnFf':
                    value = 'F'
                else:
                    value = '?'
            elif sf.typ == 'F':
                value = float(value)
            result.append(value)
        yield result

def _dbf_bytes(value):
    """Return value as a byte string.

    Byte strings pass through unchanged; anything else is converted with
    str() and, where str is a text type (Python 3), encoded as Latin-1.
    """
    if not isinstance(value, bytes):
        value = str(value)
        if not isinstance(value, bytes):
            value = value.encode('latin-1')
    return value


def dbfwriter(f, fieldnames, fieldspecs, records, nullreplace=None):
    """Write records to f in dbf format. Returns None.

    File f should be open for writing in a binary mode.

    Fieldnames should be no longer than ten characters and not include \\x00.
    Fieldspecs are in the form (type, size, deci) where
        type is one of:
            C for ascii character data
            M for ascii character memo data (real memo fields not supported)
            D for datetime objects (None is written as a blank date)
            N for ints or decimal objects
            L for logical values 'T', 'F', or '?'
        size is the field width
        deci is the number of decimal places in the provided decimal object
    Records can be an iterable over the records (sequences of field values).
    The record count goes into the header before any record is written, so
    an iterable without a length is first collected into a list.

    The value of nullreplace is compared with values of type N and, if
    equal, replaced with a NUL byte (right-justified in the field) in the
    output.

    Raises ValueError when a formatted value does not have exactly the
    field's width, e.g. a number with more digits than size allows.

    """
    try:
        numrec = len(records)
    except TypeError:                       # generator or other unsized iterable
        records = list(records)
        numrec = len(records)

    # header info
    ver = 3
    now = datetime.datetime.now()
    yr, mon, day = now.year - 1900, now.month, now.day
    numfields = len(fieldspecs)
    lenheader = numfields * 32 + 33
    lenrecord = sum(field[1] for field in fieldspecs) + 1   # +1: deletion flag
    hdr = struct.pack('<BBBBLHH20x', ver, yr, mon, day, numrec, lenheader, lenrecord)
    f.write(hdr)

    # field specs
    for name, (typ, size, deci) in zip(fieldnames, fieldspecs):
        name = _dbf_bytes(name).ljust(11, b'\x00')
        fld = struct.pack('<11sc4xBB14x', name, _dbf_bytes(typ), size, deci)
        f.write(fld)

    # terminator
    f.write(b'\r')

    # Type codes as bytes, computed once instead of once per value.
    layout = [(_dbf_bytes(typ), size) for typ, size, deci in fieldspecs]

    # records
    for record in records:
        f.write(b' ')                       # deletion flag
        for (typ, size), value in zip(layout, record):
            if typ == b'N':
                if value != nullreplace:
                    raw = _dbf_bytes(value).rjust(size, b' ')
                else:
                    raw = b'\0'.rjust(size, b' ')
            elif typ == b'D':
                if value is None:
                    raw = b' ' * size       # blank date
                else:
                    raw = _dbf_bytes(value.strftime('%Y%m%d'))
            elif typ == b'L':
                raw = _dbf_bytes(value)[:1].upper()
            else:
                raw = _dbf_bytes(value)[:size].ljust(size, b' ')
            if len(raw) != size:
                raise ValueError('value %r does not fit a %s field of width %d'
                                 % (value, _dbf_bytes(typ).decode('latin-1'), size))
            f.write(raw)

    # End of file
    f.write(b'\x1A')

The dbf file format is old, precise, commonplace, and widely supported by everything from calendar software in PDAs, to contact managers, to Excel and Access. It can be a good way to get Python to interoperate with pre-existing, non-Python apps.