A simple in-memory numeric database for Python. Arrays are used to minimise memory consumption.
import array, cPickle

class pynumgrid:
    'In-memory number array database'
    def __init__( self, schema=None, fname=None ):
        '''schema -> 'field1:atype field2:atype ...' where atype is a
        python array type code, e.g. 'B' for unsigned byte (see the test below)'''
        if fname:
            self.load( fname )
        else:
            if isinstance( schema, str ):
                schema = [ f.split(':') for f in schema.split() ]
            self.schema = schema
            self.flist = [ f for f,t in schema ]
            self.data = dict( [(f, array.array(t)) for f,t in schema] )
            self.rowcount = 0
    def insert( self, record, recnum=None ):
        'Insert one record (or append if no rec number)'
        if recnum is None:
            for i, value in enumerate( record ):
                self.data[ self.flist[i] ].append( value )
        else:
            for i, value in enumerate( record ):
                self.data[ self.flist[i] ].insert( recnum, value )
        self.rowcount += 1
    def update( self, record, recnum ):
        'Change the record at recnum with updated values'
        for i, value in enumerate( record ):
            self.data[ self.flist[i] ][ recnum ] = value
    def delete( self, recnum ):
        'Delete the record at recnum, and return it'
        record = []
        for field in self.flist:
            record.append( self.data[ field ].pop( recnum ) )
        self.rowcount -= 1
        return record
    def extend( self, colset ):
        'Append a block of rows supplied as parallel columns (fast)'
        for i, col in enumerate( colset ):
            self.data[ self.flist[i] ].extend( col )
        self.rowcount += len( colset[0] )
    def select( self, query, ns=None ):
        'Execute a python expression over the columns plus the ns namespace'
        ns = dict( ns or {} )   # copy, so the caller's namespace is untouched
        ns.update( dict( _en=enumerate, _rc=xrange(self.rowcount),
                         array=array.array ) )
        return eval( query, self.data, ns )
    def fetchlist( self, reclist, cols=None ):
        'Return a list of records given a list of record numbers'
        cols = cols or self.flist
        colset = [ self.data[f] for f in cols ]
        return [ [d[i] for d in colset] for i in reclist ]
    def save( self, fname ):
        'Write the db to disk files (FAST!)'
        cfg = (self.schema, self.rowcount)
        cfgfh = open( fname+'.cfg', 'wb' )
        cPickle.dump( cfg, cfgfh, -1 )
        cfgfh.close()
        for field in self.flist:
            fh = open( '%s__%s.dat' % (fname,field), 'wb' )
            self.data[ field ].tofile( fh )
            fh.close()
    def load( self, fname ):
        'Read a saved db from disk, appending its rows to this one (FAST!)'
        cfgfh = open( fname+'.cfg', 'rb' )
        schema, newrowcount = cPickle.load( cfgfh )
        cfgfh.close()
        if not hasattr( self, 'data' ):
            self.__init__( schema )
        for field in self.flist:
            fh = open( '%s__%s.dat' % (fname,field), 'rb' )
            try: self.data[ field ].fromfile( fh, newrowcount )
            except EOFError: pass
            fh.close()
        self.rowcount += newrowcount

if __name__ == '__main__':
    # Test with "prog -i -s" to build, then "prog -l -q" to test a query.
    import sys
    # Age (years), height (cm), weight (kg) for a million people:
    pyg = pynumgrid( 'age:B height:B weight:B' )
    blocksize = 10**5
    blocks = 10
    if '-i' in sys.argv:
        # Deterministic pseudo-random columns, built one block at a time:
        agelist = [ (i*1779+517)%80+18 for i in xrange(blocksize) ]
        heightlist = [ (i*5111+711)%90+120 for i in xrange(blocksize) ]
        weightlist = [ (i*3113+797)%100+50 for i in xrange(blocksize) ]
        for j in xrange(blocks):
            pyg.extend( (agelist, heightlist, weightlist) )
    if '-s' in sys.argv:
        pyg.save( '/tmp/pynumgrid' )
    if '-l' in sys.argv:
        pyg.load( '/tmp/pynumgrid' )
    if '-q' in sys.argv:
        print 'The maximum height (in cm) of people in their 30s:',
        print pyg.select('max(height[i] for i in _rc if 29<age[i]<40)')
        print pyg.fetchlist( range(100,110), cols=['weight','height'] )
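To show the API in use, here is a minimal interactive sketch (the field names and values are illustrative only):

db = pynumgrid( 'x:i y:d' )     # a signed-int column and a double column
db.insert( (3, 1.5) )           # append record 0
db.insert( (7, 2.5) )           # append record 1
db.update( (7, 9.9), 1 )        # overwrite record 1 in place
print db.select( '[y[i] for i in _rc if x[i] > 5]' )    # -> [9.9]
print db.fetchlist( [0, 1] )    # -> [[3, 1.5], [7, 9.9]]
db.delete( 0 )                  # pops record 0 from every column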
This is a simple solution for managing data stores with millions of rows on a machine with limited memory. Lists of tuples grew too large, and I did not have time to integrate a larger package (such as numpy).
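To give a feel for the savings (the figures are CPython-specific and approximate), compare a one-byte-per-value 'B' array with an ordinary list of Python ints:

import array, sys
ages = [ i % 80 + 18 for i in xrange(10**6) ]   # hypothetical sample column
print sys.getsizeof( array.array('B', ages) )   # ~1 MB: one byte per value
print sys.getsizeof( ages )                     # ~8 MB of pointers alone,
                                                # before the int objects themselves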
It saves and loads very fast (under 100 milliseconds in my tests). You can ingest the data and run queries in a second or two. Note that the select mechanism is based on eval(), so it is unsafe anywhere untrusted users can supply queries.
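If queries must come from semi-trusted sources, one partial mitigation is to strip builtins from the evaluation namespace. This is only a sketch (safer_select is a hypothetical helper, and an empty __builtins__ is not a real sandbox; it can still be escaped through object introspection):

def safer_select( db, query ):
    'Evaluate query with builtins stripped and a few helpers whitelisted'
    glb = dict( db.data )       # copy, so eval cannot pollute the columns
    glb['__builtins__'] = {}
    glb.update( max=max, min=min, sum=sum, len=len,
                _en=enumerate, _rc=xrange(db.rowcount) )
    return eval( query, glb )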
For more serious applications I recommend numpy or PyTables: they are faster, well tested, and well supported.
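For comparison, the test query above might look like this in numpy (a sketch, assuming numpy is installed and reusing the agelist/heightlist/blocks values from the test program):

import numpy as np
age = np.array( agelist * blocks, dtype=np.uint8 )
height = np.array( heightlist * blocks, dtype=np.uint8 )
print height[ (age > 29) & (age < 40) ].max()   # boolean-mask selection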