
A simple in-memory numeric database for Python. Arrays are used to minimise memory consumption.

import array, cPickle

class pynumgrid:
	'In-memory number array database'
	def __init__( self, schema=None, fname=None ):
		'''schema -> 'field1:atype field2:atype ...' where atype is a
			python array type code (see eg below)'''
		if fname:
			self.load( fname )
		else:
			if isinstance( schema, str ):
				schema = [ f.split(':') for f in schema.split() ]
			self.schema = schema
			self.flist = [ f for f,t in schema ]
			self.data = dict( [(f, array.array(t)) for f,t in schema] )
			self.rowcount = 0
	def insert( self, record, recnum=None ):
		'Insert one record (or append if no rec number)'
		if recnum is None:
			for i, value in enumerate( record ):
				self.data[ self.flist[i] ].append( value )
		else:
			for i, value in enumerate( record ):
				self.data[ self.flist[i] ].insert( recnum, value )
		self.rowcount += 1
	def update( self, record, recnum ):
		'Change the record at recnum with updated values'
		for i, value in enumerate( record ):
			self.data[ self.flist[i] ][ recnum ] = value
	def delete( self, recnum ):
		'Delete the record at recnum, and return it'
		record = []
		for field in self.flist:
			record.append( self.data[ field ].pop( recnum ) )
		self.rowcount -= 1
		return record
	def extend( self, colset ):
		'Add a set of columns to the database (fast)'
		for i, col in enumerate( colset ):
			self.data[ self.flist[i] ].extend( col )
		self.rowcount += len( colset[0] )
	def select( self, query, ns=None ):
		'Execute a python expression using data and ns namespace'
		ns = dict( ns or {} )  # avoid mutating a shared default dict
		ns.update( dict( _en=enumerate, _rc=xrange(self.rowcount),
			array=array.array ) )
		return eval( query, self.data, ns )
	def fetchlist( self, reclist, cols=[] ):
		'Return a list of records given a list of record numbers'
		cols = cols or self.flist
		colset = [ self.data[f] for f in cols ]
		return [ [d[i] for d in colset] for i in reclist ]
	def save( self, fname ):
		'Write the db to disk files (FAST!)'
		cfg = (self.schema, self.rowcount)
		cPickle.dump( cfg, open(fname+'.cfg','wb'), -1 )
		for field in self.flist:
			fh = open( '%s__%s.dat' % (fname,field), 'wb' )
			self.data[ field ].tofile( fh )
			fh.close()  # close explicitly so data hits disk before load()
	def load( self, fname ):
		'Read and/or build db on to this database (FAST!)'
		cfg = cPickle.load( open(fname+'.cfg','rb') )
		schema, newrowcount = cfg
		if not hasattr( self, 'data' ):
			self.__init__( schema )
		for field in self.flist:
			fh = open( '%s__%s.dat' % (fname,field), 'rb' )
			try: self.data[ field ].fromfile( fh, newrowcount )
			except EOFError: pass
			fh.close()
		self.rowcount += newrowcount

if __name__ == '__main__':
	# Test with "prog -i -s" to build then "prog -l -q" to test a query.
	import sys
	# Age (years), height (cm), weight (kg) for a million people:
	pyg = pynumgrid( 'age:B height:B weight:B' )
	blocksize = 10**5
	blocks = 10
	if '-i' in sys.argv:
		agelist = [ (i*1779+517)%80+18 for i in xrange(blocksize) ]
		heightlist = [ (i*5111+711)%90+120 for i in xrange(blocksize) ]
		weightlist = [ (i*3113+797)%100+50 for i in xrange(blocksize) ]
		for j in xrange(blocks):
			pyg.extend( (agelist, heightlist, weightlist) )
	if '-s' in sys.argv:
		pyg.save( '/tmp/pynumgrid' )
	if '-l' in sys.argv:
		pyg.load( '/tmp/pynumgrid' )
	if '-q' in sys.argv:
		print 'The maximum height (in cm) of people in their 30s:',
		print pyg.select('max(height[i] for i in _rc if 29<age[i]<40)')
		print pyg.fetchlist( range(100,110), cols=['weight','height'] )

This is a simple solution for managing data stores with millions of rows on a limited-memory machine. Lists of tuples consumed too much memory, and I did not have time to integrate a larger package (such as numpy).
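
To illustrate the API, here is a minimal usage sketch; the module name, schema, and data are made up for the example (it assumes the class above is saved as pynumgrid.py):

from pynumgrid import pynumgrid

# 'l' and 'd' are array type codes: signed long and double float.
db = pynumgrid( 'id:l price:d qty:l' )

db.insert( (1, 9.99, 3) )        # append a record
db.insert( (2, 4.50, 10) )
db.update( (2, 4.75, 12), 1 )    # overwrite record number 1
db.insert( (0, 1.25, 1), 0 )     # insert at record number 0

print db.fetchlist( range(db.rowcount) )
# Columns are visible by name inside select(); _rc = xrange(rowcount).
print db.select( 'sum(price[i]*qty[i] for i in _rc)' )

removed = db.delete( 0 )         # remove record 0 and get it back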

It saves and loads very fast (under 100 milliseconds). You can ingest the data and run queries in a second or two. Note that the select mechanism is based on eval(), so it is unsafe anywhere untrusted users can submit queries.
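
If you must accept queries from elsewhere, one partial mitigation (a sketch of my own, not part of the recipe) is to evaluate against an explicit whitelist of builtins, so a query can call max() or sum() but nothing else. This is still not a real sandbox; eval() of hostile input can never be made truly safe:

import array

# Hypothetical wrapper: only these builtins are reachable from a query.
SAFE_BUILTINS = { 'max': max, 'min': min, 'sum': sum, 'len': len, 'abs': abs }

def safe_select( db, query, ns=None ):
	ns = dict( ns or {} )
	ns.update( _en=enumerate, _rc=xrange(db.rowcount), array=array.array )
	# Copy the column dict so eval() cannot inject the real __builtins__.
	glb = dict( db.data )
	glb['__builtins__'] = SAFE_BUILTINS
	return eval( query, glb, ns )

With this, queries such as 'max(height[i] for i in _rc)' still work, but open(), __import__() and friends raise NameError.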

For more serious applications I recommend numpy or pytables. They are faster, well tested, and well supported.
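
For comparison, here is a rough numpy sketch of the demo's query (assuming numpy is installed; the synthetic data mirrors the generator in the demo above). The whole-column expression runs in C, with no eval():

import numpy as np

blocksize = 10**5
age    = np.array( [ (i*1779+517)%80+18  for i in xrange(blocksize) ]*10,
                   dtype=np.uint8 )
height = np.array( [ (i*5111+711)%90+120 for i in xrange(blocksize) ]*10,
                   dtype=np.uint8 )

# Maximum height (cm) of people in their 30s, as a boolean-mask query:
mask = (age >= 30) & (age < 40)
print height[mask].max()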