PyDbLite, a small in-memory database engine « Python recipes

A small, fast, in-memory database management program

The database object supports the iterator protocol, so that requests can be expressed with list comprehensions or generator expressions instead of SQL. The equivalent of :

cursor.execute("SELECT name FROM table WHERE age=30")
rows = cursor.fetchall()

is :

rows = table(age=30)

The module stores data in a cPickled file. Records are indexed by a unique record identifier, that can be used for direct access. Since operations are processed in memory they are extremely fast, nearly as fast as SQLite in the few tests I made, and MUCH faster than other pure-Python modules such as Gadfly or KirbyBase. An index can be created on a field to even speed up selections

Concurrency control is supported by a version number set for each record

Complete documentation is here

      """PyDbLite.py

In-memory database management, with selection by list comprehension 
or generator expression

Fields are untyped : they can store anything that can be pickled.
Selected records are returned as dictionaries. Each record is 
identified by a unique id and has a version number incremented
at every record update, to detect concurrent access

Syntax :
    from PyDbLite import Base
    db = Base('dummy')
    # create new base with field names
    db.create('name','age','size')
    # existing base
    db.open()
    # insert new record
    db.insert(name='homer',age=23,size=1.84)
    # records are dictionaries with a unique integer key __id__
    # selection by list comprehension
    res = [ r for r in db if 30 > r['age'] >= 18 and r['size'] < 2 ]
    # or generator expression
    for r in (r for r in db if r['name'] in ('homer','marge') ):
    # simple selection (equality test)
    res = db(age=30)
    # delete a record or a list of records
    db.delete(one_record)
    db.delete(list_of_records)
    # delete a record by its id
    del db[rec_id]
    # direct access by id
    record = db[rec_id] # the record such that record['__id__'] == rec_id
    # create an index on a field
    db.create_index('age')
    # access by index
    records = db._age[23] # returns the list of records with age == 23
    # update
    db.update(record,age=24)
    # add and drop fields
    db.add_field('new_field')
    db.drop_field('name')
    # save changes on disk
    db.commit()
"""

import os
import cPickle
import bisect

# compatibility with Python 2.3
try:
    set([])
except NameError:
    from sets import Set as set
    
class Index:
    """Class used for indexing a base on a field
    The instance of Index is an attribute the Base instance"""

    def __init__(self,db,field):
        self.db = db # database object (instance of Base)
        self.field = field # field name

    def __iter__(self):
        return iter(self.db.indices[self.field])

    def keys(self):
        return self.db.indices[self.field].keys()

    def __getitem__(self,key):
        """Lookup by key : return the list of records where
        field value is equal to this key, or an empty list"""
        ids = self.db.indices[self.field].get(key,[])
        return [ self.db.records[_id] for _id in ids ]

class Base:

    def __init__(self,basename):
        self.name = basename

    def create(self,*fields,**kw):
        """Create a new base with specified field names
        A keyword argument mode can be specified ; it is used if a file
        with the base name already exists
        - if mode = 'open' : open the existing base, ignore the fields
        - if mode = 'override' : erase the existing base and create a
        new one with the specified fields"""
        self.mode = mode = kw.get("mode",None)
        if os.path.exists(self.name):
            if not os.path.isfile(self.name):
                raise IOError,"%s exists and is not a file" %self.name
            elif mode is None:
                raise IOError,"Base %s already exists" %self.name
            elif mode == "open":
                return self.open()
            elif mode == "override":
                os.remove(self.name)
        self.fields = list(fields)
        self.records = {}
        self.next_id = 0
        self.indices = {}
        self.commit()
        return self

    def create_index(self,*fields):
        """Create an index on the specified field names
        
        An index on a field is a mapping between the values taken by the field
        and the sorted list of the ids of the records whose field is equal to 
        this value
        
        For each indexed field, an attribute of self is created, an instance 
        of the class Index (see above). Its name it the field name, with the
        prefix _ to avoid name conflicts
        """
        reset = False
        for f in fields:
            if not f in self.fields:
                raise NameError,"%s is not a field name" %f
            # initialize the indices
            if self.mode == "open" and f in self.indices:
                continue
            reset = True
            self.indices[f] = {}
            for _id,record in self.records.iteritems():
                # use bisect to quickly insert the id in the list
                bisect.insort(self.indices[f].setdefault(record[f],[]),
                    _id)
            # create a new attribute of self, used to find the records
            # by this index
            setattr(self,'_'+f,Index(self,f))
        if reset:
            self.commit()

    def open(self):
        """Open an existing database and load its content into memory"""
        _in = open(self.name) # don't specify binary mode !
        self.fields = cPickle.load(_in)
        self.next_id = cPickle.load(_in)
        self.records = cPickle.load(_in)
        self.indices = cPickle.load(_in)
        for f in self.indices.keys():
            setattr(self,'_'+f,Index(self,f))
        _in.close()
        self.mode = "open"
        return self

    def commit(self):
        """Write the database to a file"""
        out = open(self.name,'wb')
        cPickle.dump(self.fields,out)
        cPickle.dump(self.next_id,out)
        cPickle.dump(self.records,out)
        cPickle.dump(self.indices,out)
        out.close()

    def insert(self,*args,**kw):
        """Insert a record in the database
        Parameters can be positional or keyword arguments. If positional
        they must be in the same order as in the create() method
        If some of the fields are missing the value is set to None
        Returns the record identifier
        """
        if args:
            kw = dict([(f,arg) for f,arg in zip(self.fields,args)])
        # initialize all fields to None
        record = dict([(f,None) for f in self.fields])
        # set keys and values
        for (k,v) in kw.iteritems():
            record[k]=v
        # add the key __id__ : record identifier
        record['__id__'] = self.next_id
        # add the key __version__ : version number
        record['__version__'] = 0
        # create an entry in the dictionary self.records, indexed by __id__
        self.records[self.next_id] = record
        # update index
        for ix in self.indices.keys():
            bisect.insort(self.indices[ix].setdefault(record[ix],[]),
                self.next_id)
        # increment the next __id__ to attribute
        self.next_id += 1
        return record['__id__']

    def delete(self,removed):
        """Remove a single record, or the records in an iterable
        Before starting deletion, test if all records are in the base
        and don't have twice the same __id__
        Return the number of deleted items
        """
        if isinstance(removed,dict):
            # remove a single record
            removed = [removed]
        else:
            # convert iterable into a list (to be able to sort it)
            removed = [ r for r in removed ]
        if not removed:
            return 0
        _ids = [ r['__id__'] for r in removed ]
        _ids.sort()
        keys = set(self.records.keys())
        # check if the records are in the base
        if not set(_ids).issubset(keys):
            missing = list(set(_ids).difference(keys))
            raise IndexError,'Delete aborted. Records with these ids' \
                ' not found in the base : %s' %str(missing)
        # raise exception if duplicate ids
        for i in range(len(_ids)-1):
            if _ids[i] == _ids[i+1]:
                raise IndexError,"Delete aborted. Duplicate id : %s" %_ids[i]
        deleted = len(removed)
        while removed:
            r = removed.pop()
            _id = r['__id__']
            # remove id from indices
            for indx in self.indices.keys():
                pos = bisect.bisect(self.indices[indx][r[indx]],_id)-1
                del self.indices[indx][r[indx]][pos]
                if not self.indices[indx][r[indx]]:
                    del self.indices[indx][r[indx]]
            # remove record from self.records
            del self.records[_id]
        return deleted

    def update(self,record,**kw):
        """Update the record with new keys and values and update indices"""
        # update indices
        _id = record["__id__"]
        for indx in self.indices.keys():
            if indx in kw.keys():
                if record[indx] == kw[indx]:
                    continue
                # remove id for the old value
                old_pos = bisect.bisect(self.indices[indx][record[indx]],_id)-1
                del self.indices[indx][record[indx]][old_pos]
                if not self.indices[indx][record[indx]]:
                    del self.indices[indx][record[indx]]
                # insert new value
                bisect.insort(self.indices[indx].setdefault(kw[indx],[]),_id)
        # update record values
        record.update(kw)
        # increment version number
        record["__version__"] += 1

    def add_field(self,field,default=None):
        if field in self.fields + ["__id__","__version__"]:
            raise ValueError,"Field %s already defined" %field
        for r in self:
            r[field] = default
        self.fields.append(field)
        self.commit()
    
    def drop_field(self,field):
        if field in ["__id__","__version__"]:
            raise ValueError,"Can't delete field %s" %field
        self.fields.remove(field)
        for r in self:
            del r[field]
        if field in self.indices:
            del self.indices[field]
        self.commit()

    def __call__(self,**kw):
        """Selection by field values
        db(key=value) returns the list of records where r[key] = value"""
        for key in kw:
            if not key in self.fields:
                raise ValueError,"Field %s not in the database" %key
        def sel_func(r):
            for key in kw:
                if not r[key] == kw[key]:
                    return False
            return True
        return [ r for r in self if sel_func(r) ]
    
    def __getitem__(self,record_id):
        """Direct access by record id"""
        return self.records[record_id]
    
    def __len__(self):
        return len(self.records)

    def __delitem__(self,record_id):
        """Delete by record id"""
        self.delete(self[record_id])
        
    def __iter__(self):
        """Iteration on the records"""
        return self.records.itervalues()

if __name__ == '__main__':
    # test on a 1000 record base
    import random
    import datetime
    names = ['pierre','claire','simon','camille','jean',
                 'florence','marie-anne']
    db = Base('PyDbLite_test')
    db.create('name','age','size','birth',mode="override")
    for i in range(1000):
        db.insert(name=unicode(random.choice(names)),
             age=random.randint(7,47),size=random.uniform(1.10,1.95),
             birth=datetime.date(1990,10,10))
    db.create_index('age')
    db.commit()

    print 'Record #20 :',db[20]
    print '\nRecords with age=30 :'
    for rec in db._age[30]:
        print '%-10s | %2s | %s' %(rec['name'],rec['age'],round(rec['size'],2))

    print "\nSame with __call__"
    for rec in db(age=30):
        print '%-10s | %2s | %s' %(rec['name'],rec['age'],round(rec['size'],2))
    print db._age[30] == db(age=30)

    db.insert(name=unicode(random.choice(names))) # missing fields
    print '\nNumber of records with 30 <= age < 33 :',
    print sum([1 for r in db if 33 > r['age'] >= 30])
    
    print db.delete([])

    d = db.delete([r for r in db if 32> r['age'] >= 30 and r['name']==u'pierre'])
    print "\nDeleting %s records with name == 'pierre' and 30 <= age < 32" %d
    print '\nAfter deleting records '
    for rec in db._age[30]:
        print '%-10s | %2s | %s' %(rec['name'],rec['age'],round(rec['size'],2))
    print '\n',sum([1 for r in db]),'records in the database'
    print '\nMake pierre uppercase for age > 27'
    for record in ([r for r in db if r['name']=='pierre' and r['age'] >27]) :
        db.update(record,name=u"Pierre")
    print len([r for r in db if r['name']==u'Pierre']),'Pierre'
    print len([r for r in db if r['name']==u'pierre']),'pierre'
    print len([r for r in db if r['name'] in [u'pierre',u'Pierre']]),'p/Pierre'
    print 'is unicode :',isinstance(db[20]['name'],unicode)
    db.commit()
    db.open()
    print '\nSame operation after commit + open'
    print len([r for r in db if r['name']==u'Pierre']),'Pierre'
    print len([r for r in db if r['name']==u'pierre']),'pierre'
    print len([r for r in db if r['name'] in [u'pierre',u'Pierre']]),'p/Pierre'
    print 'is unicode :',isinstance(db[20]['name'],unicode)
    
    print "\nDeleting record #20"
    del db[20]
    if not 20 in db:
        print "record 20 removed"

    print db[21]
    db.drop_field('name')
    print db[21]
    db.add_field('adate',datetime.date.today())
    print db[21]
    
    k = db._age.keys()[0]
    print "key",k
    print k in db._age
    db.delete(db._age[k])
    print db._age[k]
    print k in db._age

      

Despite its ridiculously small footprint, this module can be a good alternative when other database management systems would be overkill (say up to 5 Mbytes of data), and Python programmers should find the syntax very intuitive

Tags: database

10 comments

stas zytkiewicz 17 years, 5 months ago # | flag

Adding record field to an existing dbase. Great script, small and fast. But I was wondering if it was possible to add a field to the records after the dbase was created ? Perhaps you could explain to me the best way to adapt your strakell script. I don't have the need nor the knowledge to use a sqlite solution and I hope I could use your nice strakell script.

Thanks, Stas

Pierre Quentel (author) 17 years, 5 months ago # | flag

Methods added. Good idea, I have added two methods to modify the database structure, add_field() and drop_field()

stas zytkiewicz 17 years, 5 months ago # | flag

name clash when using a field called 'name' If using a field called 'name' and trying to create an index the script crashes as the 'name' attribute of Base is changed. I renamed Base.name into Bass._name to prevent this.

Pierre Quentel (author) 17 years, 5 months ago # | flag

Name conflicts. Right, thanks for the report. To avoid other name conflicts it's probably better to prepend an underscore to the Base instance attribute : db._age for the index on "age"

Pierre Quentel (author) 17 years, 4 months ago # | flag

New version. Version 1.9 fixes a serious bug with indices and adds some new features. I also changed the module name for the more explicit PyDbLite

Pierre Quentel (author) 17 years, 1 month ago # | flag

Version 2.0. This version fixes a bug and adds a key __version__ to all records to manage concurrency control : it is set to 0 for new records and incremented each time the record is updated

Pierre Quentel (author) 16 years, 7 months ago # | flag

Version 2.1. This version fixes a bug in the open() method

More important, a new syntax is introduced to select records :

db(key1=val1,key2=val2)

is a shortcut for

[ r for r in db if r[key1]==val1 and r[key2]==val2 ]

Fernando Jeronymo 14 years, 11 months ago # | flag

Adding multiple indexation:

I added this to your database so I could support multiple indexes (for example, I want to index by fields 'C', 'D' and 'E' so that when I iterate through the records I get 'C' in sequence, and within that 'D' is in order and within that 'E' is in order):

Class Index is changed to support the sorted ids (the iterator is changed):

def __iter__(self):
    #self.ids is a list with the correct record ids ordered by the index e.g. [0,4,1,2,3]
    return iter(self.ids)

Class Base is changed to add: self.index # the latest index we 'indexified' the database against self.attributes # all attributes created on create_index are saved on this hash and also saved on disk

New sort method is added to support multiple-column indexation:

def radix_join(self, *fields):
   """ will go through a list of fields that must be sorted altogether, doing a radix sort and using python adaptive sorting to take care of sorting individual lists """
   def fieldComp(x,y):
      if self.records[x][currField] > self.records[y][currField]: return 1
      elif self.records[x][currField] == self.records[y][currField]: return 0
      return -1 # x<y

   # convert the tuple to a list
   fs = list(fields)
   # now retrieve the ids array for the last column (e.g. indexing by B-C-D we need the sorted IDs for D (sorted relative to D's values))
   attr='_'+fs[-1]
   ids = self.attributes[attr].ids[:]
   for field in fs[:-1].__reversed__(): # Radix Sort : now we go through the previous columns, resorting them (using the IDs as given but the values from the records) 
      currField = field
      ids.sort(fieldComp)
   return ids
#!!end-def-radix_join+!!

Fernando Jeronymo 14 years, 11 months ago # | flag

And the new create_index method:

def create_index(self, *fields):
   for f in fields:
      if not f in self.fields:
         Config.log('Field [%s] does not belong to database fields: [%s]' % (f, self.fields))
         raise NameError, ('%s is not a field name' % (f))
      # initialize the indices (only if non-existent, to allow for re-using of an indice that had been created already)
      if f in self.indices: continue
      Config.log('Creating index [%s]'%f)
      attr = '_' + f
      self.indices[f] = {}
      ids = []
      keys = sets.Set()
      # use bisect to quickly insert the id in the list
      for _id, record in self.records.iteritems():
         value = record[f]             
         bisect.insort(self.indices[f].setdefault(value, []), _id)
         keys.add(value)
      # now retrieve the groupings, sorted by keys
      for key in sorted(keys): ids += self.indices[f][key]
      # create a new attribute of self, used to find the records by this index
      self.attributes[attr] = Index(self, f, ids)
      setattr(self, attr, self.attributes[attr])
      #print 'f[%s] [%s]'%(f, self.indices[f])
      Config.log('Finished creating index.')
   #!!+for=loop=end+!!

   multIndex = '_'.join(fields)
   attr = '_%s'%multIndex
   if self.attributes.has_key(attr): # does it already exist? use it then
      self.index = self.attributes[attr]
   else:
      # attribute doesn't exist, need to create and indexify it
      Config.log('Creating join index [%s]'%attr)
      self.attributes[attr] = Index(self, multIndex, self.radix_join(*fields))
      setattr(self, attr, self.attributes[attr])
      self.index = self.attributes[attr]
      Config.log('Finished creating index.')

   # store indexes on database
   self.commit()

   Config.log('Using attribute index [%s]'%multIndex)

   # return the indexes (single index or joined indexes)
   return self.index
#!!end-def-create_index+!!

FARIZ MURADOV 8 years, 6 months ago # | flag

great script. does anyone tested ? how much data can handle ? approx row count or table size ?

◄	Python recipes (4591)	►
◄	Pierre Quentel's recipes (10)	►

PyDbLite, a small in-memory database engine (Python recipe) by Pierre Quentel
ActiveState Code (http://code.activestate.com/recipes/496770/)

10 comments

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

PyDbLite, a small in-memory database engine (Python recipe) by Pierre Quentel ActiveState Code (http://code.activestate.com/recipes/496770/)

10 comments

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

PyDbLite, a small in-memory database engine (Python recipe) by Pierre Quentel
ActiveState Code (http://code.activestate.com/recipes/496770/)