This is a way of doing calculations on whole arrays that might be too big to fit into memory.
#!/usr/bin/env python
import numpy as np
import sys
#-----------------------------------------------------------------------------#
def attrs_from_locals(instance):
    """
    Stolen and shortened from http://code.activestate.com/recipes/286185/
    Sets every local variable mentioned so far as an instance attribute.
    """
    for key, value in sys._getframe(1).f_locals.items():
        if key != 'self':
            setattr(instance, key, value)
#-----------------------------------------------------------------------------#
class Hard_calc(object):
    def __init__(self, filepath, m_shape, m_dtype='float64',
                 chunk_size=512*1024**2):
        """
        Use this for calculations on huge arrays that won't fit into memory.
        It makes use of the numpy.memmap class and does the calculations
        chunk-wise. "chunk_size" determines how many bytes of the memmap
        array are held in memory at once.
        """
        m_array = np.memmap(filepath, mode='w+', dtype=m_dtype, shape=m_shape)
        # Size in bytes of one slice along the first axis.
        row_size = np.prod(m_array.shape[1:]) * m_array.dtype.itemsize
        # Number of such slices per chunk, at least one.
        step_length = \
            int(chunk_size / row_size) if chunk_size > row_size else 1
        attrs_from_locals(self)
    #-------------------------------------------------------------------------#
    @staticmethod
    def operation():
        raise AttributeError(
            "Append your own operation-method, otherwise I have nothing to do.")
    #-------------------------------------------------------------------------#
    def __call__(self, *o_args, **o_kwds):
        """
        Applies "operation". Make sure that method is present.
        Called without "o_args", the memmap array itself is used as the
        parameter for "operation".
        """
        o_args = [self.m_array] if len(o_args) == 0 else o_args
        ii = 0
        while ii is not None:
            # ii_new = None marks the last chunk and also slices cleanly
            # to the end of the array.
            ii_new = ii + self.step_length if \
                ii + self.step_length < len(self.m_array) else None
            sliced_o_args = [o_arg[ii:ii_new] for o_arg in o_args]
            self.m_array[ii:ii_new] = self.operation(*sliced_o_args, **o_kwds)
            del self.m_array  # flushes the chunk to file and frees memory
            self.m_array = np.memmap(self.filepath, mode='r+',
                                     dtype=self.m_dtype, shape=self.m_shape)
            ii = ii_new
        return self.m_array
#-----------------------------------------------------------------------------#
if __name__ == "__main__":
    """
    The generation of sample data here is not a good fit: the complete data
    is already in memory to begin with. Imagine you had opened a netCDF,
    grib or memmap file instead, so that u and v were just references to
    data stored on the hard disk.
    """
    u = np.random.rand(100, 100, 100)
    v = np.random.rand(100, 100, 100)
    Energy = Hard_calc("energy", u.shape, m_dtype='float', chunk_size=1024**2)
    Energy.operation = lambda u, v: u**2 + v**2
    wind_energy = Energy(u, v)
    Energy.operation = np.sqrt
    horizontal_velocity = Energy()
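The intended contract for "operation" is that it receives chunk-sized slices of its inputs and returns an array of the same shape. Instead of assigning the function on the instance as above, it could equally be supplied by subclassing; a minimal sketch, with an illustrative class name:

class Wind_energy(Hard_calc):
    @staticmethod
    def operation(u, v):
        return u**2 + v**2

energy = Wind_energy("energy", (100, 100, 100), chunk_size=1024**2)
wind_energy = energy(u, v)  # u, v as in the __main__ block above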
I used memmap arrays because they can be created with a specific shape without the memory corresponding to that shape being allocated on initialization. I would be very interested to hear whether there is a different/simpler/better datatype or way to tackle the problem of doing calculations on huge datasets.
http://docs.scipy.org/doc/numpy/reference/generated/numpy.memmap.html
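To illustrate why that lazy allocation matters: creating the memmap only creates (or opens) the backing file on disk, and the OS pages data in and out on demand, so even a very large array can be set up without exhausting RAM. A minimal sketch, with a made-up file name:

import numpy as np

# Creates an 8 GB file on disk (sparse on most filesystems), but
# allocates almost no RAM up front.
big = np.memmap("scratch.dat", mode='w+', dtype='float64',
                shape=(1000, 1000, 1000))
big[:10] = 1.0  # only the touched pages become resident in memory
del big         # flushes dirty pages back to the file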