import os import random from array import array def get_sampling_params(filename): '''redefine to fit data''' filesize = os.path.getsize(filename) num_of_chunks = 500 size_of_chunk = filesize/(100*num_of_chunks) seek_positions = [random.randint(0,(filesize-size_of_chunk)) for i in xrange(num_of_chunks)] epsilon = 2 return seek_positions, size_of_chunk, epsilon def sample_chunks_gen(filename, seek_positions, size_of_chunk): '''yields array of floats''' def _array_from_file_chunk(input_file, seek_position, size_of_chunk): input_file.seek(seek_position) buf_str = input_file.read(size_of_chunk) first_endl = buf_str.find('\n') last_endl = buf_str.rfind('\n') return array('f', map(float, buf_str[(first_endl+1):][:(last_endl+1)].splitlines())) with open(filename, 'rb') as input_file: for seek_position in seek_positions: yield _array_from_file_chunk(input_file, seek_position, size_of_chunk) def select(data, positions, start=0, end=None): '''For every n in *positions* find nth rank ordered element in *data* inplace select''' if not end: end = len(data) - 1 if end < start: return [] if end == start: return [data[start]] pivot_rand_i = random.randrange(start,end) pivot_rand = data[pivot_rand_i] # get random pivot data[end], data[pivot_rand_i] = data[pivot_rand_i], data[end] pivot_i = start for i in xrange(start, end): # partitioning about the pivot if data[i] < pivot_rand: data[pivot_i], data[i] = data[i], data[pivot_i] pivot_i += 1 data[end], data[pivot_i] = data[pivot_i], data[end] under_positions, over_positions, mid_positions = [],[],[] for position in positions: if position == pivot_i: mid_positions.append(position) elif position < pivot_i: under_positions.append(position) else: over_positions.append(position) result = [] if len(under_positions) > 0: result.extend(select(data, under_positions, start, pivot_i-1)) if len(mid_positions) > 0: result.extend([data[position] for position in mid_positions]) if len(over_positions) > 0: result.extend(select(data, over_positions, pivot_i+1, end)) return result def get_interval_endpoints(data, position, epsilon): '''return interval endpoints''' values = select(data, [position-epsilon, position+epsilon]) return min(values), max(values) def reduce_endpoints(endpoints): '''return min and max of all endpoints''' first_endpoint = min(endpoints, key=lambda x: x[0]) last_endpoint = max(endpoints, key=lambda x: x[1]) return first_endpoint[0], last_endpoint[1] def fill_interval(filename, endpoints, size_of_chunk=64*1024): '''return count of values before interval, interval, data len''' with open(filename, 'rb') as input_file: first_endpoint = endpoints[0] last_endpoint = endpoints[1] seek_position = 0 interval = array('f') iappend = interval.append pre_count = 0 count = 0 input_file.seek(seek_position) buf_str = input_file.read(size_of_chunk) while buf_str: last_endl = buf_str.rfind('\n') buf_arr = array('f', map(float, buf_str[:(last_endl+1)].splitlines())) count = count + len(buf_arr) for value in buf_arr: if value <= first_endpoint: pre_count = pre_count + 1 elif last_endpoint <= value: pass else: iappend(value) seek_position = seek_position + (last_endl+1) input_file.seek(seek_position) buf_str = input_file.read(size_of_chunk) return interval, count, pre_count def find_value(filename, position, sampling_func=get_sampling_params): '''find value in file''' seek_positions, size_of_chunk, epsilon = sampling_func(filename) s_chunks_iter = sample_chunks_gen(filename, seek_positions, size_of_chunk) s_endpoints = map(lambda x: get_interval_endpoints(x, position, epsilon), s_chunks_iter) endpoints = reduce_endpoints(s_endpoints) interval, count, pre_count = fill_interval(filename, endpoints) print len(interval) results = select(interval, [position - pre_count]) return results[0] if __name__ == '__main__': import optparse parser = optparse.OptionParser() parser.add_option( '-f','--file', dest='filename', ) parser.add_option( '-p','--position', dest='position', type='int' ) options, args = parser.parse_args() print repr(find_value(options.filename, options.position))