Splits a textual table into columns automatically. Improved from a comp.lang.python request.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 | import re, sys
#import collections # For Python 2.5
def tabler(data, rjust=False, threshold=0.6):
    """Split a textual table into a list of rows, each a list of cell strings.

    Column boundaries are guessed from the character positions at which
    words most frequently begin (or, for right-aligned tables, most
    frequently end). A cell may therefore contain single spaces, as long as
    its inner words do not line up with word beginnings on most other rows.

    Arguments:
        data: string holding the whole table, one row per line. Lines that
            are empty or whitespace-only are ignored. Only the space
            character is treated as padding when locating word beginnings.
        rjust: set to True when the columns are aligned to the right.
        threshold: a position is accepted as a column start when its
            frequency is strictly greater than ``threshold`` times the
            frequency of the most common position. Must be below 1.0 for
            any column to be accepted. Defaults to 0.6.

    Returns:
        A list with one list of stripped cell strings per non-blank line,
        every row having one cell per detected column. Returns ``[[]]``
        when ``data`` contains no non-blank line. Text located before the
        first accepted column start is not part of any cell.
    """
    # Requires the re module.
    # A word beginning is a non-space character at the start of the line or
    # right after a space; the match starts on that space (or at index 0).
    word_start = re.compile(r"(?:[ ]|^)[^ ]")

    # Remove empty lines.
    lines = [line for line in data.splitlines() if line.strip()]
    if not lines:
        return [[]]

    if rjust:
        # Pad every line to a common width and reverse it: word endings of a
        # right-aligned table become vertically aligned word beginnings, so
        # the left-aligned logic below applies unchanged.
        width = max(len(line) for line in lines)
        lines = [line.ljust(width)[::-1] for line in lines]

    # Count, over all lines, how often each position begins a word, e.g.
    # positions [0, 11, 25, 35, 49, ...] for a five-column row.
    freqs = {}
    for line in lines:
        for match in word_start.finditer(line):
            pos = match.start()
            freqs[pos] = freqs.get(pos, 0) + 1

    # Keep only the positions that are frequent enough to be column starts.
    # values()/items() are used (not itervalues()/iteritems()) so the code
    # runs on both Python 2 and Python 3.
    cutoff = max(freqs.values()) * threshold
    cols = sorted(pos for pos, count in freqs.items() if count > cutoff)

    # Pair each column start with the next one; the final None makes the
    # last slice run to the end of the line.
    bounds = list(zip(cols, cols[1:] + [None]))
    result = [[line[lo:hi].strip() for lo, hi in bounds] for line in lines]

    if rjust:
        # Undo the reversal: restore both the cell order and the cell text.
        result = [[cell[::-1] for cell in reversed(row)] for row in result]
    return result
if __name__ == '__main__':  # Some demos
    # Demo driver: prints each sample table followed by the result of
    # tabler() on it. Every print call takes a single argument so that the
    # same code runs on Python 2 (as a print statement with a parenthesized
    # expression) and on Python 3 (as a function call).
    #
    # NOTE(review): the sample tables below contain only single spaces
    # between words in this copy of the file; tabler() relies on vertically
    # aligned columns, so confirm these literals against the original data.
    from pprint import pprint

    # Demo 1: left-aligned table.
    data1 = """\
44544 ipod apple black 102
GFGFHHF-12 unknown thing bizar brick mortar tbc
45fjk do not know + is less biac
disk seagate 250GB 130
5G_gff tbd tbd
gjgh88hgg media record a and b 12
hjj foo bar hop zip
hg uy oi hj uuu ii a qqq ccc v ZZZ Ughj
qdsd zert nope nope
"""
    print(data1 + "\n")
    pprint(tabler(data1))
    print("")

    # Demo 2: same rows, processed as a right-aligned table.
    data2 = """\
44544 ipod apple black 102
GFGFHHF-12 unknown thing bizar brick mortar tbc
45fjk do not know + is less biac
disk seagate 250GB 130
5G_gff tbd tbd
gjgh88hgg media record a and b 12
hjj foo bar hop zip
hg uy oi hj uuu ii a qqq ccc v ZZZ Ughj
qdsd zert nope nope
"""
    print(data2 + "\n")
    pprint(tabler(data2, rjust=True))
    print("")

    # Demo 3: a table with only two rows.
    data3 = """\
44544 ipod apple black 102
GFGFHHF-12 unknown thing bizar brick mortar tbc
"""
    print(data3)
    pprint(tabler(data3))
    print("")

    # Demo 4: input with no non-blank line; tabler() returns [[]].
    data4 = """
"""
    print(data4)
    pprint(tabler(data4))
    print("")

    # Demo 5: numeric matrix processed as right-aligned, then printed back
    # with the cells of each row joined by single spaces.
    data5 = """\
A B C D E F G H I K L M N P Q R S T V W X Y Z *
A 4 -2 0 -2 -1 -2 0 -2 -1 -1 -1 -1 -2 -1 -1 -1 1 0 0 -3 -1 -2 -1 -4
B -2 6 -3 6 2 -3 -1 -1 -3 -1 -4 -3 1 -1 0 -2 0 -1 -3 -4 -1 -3 2 -4
C 0 -3 9 -3 -4 -2 -3 -3 -1 -3 -1 -1 -3 -3 -3 -3 -1 -1 -1 -2 -1 -2 -4 -4
D -2 6 -3 6 2 -3 -1 -1 -3 -1 -4 -3 1 -1 0 -2 0 -1 -3 -4 -1 -3 2 -4
E -1 2 -4 2 5 -3 -2 0 -3 1 -3 -2 0 -1 2 0 0 -1 -2 -3 -1 -2 5 -4
"""
    print(data5)
    for line in tabler(data5, rjust=True):
        print(" ".join(line))
|
This code hasn't been tested much, and there are surely many real situations where its heuristic doesn't work. Note that you need a monospaced (non-proportional) font to see the column alignments in the examples.
Tags: text