This recipe describes how to simplify the process of extracting data from files in the Microsoft chm file format. It uses the pychm library: http://gnochm.sourceforge.net/pychm.html.
#!/usr/bin/env python
from chm.chm import CHMFile
from os.path import basename, exists, abspath
from HTMLParser import HTMLParser
from sys import argv, exit, stderr
import re
class LinksLocator(HTMLParser):
    """
    Collect Name/Local pairs from a chm TopicsTree or from plain html links.

    Two kinds of markup are recognised while the document is fed in:

    * ``<object>`` elements: every ``<param name="Name" ...>`` and
      ``<param name="Local" ...>`` inside one object is gathered into a
      dictionary which is appended to ``self.nodes`` when the object is
      closed (objects that produced no such entry are skipped).
    * ``<a>`` elements: a dictionary ``{'Local': href, 'Name': text}`` is
      appended to ``self.links`` for every anchor that has a non-empty
      ``href``; ``text`` is the character data found inside the anchor.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.in_obj = False
        self.nodes = []
        self.in_a = False
        self.links = []
        # Working state.  Initialised here so that a stray closing tag
        # (``</object>`` or ``</a>`` without its opening tag) cannot hit an
        # attribute that does not exist yet.
        self.new_node = {}
        self.lnk = {}
        self.data = ''

    def handle_starttag(self, tag, attr):
        """Start a new node/link or record a ``<param>`` of the open node."""
        if tag == 'object':
            self.in_obj = True
            self.new_node = {}
        elif tag == 'param' and self.in_obj:
            attr = dict(attr)
            # Use .get(): a <param> without name/value must be ignored
            # instead of aborting the whole parse with a KeyError.
            name = attr.get('name')
            value = attr.get('value')
            if name in ('Name', 'Local') and value is not None:
                self.new_node[name] = value
        elif tag == 'a':
            attr = dict(attr)
            self.in_a = True
            self.lnk = {'Local': attr.get('href')}
            self.data = ''

    def handle_endtag(self, tag):
        """Store the node/link that is being closed, if it is usable."""
        if tag == 'object':
            if self.in_obj and self.new_node:
                self.nodes.append(self.new_node)
            self.in_obj = False
            # Fresh dict so a repeated </object> cannot append the same
            # node a second time.
            self.new_node = {}
        elif tag == 'a':
            # Only keep links that actually have an address.
            if self.in_a and self.lnk.get('Local'):
                self.lnk['Name'] = self.data
                self.links.append(self.lnk)
            self.in_a = False
            self.lnk = {}

    def handle_data(self, data):
        """Accumulate the text found inside the currently open anchor."""
        if self.in_a:
            self.data += data
class ChmFileException(Exception):
    """Error raised when the Content Tree of a chm archive cannot be found."""
class SimpleChmFile(CHMFile):
    """
    SimpleChmFile is a wrapper over CHMFile in which you can iterate over
    pages, e.g.:

    >>> chm = SimpleChmFile('file.chm')
    >>> for page in chm:
    ...     print(page)

    The output will be the html content of the compressed chm file.
    """

    def __init__(self, filename=None):
        """Create the wrapper and, when `filename` is given, open it."""
        CHMFile.__init__(self)
        # List of {'Name': ..., 'Local': ...} dictionaries, one per page.
        self.nodes = []
        if filename:
            self.open(filename)

    def __iter__(self):
        """Return a generator over the pages listed in the Content Tree.

        Entries whose path cannot be resolved inside the archive are
        skipped, so only real page contents are yielded (never None).
        """
        for node in self.nodes:
            contents = self._get_contents(node['Local'])
            if contents is not None:
                yield contents

    def open(self, filename):
        """Load the archive `filename` and collect its list of pages.

        Raises IOError when the file cannot be loaded and
        ChmFileException when no list of pages can be found in it.
        """
        if CHMFile.LoadCHM(self, filename) != 1:
            raise IOError("Can't load File '%s'" % filename)
        self.nodes = self._get_nodes()
        if not self.nodes:
            raise ChmFileException("Can't find Content Tree")

    def _get_contents(self, path):
        """Return html contents of file `path' in chm archive.

        Returns None when `path` cannot be resolved.
        """
        resolved = CHMFile.ResolveObject(self, path)
        # First item is the status (0 means the object was found), the
        # second one is the handle expected by RetrieveObject.
        if resolved[0] != 0:
            return None
        retrieved = CHMFile.RetrieveObject(self, resolved[1])
        # NOTE(review): presumably a (size, data) pair - confirm with pychm.
        return retrieved[1]

    def _get_nodes(self):
        """Return list of dictionaries with data extracted from TopicsTree.

        When the archive has no TopicsTree, the links of the home page
        (and of the first linked page that looks like a table of contents)
        are used instead.  Every returned dictionary has a 'Local' entry
        holding a path that starts with '/' or with the home directory.
        Raises ChmFileException when the home page cannot be read.
        """
        parser = LinksLocator()
        # Directory part of the home page path, with the trailing '/'.
        home_dir = self.home[:self.home.rfind('/') + 1]
        tree = CHMFile.GetTopicsTree(self)
        if tree:
            parser.feed(tree)
            nodes = parser.nodes
        else:
            # try to locate Table of Contents
            home_page = self._get_contents(self.home)
            if not home_page:
                raise ChmFileException("Can't find Content Tree")
            parser.feed(home_page)
            # sometimes the first page of archive contains link to its
            # Content Tree
            regx = re.compile('Content|toc', re.IGNORECASE)
            # Iterate over a snapshot: feeding the parser extends
            # parser.links.
            for link in list(parser.links):
                local, name = link['Local'], link['Name']
                if not (regx.search(local) or regx.search(name)):
                    continue
                if not local.startswith('/'):
                    local = home_dir + local
                toc_page = self._get_contents(local)
                # An unresolvable candidate must not be fed to the parser
                # (feeding None fails); try the next candidate instead.
                if toc_page:
                    parser.feed(toc_page)
                    break
            nodes = parser.links
        parser.close()
        # fix absolute path if necessary
        usable = []
        for node in nodes:
            local = node.get('Local')
            if not local:
                # e.g. a TopicsTree heading that has a Name but no page
                continue
            if not local.startswith('/'):
                node['Local'] = home_dir + local
            usable.append(node)
        return usable
def usage(stream=None):
    """Write the command line help to `stream` (stderr by default).

    The help goes to stderr so that it never ends up in a unix pipe that
    consumes the script's regular output.
    """
    if stream is None:
        stream = stderr
    # The examples redirect into files with '>' (piping into a file name
    # with '|' would try to execute it).
    template = (
        'usage:\n'
        '\t%(prog)s <chm filename>\n'
        '\n'
        'on Unix you can use pipe to convert chm to ascii\n'
        '\t%(prog)s foo.chm | lynx -dump -stdin > foo.txt\n'
        'or\n'
        '\t%(prog)s foo.chm | html2text -style pretty > foo.txt\n'
        'you can also save the output as compressed gzip file\n'
        '\t%(prog)s foo.chm | html2text -style pretty | gzip > foo.gz\n'
        'and read it with zless:\n'
        '\tzless foo.gz\n'
    )
    stream.write(template % {'prog': basename(argv[0])})
def main():
    """Print every page of the chm file named on the command line.

    Returns the exit status of the script: 0 on success, 1 when the
    archive cannot be read, 2 when the command line is wrong and 130
    when interrupted from the keyboard.
    """
    if len(argv) != 2:
        usage()
        return 2
    try:
        chm = SimpleChmFile(argv[1])
        for page in chm:
            # A page that could not be extracted comes back as None;
            # do not write the text "None" into the output.
            if page is not None:
                print(page)
    except (ChmFileException, IOError) as e:
        stderr.write("%s\n\n" % e)
        usage()
        return 1
    except KeyboardInterrupt:
        # Stop quietly, but let the shell know the output is incomplete.
        return 130
    return 0


if __name__ == '__main__':
    # Hand the status to the shell so pipelines can detect failures.
    exit(main())
|
The SimpleChmFile class simplifies the process of extracting contents from the Microsoft chm file format. When used as a library:
>>> from catchm import SimpleChmFile
>>> chm = SimpleChmFile('spam.chm')
>>> for page in chm:
... print page
On Unix/Linux machines, the above script can be used to create gzipped text files:
$ catchm.py spam.chm | html2text -style pretty | gzip > spam.gz
(assuming that the script is saved under the name 'catchm.py'); the resulting file can then be read with zless:
$ zless spam.gz
In this recipe I've used a class derived from HTMLParser to extract the list of file names from the Topics Tree. When there is no 'Topics Tree' (only files created with commercial software have it), it tries to locate simple html links and extracts their href attributes. The SimpleChmFile class tries to locate the 'Topics Tree', or a Table of Contents linked from the first page, and uses HTMLParser to retrieve the list of names. It defines the __iter__ magic method, which yields the data extracted from the compressed chm file.