Welcome, guest | Sign In | My Account | Store | Cart

This recipe describes how to simplify the process of extracting data from the Microsoft CHM file format. It uses the pychm library: http://gnochm.sourceforge.net/pychm.html.

Python, 150 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python

from chm.chm import CHMFile
from os.path import basename, exists, abspath
from HTMLParser import HTMLParser
from sys import argv, exit, stderr
import re

class LinksLocator(HTMLParser):
    """
    Collect name and path (Name and Local) entries from the TopicsTree of
    a chm (compressed html) archive, or from simple html links.

    Two kinds of markup are recognised while a document is fed in:

    * ``<object>`` elements: every ``<param>`` child whose ``name`` is
      ``Name`` or ``Local`` contributes its ``value`` to a dictionary,
      which is appended to ``self.nodes`` when the object is closed
      (objects that yield no such parameter are ignored).
    * ``<a>`` elements: every link with a non-empty ``href`` is appended
      to ``self.links`` as ``{'Local': href, 'Name': text inside the link}``.
    """
    def __init__(self):
        HTMLParser.__init__(self)
        self.in_obj = False
        self.nodes = []
        self.in_a = False
        self.links = []
        # Per-element scratch state.  It is created here (and not only in
        # handle_starttag) so that a stray end tag in malformed markup
        # cannot hit a missing attribute.
        self.new_node = {}
        self.lnk = {}
        self.data = ''

    def handle_starttag(self, tag, attr):
        """Start collecting an <object> node, a <param> value or an <a> link."""
        if tag == 'object':
            self.in_obj = True
            self.new_node = {}
        elif tag == 'param' and self.in_obj:
            attr = dict(attr)
            # .get(): a <param> without name/value is skipped, not fatal
            name = attr.get('name')
            value = attr.get('value')
            if name in ('Name', 'Local') and value is not None:
                self.new_node[name] = value
        elif tag == 'a':
            attr = dict(attr)
            self.in_a = True
            self.lnk = {'Local': attr.get('href')}
            self.data = ''

    def handle_endtag(self, tag):
        """Store the node or link that the closing tag completes."""
        if tag == 'object':
            # only an <object> that was actually opened may add a node
            if self.in_obj and self.new_node:
                self.nodes.append(self.new_node)
            self.in_obj = False
            # start from a fresh dict so a repeated </object> cannot
            # append the same node twice
            self.new_node = {}
        elif tag == 'a':
            # keep the link only if it was opened and has an address
            if self.in_a and self.lnk.get('Local'):
                self.lnk['Name'] = self.data
                self.links.append(self.lnk)
            self.in_a = False
            self.lnk = {}
            self.data = ''

    def handle_data(self, data):
        """Accumulate the text found between <a> and </a>."""
        if self.in_a:
            self.data += data

class ChmFileException(Exception):
    """Error raised when the Content Tree of a chm archive can't be found."""

class SimpleChmFile(CHMFile):
    """
    SimpleChmFile is a wrapper over CHMFile in which you can iterate over
    pages eg.:

    >>> chm = SimpleChmFile('file.chm')
    >>> for page in chm:
    ...     print page

    the output will be html content of compressed chm file
    """
    def __init__(self, filename=None):
        """Create the wrapper and, when `filename' is given, open it."""
        CHMFile.__init__(self)
        # list of {'Name': ..., 'Local': ...} dictionaries, filled by open()
        self.nodes = []
        if filename:
            self.open(filename)

    def __iter__(self):
        """
        return generator over pages in Content Tree.

        Each item is whatever _get_contents() returns for the node's
        'Local' path, i.e. None for a path that can't be resolved.
        """
        for node in self.nodes:
            yield self._get_contents(node['Local'])

    def open(self, filename):
        """
        load chm archive `filename' and collect its list of pages.

        Raises IOError when the archive can't be loaded and
        ChmFileException when no page list can be found in it.
        """
        if CHMFile.LoadCHM(self, filename) != 1:
            # call form of raise: valid on every Python version
            raise IOError("Can't load File '%s'" % filename)
        self.nodes = self._get_nodes()
        if not self.nodes:
            raise ChmFileException("Can't find Content Tree")

    def _get_contents(self, path):
        """
        return html contents of file `path' in chm archive.

        Returns None when ResolveObject reports a non-zero status.
        """
        obj = CHMFile.ResolveObject(self, path)
        if obj[0] != 0:
            return None
        html = CHMFile.RetrieveObject(self, obj[1])
        return html[1]

    def _get_nodes(self):
        """
        return list of dictionaries with data extracted from TopicsTree.

        When the archive has no TopicsTree the links of the home page are
        used instead, together with the links of the first readable page
        whose address or text looks like a table of contents.  Entries
        without a usable 'Local' path are dropped, and relative paths are
        made absolute against the directory of the home page.

        Raises ChmFileException when neither a TopicsTree nor the home
        page can be read.
        """
        parser = LinksLocator()
        home_dir = self.home[:self.home.rfind('/')+1]

        def absolute(path):
            # paths that already start at the archive root are kept as is
            if path[0] == '/':
                return path
            return home_dir + path

        tree = CHMFile.GetTopicsTree(self)
        if tree:
            parser.feed(tree)
            nodes = parser.nodes
        else:
            # try to locate Table of Contents
            page = self._get_contents(self.home)
            if not page:
                raise ChmFileException("Can't find Content Tree")
            parser.feed(page)
            # sometimes the first page of archive contains link to its
            # Content Tree
            regx = re.compile('Content|toc', re.IGNORECASE)
            # iterate over a copy: feeding the parser extends parser.links
            for link in list(parser.links):
                local, name = link['Local'], link['Name']
                if regx.search(local) or regx.search(name):
                    toc = self._get_contents(absolute(local))
                    # an unreadable candidate must not be fed to the
                    # parser (feed(None) fails); try the next link
                    if toc:
                        parser.feed(toc)
                        break
            nodes = parser.links
        parser.close()
        # fix absolute path if necessary; entries without a path (e.g.
        # headings that carry only a 'Name') can't be fetched, so skip them
        pages = []
        for node in nodes:
            local = node.get('Local')
            if not local:
                continue
            node['Local'] = absolute(local)
            pages.append(node)
        return pages


def usage():
    """
    print usage on stderr.

    The script name shown in the examples is the base name of argv[0].
    """
    filename = basename(argv[0])
    # don't break the unix pipe, send usage to stderr
    # NOTE: the examples redirect into files with '>'; piping into a file
    # name ('| foo.txt') would try to execute it as a command.
    stderr.write('usage:\n'
                 '\t%s <chm filename>\n'
                 '\n'
                 'on Unix you can use pipe to convert chm to ascii\n'
                 '\t%s foo.chm | lynx -dump -stdin > foo.txt\n'
                 'or\n'
                 '\t%s foo.chm | html2text -style pretty > foo.txt\n'
                 'you can also save the output as compressed gzip file\n'
                 '\t%s foo.chm | html2text -style pretty | gzip > foo.gz\n'
                 'and read it with zless:\n'
                 '\tzless foo.gz\n' % ((filename,) * 4))

def main():
    try:
        if len(argv) == 2:
            chm = SimpleChmFile(argv[1])
            for page in chm:
                print page
        else:
            usage()
    except (ChmFileException, IOError), e:
        print >> stderr, "%s\n" % e
        usage()
    except KeyboardInterrupt:
        pass


if __name__ == '__main__':
    main()

The SimpleChmFile class simplifies the process of extracting contents from the Microsoft CHM file format. When used as a library:

>>> from catchm import SimpleChmFile
>>> chm = SimpleChmFile('spam.chm')
>>> for page in chm:
...     print page

On Unix/Linux machines the above script can be used to create gzipped text files:

$ catchm.py spam.chm | html2text -style pretty | gzip > spam.gz

(assuming that it is saved under the name 'catchm.py'); the result can then be read with zless:

$ zless spam.gz

In this recipe I've used a class derived from HTMLParser to extract the list of file names from the Topics Tree. When there is no Topics Tree (only files created with commercial software have one), it tries to locate simple HTML links and extracts their href attributes. The SimpleChmFile class tries to locate the Topics Tree, or a table of contents linked from the first page, and uses HTMLParser to retrieve the list of names. It defines the __iter__ magic method, which yields the data extracted from the compressed CHM file.