Welcome, guest | Sign In | My Account | Store | Cart

Extensions of html.parser.HTMLParser().

PrettyHTMLParser() does not split data into chunks at HTML entities. StatedHTMLParser() supports many state-dependent handlers, which helps a lot when parsing HTML pages.

Python, 153 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""Module for parsing HTML pages."""

# htmlpars.py by Ádám Szieberth (2013)
# Python 3.3

# Full license text:
# --------------------------------------------------------------
# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# Version 2, December 2004
#
# Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
#
# Everyone is permitted to copy and distribute verbatim or
# modified copies of this license document, and changing it is
# allowed as long as the name is changed.
#
# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND
# MODIFICATION
#
# 0. You just DO WHAT THE FUCK YOU WANT TO.
# --------------------------------------------------------------

import html
from functools import partial, wraps
from html.parser import HTMLParser

# Names of the non-data handlers. Looking one of them up on a
# PrettyHTMLParser returns a wrapper that first delivers any buffered
# text to handle_data() and then calls the real handler.
MAIN_HANDLERS = {
    "handle_startendtag",
    "handle_starttag",
    "handle_endtag",
    "handle_comment",
    "handle_decl",
    "handle_pi",
}


class PrettyHTMLParser(HTMLParser):
    """
    HTML parser which reports each run of text as one piece.

    html.parser.HTMLParser may split text into several handle_data()
    calls (for example around character and entity references).
    This parser buffers all those pieces and calls the user's
    handle_data() only once per run, with the references already
    converted to the characters they stand for.

    Note that data is not handled when it is encountered but
    immediately before handling the following non-data part of the
    page (see MAIN_HANDLERS), or when close() is called.

    Note that handle_charref() and handle_entityref() are used
    internally by PrettyHTMLParser. Do not override them!
    """

    def __init__(self, *args, **kwargs):
        """Pass all arguments to HTMLParser and create the buffer."""
        super().__init__(*args, **kwargs)
        self._reset_data()

    def __getattribute__(self, name):
        """
        Redirect handler lookups so that text can be buffered.

        "handle_data" resolves to the buffering _handle_data(); the
        names in MAIN_HANDLERS resolve to a wrapper built by
        _handler(). Every other attribute is looked up normally.
        """
        if name == "handle_data":
            return super().__getattribute__("_handle_data")
        elif name in MAIN_HANDLERS:
            return self._handler(name)
        else:
            return super().__getattribute__(name)

    def _handle_data(self, data):
        """
        Buffer a piece of text instead of handling it immediately.

        The piece is always appended: the underlying parser may
        report one run of text in several calls (e.g. when the text
        arrives in separate feed() calls), and replacing the buffer
        would silently drop the earlier pieces.
        """
        self._data_buf.append(data)
        self._data_cont = False

    def _flush_data(self):
        """
        Deliver the buffered text, if any, to the real handle_data().
        """
        if self._data_buf:
            # Bypass the redirection in __getattribute__ to reach the
            # handle_data() defined by the user (or by HTMLParser).
            h_data = super().__getattribute__("handle_data")
            h_data("".join(self._data_buf))
            self._reset_data()

    def _handler(self, name):
        """
        Return a wrapper around the handler called name.

        The wrapper flushes the buffered text first, so the text is
        always handled before the markup that follows it.
        """
        # Zero-argument super() does not work inside the nested
        # function, so the plain lookup is bound here.
        lookup = super().__getattribute__

        def wrapper(*args, **kwargs):
            self._flush_data()
            return lookup(name)(*args, **kwargs)
        return wrapper

    def _reset_data(self):
        """
        Resets the data buffer and the data continuous flag.

        The flag is True while the last buffered piece came from a
        character or entity reference.
        """
        self._data_buf, self._data_cont = [], False

    def close(self):
        """
        Finish parsing and deliver the text which is still buffered.

        Without this, text after the last tag would never be handled
        because no non-data handler follows it.
        """
        super().close()
        self._flush_data()

    def handle_charref(self, name):
        """
        Buffer the character a numeric reference stands for.

        Do not override this method!
        """
        # html.unescape() replaces HTMLParser.unescape(), which was
        # removed in Python 3.9.
        self._data_buf.append(html.unescape("&#{};".format(name)))
        self._data_cont = True

    def handle_entityref(self, name):
        """
        Buffer the character a named reference stands for.

        Do not override this method!
        """
        self._data_buf.append(html.unescape("&{};".format(name)))
        self._data_cont = True

class StatedHTMLParser(PrettyHTMLParser):
    """
    This HTML parser parent class uses a state variable to let the
    user do a more sophisticated parsing of HTML pages.

    I suggest setting the initial state and other instance variables
    in the __init__() method of the subclass.

    Individual handlers should manage self.state. Handlers in
    subclasses should be named keeping the following rule:
        handle_<self_state>_<handler_type>
    For example if self.state == "goals", then to handle data,
    self.handle_goals_data() is called, to handle a start tag,
    self.handle_goals_starttag() is called, etc. When the
    handler does not exist, self.common_handler() is called.
    """

    def __init__(self, *args, **kwargs):
        """Start in the empty state and initialize the parser."""
        self.state = ""
        super().__init__(*args, **kwargs)

    def common_handler(self, name, *args, **kwargs):
        """
        Fallback which is called when no stated handler exists.

        name is the stated handler name which was looked up (e.g.
        "handle_goals_starttag"); the remaining arguments are those
        the missing handler would have received. Does nothing by
        default.
        """
        pass

    def _stated_handler(self, name):
        """
        Return the handler for name in the current state.

        The state is inserted after the first underscore, so
        "handle_data" becomes "handle_<state>_data". When no such
        attribute exists, common_handler() bound to the stated name
        is returned instead.
        """
        prefix, kind = name.split("_", 1)
        handler = "_".join((prefix, self.state, kind))
        try:
            return super().__getattribute__(handler)
        except AttributeError:
            return partial(self.common_handler, handler)

    def _flush_data(self):
        """
        Deliver the buffered text, if any, to the stated data handler.
        """
        if self._data_buf:
            h_data = self._stated_handler("handle_data")
            h_data("".join(self._data_buf))
            self._reset_data()

    def _handler(self, name):
        """
        Return a wrapper which dispatches name by the current state.

        The wrapper flushes the buffered text first. The state is
        read when the wrapper is called, so a data handler which
        changes self.state affects the handler chosen right after it.
        """
        def wrapper(*args, **kwargs):
            self._flush_data()
            return self._stated_handler(name)(*args, **kwargs)
        return wrapper

    def close(self):
        """
        Finish parsing and deliver the text which is still buffered.

        Without this, text after the last tag would never reach a
        stated data handler because no non-data handler follows it.
        """
        super().close()
        # No-op when the parent class has already flushed the buffer.
        self._flush_data()


def skips_empty_data(method):
    """
    Decorate a data handler so that blank data is ignored.

    The decorated handler is called only when data contains at least
    one non-whitespace character; otherwise nothing is called and
    None is returned.
    """
    @wraps(method)
    def guarded(parser_instance, data):
        # Whitespace-only (or empty) text strips down to "".
        if not data.strip():
            return None
        return method(parser_instance, data)
    return guarded

I faced the same problem as the questioner here:

"I'm using a simple HTMLParser to parse a webpage with code that is always well-formed (it's automatically generated). It works well, until it hits a piece of data with an '&' sign in it - it seems to think that that makes it two separate pieces of data and processes them separately. (That is, it calls "handle_data" twice.) I at first thought that unescaping the '&' would solve the issue, but I don't think it does."