Extensions of html.parser.HTMLParser().
PrettyHTMLParser() does not split data into chunks at HTML entities. StatedHTMLParser() can have many state-dependent handlers, which helps a lot when parsing HTML pages.
"""Module for parsing HTML pages."""
# htmlpars.py by Ádám Szieberth (2013)
# Python 3.3
# Full license text:
# --------------------------------------------------------------
# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# Version 2, December 2004
#
# Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
#
# Everyone is permitted to copy and distribute verbatim or
# modified copies of this license document, and changing it is
# allowed as long as the name is changed.
#
# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND
# MODIFICATION
#
# 0. You just DO WHAT THE FUCK YOU WANT TO.
# --------------------------------------------------------------
from functools import partial, wraps
from html import unescape
from html.parser import HTMLParser
# Names of the non-data handlers.  Looking one of them up on a
# PrettyHTMLParser yields a wrapper which first delivers the buffered
# character data and then calls the real handler.
MAIN_HANDLERS = {"handle_startendtag", "handle_starttag",
                 "handle_endtag", "handle_comment", "handle_decl",
                 "handle_pi"}


class PrettyHTMLParser(HTMLParser):
    """
    HTML parser which delivers character data in whole runs.

    html.parser.HTMLParser may call handle_data() several times for
    a single run of text (around character and entity references,
    around a bare "&" or "<").  This class buffers every piece and
    hands the joined text to the subclass' handle_data() in one call.

    Note that data is therefore not handled when it is met, but
    immediately before handling the following non-data part of the
    page (one of MAIN_HANDLERS), or when close() is called.

    Note that handle_charref() and handle_entityref() are reserved
    by PrettyHTMLParser.  Do not override them!
    """

    def __init__(self, *args, **kwargs):
        """
        Accepts the same arguments as html.parser.HTMLParser.
        """
        super().__init__(*args, **kwargs)
        self._reset_data()

    def __getattribute__(self, name):
        """
        Routes the handler lookups of the base parser through the
        data buffer; every other attribute is looked up normally.
        """
        if name == "handle_data":
            # The base parser has to fill the buffer instead of
            # calling the subclass' handle_data() directly.
            return super().__getattribute__("_handle_data")
        if name in MAIN_HANDLERS:
            return self._handler(name)
        return super().__getattribute__(name)

    def _handle_data(self, data):
        """
        Buffers one piece of character data.

        Pieces are always appended: the buffer is emptied only after
        it has been delivered, so consecutive pieces which belong to
        the same run of text are never lost.
        """
        self._data_buf.append(data)

    def _handler(self, name):
        """
        Returns a callable which delivers the pending data and then
        calls the real handler called `name`.
        """
        # Calling super() inside the nested function would result in
        # a "super(): no arguments" error, so the plain attribute
        # lookup is bound here.
        plain_lookup = super().__getattribute__

        def wrapper(*args, **kwargs):
            self._flush_data()
            return plain_lookup(name)(*args, **kwargs)

        return wrapper

    def _flush_data(self):
        """
        Passes the buffered data, joined into one string, to the
        real handle_data() and empties the buffer.  Does nothing
        when the buffer is empty.
        """
        if self._data_buf:
            h_data = super().__getattribute__("handle_data")
            h_data("".join(self._data_buf))
            self._reset_data()

    def _reset_data(self):
        """
        Resets the data buffer.
        """
        # _data_cont is not consulted any more; it is still
        # initialised so that code referring to it keeps working.
        self._data_buf, self._data_cont = [], False

    def close(self):
        """
        Finishes parsing and delivers the data which follows the
        last non-data part of the page.
        """
        super().close()
        self._flush_data()

    def handle_charref(self, name):
        """
        Do not override this method!

        Buffers the character which the numeric reference stands for.
        """
        # html.unescape() replaces HTMLParser.unescape(), which was
        # removed in Python 3.9.
        self._data_buf.append(unescape("&#{};".format(name)))

    def handle_entityref(self, name):
        """
        Do not override this method!

        Buffers the character which the named reference stands for.
        """
        self._data_buf.append(unescape("&{};".format(name)))
class StatedHTMLParser(PrettyHTMLParser):
    """
    This HTML parser parent class uses a state variable to make
    the user able to do a more sophisticated parsing of HTML pages.

    I suggest setting the initial state and other instance
    variables in the __init__() method of the subclass, after
    calling the parent's __init__() (which sets self.state to "").

    Individual handlers should manage self.state. Handlers in
    subclasses should be named keeping the following rule:

        handle_<self_state>_<handler_type>

    For example if self.state == "goals", then to handle data,
    self.handle_goals_data() is called, to handle a start tag,
    self.handle_goals_starttag() is called, etc. When the handler
    does not exist, self.common_handler() is called with the name
    of the missing handler as its first argument.

    Data is delivered right before the following non-data part of
    the page is handled, and by close() for the data which follows
    the last non-data part.
    """

    def __init__(self, *args, **kwargs):
        """
        Sets the initial state to "" and passes every argument on
        to the parent class.
        """
        self.state = ""
        super().__init__(*args, **kwargs)

    def common_handler(self, name, *args, **kwargs):
        """
        Fallback which is called when no handler exists for the
        current state. `name` is the name of the missing handler,
        the rest are the arguments it would have received.
        Does nothing by default.
        """
        pass

    def _stated_handler(self, name):
        """
        Returns the handler which belongs to `name` in the current
        state, e.g. handle_goals_data for "handle_data" when
        self.state == "goals". Falls back to common_handler, bound
        to the state-dependent name, when there is no such
        attribute.
        """
        h, n = name.split("_", 1)
        handler = "_".join((h, self.state, n))
        try:
            return super().__getattribute__(handler)
        except AttributeError:
            return partial(self.common_handler, handler)

    def _flush_data(self):
        """
        Passes the buffered data, joined into one string, to the
        data handler of the current state and empties the buffer.
        Does nothing when the buffer is empty.
        """
        if self._data_buf:
            h_data = self._stated_handler("handle_data")
            h_data("".join(self._data_buf))
            self._reset_data()

    def _handler(self, name):
        """
        Returns a callable which delivers the pending data and then
        calls the handler of the current state for `name`.
        """
        def wrapper(*args, **kwargs):
            self._flush_data()
            # The state is resolved only now, because the data
            # handler above may have changed self.state.
            return self._stated_handler(name)(*args, **kwargs)

        return wrapper

    def close(self):
        """
        Finishes parsing and delivers the data which follows the
        last non-data part of the page to the data handler of the
        current state.
        """
        super().close()
        # Nothing is left to deliver when a parent class has
        # already emptied the buffer.
        self._flush_data()
def skips_empty_data(method):
    """
    Decorator for data handlers: the decorated handler is called
    only when the data contains something other than whitespace.
    Otherwise the call is skipped and None is returned.
    """
    @wraps(method)
    def guarded(parser_instance, data):
        # Empty or whitespace-only data is ignored.
        if not data.strip():
            return None
        return method(parser_instance, data)

    return guarded
|
I faced the same problem as the questioner here:
"I'm using a simple HTMLParser to parse a webpage with code that is always well-formed (it's automatically generated). It works well, until it hits a piece of data with an '&' sign in it - it seems to think that that makes it two separate pieces of data and processes them separately. (That is, it calls "handle_data" twice.) I at first thought that unescaping the '&' would solve the issue, but I don't think it does."