Welcome, guest | Sign In | My Account | Store | Cart
"""Module for parsing HTML pages."""

# htmlpars.py by Ádám Szieberth (2013)
# Python 3.3

# Full license text:
# --------------------------------------------------------------
# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# Version 2, December 2004
#
# Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
#
# Everyone is permitted to copy and distribute verbatim or
# modified copies of this license document, and changing it is
# allowed as long as the name is changed.
#
# DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
# TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND
# MODIFICATION
#
# 0. You just DO WHAT THE FUCK YOU WANT TO.
# --------------------------------------------------------------

from functools import partial, wraps
from html import unescape
from html.parser import HTMLParser

# Names of the non-data handlers. Looking one of them up on a
# PrettyHTMLParser instance yields a wrapper which first delivers
# the buffered text and then calls the real handler.
MAIN_HANDLERS = {"handle_startendtag", "handle_starttag",
    "handle_endtag", "handle_comment", "handle_decl",
    "handle_pi"}

class PrettyHTMLParser(HTMLParser):
    """
    HTML parser which delivers every run of text as one string.

    This parser does not split up data in arbitrary chunks like
    html.parser.HTMLParser, so you can handle it more easily:
    all pieces of text (including character and entity
    references) are buffered, joined and passed to handle_data()
    in a single call.
    Note that data is not handled when it is encountered but
    immediately before handling the following non-data part of
    the page (see MAIN_HANDLERS), or when close() is called.
    Note that handle_charref() and handle_entityref() are
    reserved by PrettyHTMLParser. Do not override them!
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the underlying HTMLParser and the data buffer.
        All arguments are passed to HTMLParser unchanged.
        """
        super().__init__(*args, **kwargs)
        self._reset_data()


    def __getattribute__(self, name):
        """
        Redirect handler lookups to the buffering machinery.

        "handle_data" resolves to the internal buffering method,
        names listed in MAIN_HANDLERS resolve to a wrapper made by
        _handler(), every other name is looked up normally.
        """
        if name == "handle_data":
            return super().__getattribute__("_handle_data")
        elif name in MAIN_HANDLERS:
            return self._handler(name)
        else:
            return super().__getattribute__(name)


    def _handle_data(self, data):
        """
        Buffer a piece of text instead of handling it right away.
        """
        # Always append: the base parser may report one run of
        # text through several consecutive calls (e.g. text split
        # across feed() calls or a bare "&"). Replacing the buffer
        # here would silently drop the earlier pieces.
        self._data_buf.append(data)
        self._data_cont = False


    def _flush_data(self):
        """
        Pass the buffered text, joined to one string, to the real
        handle_data() and empty the buffer. Does nothing when no
        text is buffered.
        """
        if self._data_buf:
            # The normal lookup is used on purpose: self.handle_data
            # would return the buffering method again.
            h_data = super().__getattribute__("handle_data")
            h_data("".join(self._data_buf))
            self._reset_data()


    def _handler(self, name):
        """
        Return a callable which delivers the buffered text and
        then calls the real handler called `name` with the
        arguments it was given.
        """
        # Note: Calling super() inside wrapper() would fail with a
        # "super(): no arguments" exception, so the normal lookup
        # is captured here.
        plain_lookup = super().__getattribute__
        def wrapper(*args, **kwargs):
            self._flush_data()
            return plain_lookup(name)(*args, **kwargs)
        return wrapper


    def _reset_data(self):
        """
        Resets the data buffer and the data continuous flag.
        """
        self._data_buf, self._data_cont = [], False


    def reset(self):
        """
        Reset the parser and drop any text which is still
        buffered, so it can not leak into the next document.
        """
        super().reset()
        self._reset_data()


    def close(self):
        """
        Finish parsing and deliver the text following the last
        non-data part of the page, which would be lost otherwise.
        """
        super().close()
        self._flush_data()


    def handle_charref(self, name):
        """
        Buffer the character a numeric reference stands for.
        Do not override this method!
        """
        # html.unescape() replaces HTMLParser.unescape(), which
        # was removed in Python 3.9.
        data = unescape("&#{};".format(name))
        self._data_buf.append(data)
        self._data_cont = True


    def handle_entityref(self, name):
        """
        Buffer the character(s) a named reference stands for.
        Do not override this method!
        """
        data = unescape("&{};".format(name))
        self._data_buf.append(data)
        self._data_cont = True

class StatedHTMLParser(PrettyHTMLParser):
    """
    This HTML parser parent class uses a state variable to let
    the user do a more sophisticated parsing of HTML pages.

    I suggest setting the initial state and other instance
    variables in the __init__() method of the subclass.

    Individual handlers should manage self.state. Handlers in
    subclasses should be named keeping the following rule:
        handle_<self_state>_<handler_type>
    For example if self.state == "goals", then to handle data,
    self.handle_goals_data() is called, to handle a starttag,
    self.handle_goals_starttag() is called, etc. When the
    handler does not exist, self.common_handler() is called with
    the name of the missing handler followed by the arguments
    the handler would have received.
    """
    def __init__(self, *args, **kwargs):
        """
        Set the initial state to "" and initialize the parent
        parser with the given arguments.
        """
        # The state is set first so it already exists while the
        # parent classes are being initialized.
        self.state = ""
        super().__init__(*args, **kwargs)


    def common_handler(self, name, *args, **kwargs):
        """
        Fallback for events without a state-specific handler.
        `name` is the name of the handler which was not found.
        Does nothing by default; override it in the subclass.
        """
        pass


    def _stated_handler(self, name):
        """
        Return the handler of the current state for the generic
        handler called `name` (e.g. "handle_starttag"), or
        common_handler() bound to the state-specific name when
        no such handler exists.
        """
        h, n = name.split("_", 1)
        handler = "_".join((h, self.state, n))
        try:
            return super().__getattribute__(handler)
        except AttributeError:
            return partial(self.common_handler, handler)


    def _flush_data(self):
        """
        Pass the buffered text, joined to one string, to the data
        handler of the current state and empty the buffer. Does
        nothing when no text is buffered.
        """
        if self._data_buf:
            h_data = self._stated_handler("handle_data")
            h_data("".join(self._data_buf))
            self._reset_data()


    def _handler(self, name):
        """
        Return a callable which delivers the buffered text and
        then calls the handler of the current state for `name`.
        """
        def wrapper(*args, **kwargs):
            self._flush_data()
            # The state is looked up again here because the data
            # handler called by the flush may have changed it.
            return self._stated_handler(name)(*args, **kwargs)
        return wrapper


    def close(self):
        """
        Finish parsing, then deliver the text which is still
        buffered to the data handler of the current state, so
        text after the last non-data part of the page is not lost.
        """
        super().close()
        self._flush_data()


def skips_empty_data(method):
    """
    Decorator for data handlers: the decorated handler is called
    only when the data contains something other than whitespace,
    otherwise the call is skipped and None is returned.
    """
    @wraps(method)
    def guarded(parser_instance, data):
        # Empty or whitespace-only text never reaches the handler.
        if not data.strip():
            return None
        return method(parser_instance, data)
    return guarded

Diff to Previous Revision

--- revision 1 2013-12-13 22:11:49
+++ revision 2 2013-12-14 00:28:36
@@ -54,10 +54,10 @@
 
     def _handle_data(self, data):
         if self._data_cont:
-            self._data_buf += data
+            self._data_buf.append(data)
             self._data_cont = False
         else:
-            self._data_buf = data
+            self._data_buf = [data]
 
 
     def _handler(self, name):
@@ -68,7 +68,7 @@
                 # "SystemError: super(): no arguments"
                 # exception.
                 h_data = sup.__getattribute__("handle_data")
-                h_data(self._data_buf)
+                h_data("".join(self._data_buf))
                 self._reset_data()
             return sup.__getattribute__(name)(*args, **kwargs)
         return wrapper
@@ -78,14 +78,15 @@
         """
         Resets the data buffer and the data continous flag.
         """
-        self._data_buf, self._data_cont = "", False
+        self._data_buf, self._data_cont = [], False
 
 
     def handle_charref(self, name):
         """
         Do not ovveride this method!
         """
-        self._data_buf += self.unescape("&#{};".format(name))
+        data = self.unescape("&#{};".format(name))
+        self._data_buf.append(data)
         self._data_cont = True
 
 
@@ -93,7 +94,8 @@
         """
         Do not ovveride this method!
         """
-        self._data_buf += self.unescape("&{};".format(name))
+        data = self.unescape("&{};".format(name))
+        self._data_buf.append(data)
         self._data_cont = True
 
 class StatedHTMLParser(PrettyHTMLParser):
@@ -134,7 +136,7 @@
         def wrapper(*args, **kwargs):
             if self._data_buf:
                 h_data = self._stated_handler("handle_data")
-                h_data(self._data_buf)
+                h_data("".join(self._data_buf))
                 self._reset_data()
             return self._stated_handler(name)(*args, **kwargs)
         return wrapper

History