1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59 | import sgmllib, string
class StrippingParser(sgmllib.SGMLParser):
# These are the HTML tags that we will leave intact
valid_tags = ('b', 'a', 'i', 'br', 'p')
from htmlentitydefs import entitydefs # replace entitydefs from sgmllib
def __init__(self):
sgmllib.SGMLParser.__init__(self)
self.result = ""
self.endTagList = []
def handle_data(self, data):
if data:
self.result = self.result + data
def handle_charref(self, name):
self.result = "%s&#%s;" % (self.result, name)
def handle_entityref(self, name):
if self.entitydefs.has_key(name):
x = ';'
else:
# this breaks unstandard entities that end with ';'
x = ''
self.result = "%s&%s%s" % (self.result, name, x)
def unknown_starttag(self, tag, attrs):
""" Delete all tags except for legal ones """
if tag in self.valid_tags:
self.result = self.result + '<' + tag
for k, v in attrs:
if string.lower(k[0:2]) != 'on' and string.lower(v[0:10]) != 'javascript':
self.result = '%s %s="%s"' % (self.result, k, v)
endTag = '</%s>' % tag
self.endTagList.insert(0,endTag)
self.result = self.result + '>'
def unknown_endtag(self, tag):
if tag in self.valid_tags:
self.result = "%s</%s>" % (self.result, tag)
remTag = '</%s>' % tag
self.endTagList.remove(remTag)
def cleanup(self):
""" Append missing closing tags """
for j in range(len(self.endTagList)):
self.result = self.result + self.endTagList[j]
def strip(s):
    """Return string s with all HTML tags outside the allowed set removed.

    A fresh StrippingParser is run over ``s``; its accumulated ``result``
    is returned once the parser has been closed and cleaned up.
    """
    stripper = StrippingParser()
    stripper.feed(s)
    stripper.close()
    # Closing tags the input left out are appended here.
    stripper.cleanup()
    return stripper.result
|
Problems with XHTML. This seems to have trouble with XHTML-style self-closed elements, for instance br, hr and img. When such elements are encountered, the stripping parser outputs a single greater-than character.
Updated for XHTML. I'm not sure when HTMLParser.HTMLParser was introduced to the Python library, but my guess is after Itamar wrote this. You can use that instead of sgmllib.SGMLParser. Just change the import:
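    import HTMLParser
instead of
    import sgmllib
and change
    class StrippingParser(sgmllib.SGMLParser):
to
    class StrippingParser(HTMLParser.HTMLParser):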
You should be all set ;-)
Thanks! Thanks! Using HTMLParser instead of sgmllib does indeed fix that issue. (The __init__() method of StrippingParser needs to be changed, too, of course.)
Doesn't leave valid_tags elements anymore. The change over to HTMLParser.HTMLParser doesn't leave the elements specified in valid_tags -- it strips out all tags. The method unknown_starttag is not called at all.
I realize this post is ancient, but in case somebody stumbles across it in a Google search like I did -- after switching to HTMLParser, the methods should be named "handle_starttag" and "handle_endtag" instead of "unknown_starttag" and "unknown_endtag".