This cleanses user input of potentially dangerous HTML or scripting code that can be used to launch "cross-site scripting" ("XSS") attacks, or run other harmful or annoying code. You want to run this on any user-entered text that will be saved and retransmitted to other users of your web site. This uses only standard Python libraries.
from htmllib import HTMLParser
from cgi import escape
from urlparse import urlparse
from formatter import AbstractFormatter
from htmlentitydefs import entitydefs
from xml.sax.saxutils import quoteattr
def xssescape(text):
    """Return *text* with HTML-significant characters neutralised.

    ``&``, ``<``, ``>`` and ``"`` are converted to their entities by
    ``escape(..., quote=True)``.  For good measure every ``:`` is then
    rewritten as the numeric character reference ``&#58;`` so that escaped
    text can never spell out a URL scheme such as ``javascript:``.
    """
    # escape() never produces a colon itself, so replacing afterwards cannot
    # corrupt the entities it just generated.
    return escape(text, quote=True).replace(':', '&#58;')
class XssCleaner(HTMLParser):
    """Whitelist-based HTML cleaner for user-supplied text.

    Input is fed through the parser; every callback appends to
    ``self.result``.  Tags listed in ``permitted_tags`` are re-emitted with
    only the attributes named in ``allowed_attributes``; every other piece of
    markup is passed through ``xssescape`` so it is shown as text instead of
    being interpreted.  Call ``strip()`` to clean a string.
    """

    def __init__(self, fmt = AbstractFormatter):
        """Set up the parser and the default whitelists.

        ``fmt`` is handed to ``HTMLParser.__init__`` unchanged.
        """
        HTMLParser.__init__(self, fmt)
        # Sanitized output accumulated by the handler callbacks.
        self.result = ""
        # Permitted tags emitted so far and not yet closed, most recent first.
        self.open_tags = []
        # A list of the only tags allowed.  Be careful adding to this.  Adding
        # "script," for example, would not be smart.  'img' is out by default
        # because of the danger of IMG embedded commands, and/or web bugs.
        self.permitted_tags = ['a', 'b', 'blockquote', 'br', 'i',
                               'li', 'ol', 'ul', 'p', 'cite']
        # A list of tags that require no closing tag.
        self.requires_no_close = ['img', 'br']
        # A dictionary showing the only attributes allowed for particular tags.
        # If a tag is not listed here, it is allowed no attributes.  Adding
        # "on" tags, like "onhover," would not be smart.  Also be very careful
        # of "background" and "style."
        self.allowed_attributes = {
            'a': ['href', 'title'],
            'img': ['src', 'alt'],
            'blockquote': ['type'],
        }
        # The only schemes allowed in URLs (for href and src attributes).
        # Adding "javascript" or "vbscript" to this list would not be smart.
        self.allowed_schemes = ['http', 'https', 'ftp']

    def handle_data(self, data):
        """Append plain text to the output, escaped."""
        if data:
            self.result += xssescape(data)

    def handle_charref(self, ref):
        """Keep short, purely numeric character references; escape the rest."""
        if len(ref) < 7 and ref.isdigit():
            self.result += '&#%s;' % ref
        else:
            self.result += xssescape('&#%s' % ref)

    def handle_entityref(self, ref):
        """Keep entity references named in ``entitydefs``; escape the rest."""
        if ref in entitydefs:
            self.result += '&%s;' % ref
        else:
            self.result += xssescape('&%s' % ref)

    def handle_comment(self, comment):
        """Emit a non-empty comment as escaped text rather than as markup."""
        if comment:
            self.result += xssescape("<!--%s-->" % comment)

    def handle_starttag(self, tag, method, attrs):
        """Emit a permitted start tag with its whitelisted attributes.

        A tag that is not permitted is written out escaped.  ``method`` is
        accepted for compatibility with the parser callback and is not used.
        """
        if tag not in self.permitted_tags:
            self.result += xssescape("<%s>" % tag)
            return

        bt = "<" + tag
        if tag in self.allowed_attributes:
            attrs = dict(attrs)
            # Whitelisted attributes that are present with a non-empty value.
            self.allowed_attributes_here = [
                x for x in self.allowed_attributes[tag]
                if x in attrs and len(attrs[x]) > 0
            ]
            for attribute in self.allowed_attributes_here:
                value = attrs[attribute]
                if attribute in ['href', 'src', 'background']:
                    if self.url_is_acceptable(value):
                        # quoteattr() picks a safe quote character and escapes
                        # the value.  Interpolating it raw between double
                        # quotes would let a value containing '"' close the
                        # attribute and inject new ones (e.g. an on* handler).
                        bt += ' %s=%s' % (attribute, quoteattr(value))
                else:
                    bt += ' %s=%s' % (xssescape(attribute), quoteattr(value))

        # An <a> or <img> left with no attributes at all is dropped entirely.
        if bt == "<a" or bt == "<img":
            return
        if tag in self.requires_no_close:
            bt += "/"
        bt += ">"
        self.result += bt
        self.open_tags.insert(0, tag)

    def handle_endtag(self, tag, attrs):
        """Emit an end tag only if it closes a tag this cleaner opened.

        End tags for tags that are not permitted are written out escaped.
        The second argument is not used.
        """
        bracketed = "</%s>" % tag
        if tag not in self.permitted_tags:
            self.result += xssescape(bracketed)
        elif tag in self.open_tags:
            self.result += bracketed
            self.open_tags.remove(tag)

    def unknown_starttag(self, tag, attributes):
        """Route start tags without a dedicated handler through the whitelist."""
        self.handle_starttag(tag, None, attributes)

    def unknown_endtag(self, tag):
        """Route end tags without a dedicated handler through the whitelist."""
        self.handle_endtag(tag, None)

    def url_is_acceptable(self,url):
        """Return True if *url* has an allowed scheme and a dotted host part."""
        ### Requires all URLs to be "absolute."
        parsed = urlparse(url)
        return parsed[0] in self.allowed_schemes and '.' in parsed[1]

    def strip(self, rawstring):
        """Returns the argument stripped of potentially harmful HTML or Javascript code"""
        # Start from a clean slate so a reused instance does not emit closing
        # tags left over from an earlier call.
        self.result = ""
        self.open_tags = []
        self.feed(rawstring)
        # Flush whatever the parser is still buffering (e.g. an unfinished
        # tag at the end of the input) so it is neither lost nor carried over
        # into the next string passed to strip().
        self.close()
        # Close any permitted tags the input left open.
        for endtag in self.open_tags:
            if endtag not in self.requires_no_close:
                self.result += "</%s>" % endtag
        self.open_tags = []
        return self.result

    def xtags(self):
        """Returns a printable string informing the user which tags are allowed"""
        tg = ""
        # sorted() leaves self.permitted_tags in its configured order.
        for x in sorted(self.permitted_tags):
            tg += "<" + x
            if x in self.allowed_attributes:
                for y in self.allowed_attributes[x]:
                    tg += ' %s=""' % y
            tg += "> "
        return xssescape(tg.strip())
|
Ordinarily, cgi.escape() provides complete protection from harmful HTML and JavaScript, because it escapes every tag so that none of them is rendered as markup. However, sometimes you want to let the user use some HTML tags to include links, formatting, and so on. This code lets you define a subset of HTML tags that your users are allowed to use, but scrubs them carefully to make sure no harmful code comes through.
Usage:
>>> import xss
>>> x = xss.XssCleaner()
>>> test = """A good link should remain.
A bad link should be stripped.
You should not mangle this link.
Authorized tags are fine, but unauthorized tags are not.
An ugly link should be removed.
Ampersands Ordinarily, cgi.escape() provides complete protection from harmful HTML and javascript, because it strips out all tags completely. However, sometimes you want to let the user use some HTML tags to include links, formatting, and so on. This code lets you allow your users to define a subset of HTML tags that you choose, but scrubs them carefully to make sure no harmful code comes through.
Usage:
>>> import xss
>>> x = xss.XssCleaner()
>>> test = """A good link should remain.
A bad link should be stripped.
You should not mangle this link.
Authorized tags are fine, but unauthorized tags are not.
An ugly link should be removed.
Ampersands
Hi, I am new to Python and was unable to understand how to use this code snippet. Could you please give an example of its usage? That would be very helpful.