Welcome, guest | Sign In | My Account | Store | Cart

This cleanses user input of potentially dangerous HTML or scripting code that can be used to launch "cross-site scripting" ("XSS") attacks, or run other harmful or annoying code. You want to run this on any user-entered text that will be saved and retransmitted to other users of your web site. This uses only standard Python libraries.

Python, 114 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from htmllib import HTMLParser
from cgi import escape
from urlparse import urlparse
from formatter import AbstractFormatter
from htmlentitydefs import entitydefs
from xml.sax.saxutils import quoteattr

def xssescape(text):
    """Gets rid of < and > and & and, for good measure, :"""
    return escape(text, quote=True).replace(':','&#58;')

class XssCleaner(HTMLParser):
    def __init__(self, fmt = AbstractFormatter):
        HTMLParser.__init__(self, fmt)
        self.result = ""
        self.open_tags = []
        # A list of the only tags allowed.  Be careful adding to this.  Adding
        # "script," for example, would not be smart.  'img' is out by default 
        # because of the danger of IMG embedded commands, and/or web bugs.
        self.permitted_tags = ['a', 'b', 'blockquote', 'br', 'i', 
                          'li', 'ol', 'ul', 'p', 'cite']

        # A list of tags that require no closing tag.
        self.requires_no_close = ['img', 'br']

        # A dictionary showing the only attributes allowed for particular tags.
        # If a tag is not listed here, it is allowed no attributes.  Adding
        # "on" tags, like "onhover," would not be smart.  Also be very careful
        # of "background" and "style."
        self.allowed_attributes = \
            {'a':['href','title'],
             'img':['src','alt'],
             'blockquote':['type']}

        # The only schemes allowed in URLs (for href and src attributes).
        # Adding "javascript" or "vbscript" to this list would not be smart.
        self.allowed_schemes = ['http','https','ftp']
    def handle_data(self, data):
        if data:
            self.result += xssescape(data)
    def handle_charref(self, ref):
        if len(ref) < 7 and ref.isdigit():
            self.result += '&#%s;' % ref
        else:
            self.result += xssescape('&#%s' % ref)
    def handle_entityref(self, ref):
        if ref in entitydefs:
            self.result += '&%s;' % ref
        else:
            self.result += xssescape('&%s' % ref)
    def handle_comment(self, comment):
        if comment:
            self.result += xssescape("<!--%s-->" % comment)

    def handle_starttag(self, tag, method, attrs):
        if tag not in self.permitted_tags:
            self.result += xssescape("<%s>" %  tag)
        else:
            bt = "<" + tag
            if tag in self.allowed_attributes:
                attrs = dict(attrs)
                self.allowed_attributes_here = \
                  [x for x in self.allowed_attributes[tag] if x in attrs \
                   and len(attrs[x]) > 0]
                for attribute in self.allowed_attributes_here:
                    if attribute in ['href', 'src', 'background']:
                        if self.url_is_acceptable(attrs[attribute]):
                            bt += ' %s="%s"' % (attribute, attrs[attribute])
                    else:
                        bt += ' %s=%s' % \
                           (xssescape(attribute), quoteattr(attrs[attribute]))
            if bt == "<a" or bt == "<img":
                return
            if tag in self.requires_no_close:
                bt += "/"
            bt += ">"                     
            self.result += bt
            self.open_tags.insert(0, tag)
            
    def handle_endtag(self, tag, attrs):
        bracketed = "</%s>" % tag
        if tag not in self.permitted_tags:
            self.result += xssescape(bracketed)
        elif tag in self.open_tags:
            self.result += bracketed
            self.open_tags.remove(tag)
            
    def unknown_starttag(self, tag, attributes):
        self.handle_starttag(tag, None, attributes)
    def unknown_endtag(self, tag):
        self.handle_endtag(tag, None)
    def url_is_acceptable(self,url):
        ### Requires all URLs to be "absolute."
        parsed = urlparse(url)
        return parsed[0] in self.allowed_schemes and '.' in parsed[1]
    def strip(self, rawstring):
        """Returns the argument stripped of potentially harmful HTML or Javascript code"""
        self.result = ""
        self.feed(rawstring)
        for endtag in self.open_tags:
            if endtag not in self.requires_no_close:
                self.result += "</%s>" % endtag
        return self.result
    def xtags(self):
        """Returns a printable string informing the user which tags are allowed"""
        self.permitted_tags.sort()
        tg = ""
        for x in self.permitted_tags:
            tg += "<" + x
            if x in self.allowed_attributes:
                for y in self.allowed_attributes[x]:
                    tg += ' %s=""' % y
            tg += "> "
        return xssescape(tg.strip())

Ordinarily, cgi.escape() provides complete protection from harmful HTML and javascript, because it strips out all tags completely. However, sometimes you want to let the user use some HTML tags to include links, formatting, and so on. This code lets you allow your users to define a subset of HTML tags that you choose, but scrubs them carefully to make sure no harmful code comes through.

Usage:

>>> import xss
>>> x = xss.XssCleaner()
>>> test = """A good link should remain.
A bad link should be stripped.
You should not mangle this link.
Authorized tags are fine, but unauthorized tags are not.
An ugly link  should be removed.
Ampersands Ordinarily, cgi.escape() provides complete protection from harmful HTML and javascript, because it strips out all tags completely.  However, sometimes you want to let the user use some HTML tags to include links, formatting, and so on.  This code lets you allow your users to define a subset of HTML tags that you choose, but scrubs them carefully to make sure no harmful code comes through.

Usage:

>>> import xss
>>> x = xss.XssCleaner()
>>> test = """A good link should remain.
A bad link should be stripped.
You should not mangle this link.
Authorized tags are fine, but unauthorized tags are not.
An ugly link  should be removed.
Ampersands

1 comment

DP 9 years, 11 months ago  # | flag

Hi I am new to python. I was unable to unserstand, How to use this code snippet? Please give an example usage, much helpful..