Welcome, guest | Sign In | My Account | Store | Cart

Recipe 303668 revision 2

def encode_for_xml(unicode_data, encoding='ascii'):
    """
    Encode unicode_data for use as XML or HTML, with characters outside
    of the encoding converted to XML numeric character references.
    """
    try:
        return unicode_data.encode(encoding, 'xmlcharrefreplace')
    except ValueError:
        # ValueError is raised if there are unencodable chars in the
        # data and the 'xmlcharrefreplace' error handler is not found.
        # Pre-2.3 Python doesn't support the 'xmlcharrefreplace' error
        # handler, so we'll emulate it.
        return _xmlcharref_encode(unicode_data, encoding)

def _xmlcharref_encode(unicode_data, encoding):
    """Emulate Python 2.3's 'xmlcharrefreplace' encoding error handler."""
    chars = []
    # Step through the unicode_data string one character at a time in
    # order to catch unencodable characters:
    for char in unicode_data:
        try:
            chars.append(char.encode(encoding, 'strict'))
        except UnicodeError:
            chars.append('&#%i;' % ord(char))
    return ''.join(chars)


if __name__ == '__main__':
    # demo
    data = u'''\
<html>
<head>
<title>Encoding Test</title>
</head>
<body>
<p>accented characters:</p>
<ul>
<li>\xe0 (a + grave)
<li>\xe7 (c + cedilla)
<li>\xe9 (e + acute)
<li>\xee (i + circumflex)
<li>\xf1 (n + tilde)
<li>\xfc (u + umlaut)
</ul>
<p>symbols:</p>
<ul>
<li>\xa3 (British pound)
<li>\xa2 (cent)
<li>\u20ac (Euro)
<li>\u221e (infinity)
<li>\xb0 (degree)
</ul>
</body></html>
'''
    print encode_for_xml(data, 'ascii')

« Back to Recipe 303668

History

revision 2 (19 years ago)
previous revisions are not available

Accounts

Code Recipes

Feedback & Information

ActiveState

© 2024 ActiveState Software Inc. All rights reserved. ActiveState®, Komodo®, ActiveState Perl Dev Kit®, ActiveState Tcl Dev Kit®, ActivePerl®, ActivePython®, and ActiveTcl® are registered trademarks of ActiveState. All other marks are property of their respective owners.