def encode_for_xml(unicode_data, encoding='ascii'):
    """
    Encode unicode_data for use as XML or HTML, with characters outside
    of the encoding converted to XML numeric character references.

    Parameters:
        unicode_data: the Unicode text to encode.
        encoding: name of the target codec (default 'ascii').

    Returns the encoded byte string.  Characters the codec cannot
    represent appear as decimal references such as ``&#8364;``.
    """
    try:
        return unicode_data.encode(encoding, 'xmlcharrefreplace')
    except ValueError:
        # ValueError is raised if there are unencodable chars in the
        # data and the 'xmlcharrefreplace' error handler is not found.
        # Pre-2.3 Python doesn't support the 'xmlcharrefreplace' error
        # handler, so we'll emulate it.
        return _xmlcharref_encode(unicode_data, encoding)


def _xmlcharref_encode(unicode_data, encoding):
    """
    Emulate Python 2.3's 'xmlcharrefreplace' encoding error handler.

    Parameters:
        unicode_data: the Unicode text to encode.
        encoding: name of the target codec.

    Returns the encoded byte string; every character that cannot be
    encoded strictly is replaced by its decimal character reference.
    """
    encoded_parts = []
    # Step through the unicode_data string one character at a time in
    # order to catch unencodable characters:
    for char in unicode_data:
        try:
            encoded_parts.append(char.encode(encoding, 'strict'))
        except UnicodeError:
            # Encode the reference itself so that every collected piece
            # is a byte string in the target encoding; mixing text and
            # bytes here would make the join below fail on Python 3.
            reference = u'&#%i;' % ord(char)
            encoded_parts.append(reference.encode(encoding))
    return b''.join(encoded_parts)


if __name__ == '__main__':
    # demo
    data = u'''\
<html>
<head>
<title>Encoding Test</title>
</head>
<body>
<p>accented characters:</p>
<ul>
<li>\xe0 (a + grave)
<li>\xe7 (c + cedilla)
<li>\xe9 (e + acute)
<li>\xee (i + circumflex)
<li>\xf1 (n + tilde)
<li>\xfc (u + umlaut)
</ul>
<p>symbols:</p>
<ul>
<li>\xa3 (British pound)
<li>\xa2 (cent)
<li>\u20ac (Euro)
<li>\u221e (infinity)
<li>\xb0 (degree)
</ul>
</body></html>
'''
    # The result is pure ASCII, so decoding it yields the same text to
    # print under both the print statement and the print function.
    print(encode_for_xml(data, 'ascii').decode('ascii'))