Welcome, guest | Sign In | My Account | Store | Cart
'''
Remove diacritical marks from strings containing characters from any
Latin alphabet.

Tested on both Python 2.x and Python 3.x
'''
import unicodedata

def remove_diacritic(input):
    '''
    Strip diacritical marks from a unicode string.

    The text is first decomposed with Unicode NFKD normalization, which
    splits an accented character into its base character followed by
    combining marks. It is then encoded to ASCII with the 'ignore' error
    handler, so the combining marks (and any other character that has no
    ASCII form after decomposition) are silently dropped.

    Returns the encoded result: a byte string (``bytes`` on Python 3,
    ``str`` on Python 2).
    '''
    # Step 1: compatibility decomposition (base characters + combining marks).
    decomposed = unicodedata.normalize('NFKD', input)
    # Step 2: keep only what fits in ASCII; everything else is discarded.
    return decomposed.encode('ASCII', 'ignore')

if __name__ == '__main__':
    # Small self-check: strip the accents from a French sentence and compare
    # the result against the expected plain-ASCII text.
    import sys

    # The escapes are Latin-1 code points: \xc0 is A-grave, \xe9 is e-acute.
    sample = '\xc0 quelle \xe9cole va-tu?'

    running_py3 = sys.version_info[0] >= 3
    if not running_py3:
        # Python 2: the literal above is a byte string, so decode it as
        # ISO-8859-1 to obtain the unicode object the function expects.
        stripped = remove_diacritic(unicode(sample, 'ISO-8859-1'))
    else:
        # Python 3: the literal is already text; the function hands back
        # bytes, which are decoded to text for printing and comparison.
        stripped = remove_diacritic(sample).decode()

    print(sample)
    print(stripped)
    assert stripped == 'A quelle ecole va-tu?'

History

  • revision 7 (15 years ago)
  • previous revisions are not available