This module provides a function to summarize an XHTML string - that is, to shorten it so that it has no more than a given number of words while keeping XHTML (or, hopefully, any XML) tags intact.
No third-party packages are required (as of Python 2.4, but should work in earlier versions.)
# -*- encoding: utf-8 -*-
import re
# Pre-compiled patterns used by the functions below.  Every pattern is a raw
# string so that ``\s`` reaches the regex engine untouched instead of being an
# invalid string escape.

# A tag whose first character after '<' is not '/': opening tags, but also
# self-closing tags and '<!...>' constructs.  One optional whitespace
# character on either side is captured along with the tag.
_tagopenre = re.compile(r'(?P<startws>\s?)<(?P<tagname>[^/][^> /]*)(?P<tagcontents>[^>]*)>(?P<endws>\s?)', re.MULTILINE)
# A closing tag ('</name>') with one optional whitespace character on either side.
_tagclosere = re.compile(r'(?P<startws>\s?)</(?P<tagname>[^>]+)>(?P<endws>\s?)')
# A self-closing tag ('<name ... />') whose name does not start with a digit.
_tagselfre = re.compile(r'(?P<startws>\s?)<(?P<tag>[^\d/>][^/>]*)/>(?P<endws>\s?)', re.MULTILINE)
# Anything enclosed in angle brackets.
_tagre = re.compile(r'<([^>]+)>', re.MULTILINE)
# A '/>' at the very end of a string, optionally followed by whitespace.
_selfclosere = re.compile(r'/\s*>\s*$')
# A run of whitespace.
_wsre = re.compile(r'\s+', re.MULTILINE)
# A run of whitespace, or a single '>'.
_wsgtre = re.compile(r'\s+|>', re.MULTILINE)
def _find_ws(s, start=0, end=None):
    '''Locate the first whitespace character in ``s[start:end]``.

    Works along the lines of ``str.find``: the result is the position of the
    match inside the slice plus ``start``, or -1 when the slice holds no
    whitespace at all.
    '''
    # Slicing with ``end=None`` is the same as leaving the upper bound out.
    hit = _wsre.search(s[start:end])
    if hit is None:
        return -1
    return start + hit.start()
def _find_ws_or_gt(s, start=0, end=None):
    '''Locate the first whitespace character or '>' sign in ``s[start:end]``.

    Works along the lines of ``str.find``: the result is the position of the
    match inside the slice plus ``start``, or -1 when the slice holds neither
    whitespace nor a '>'.
    '''
    # Slicing with ``end=None`` is the same as leaving the upper bound out.
    hit = _wsgtre.search(s[start:end])
    if hit is None:
        return -1
    return start + hit.start()
def summarize_html(html, maxwords=25):
    '''Shorten an (X)HTML string to about ``maxwords`` words, keeping tags balanced.

    Every tag is first swapped for a numbered placeholder (``<3>``, ``</4>``,
    ``<5/>``) so that the text can be split on whitespace.  The pieces are
    then copied to the output, with the original tag text restored, until the
    word limit is reached.  Tags still open at that point are closed and
    ``...`` is appended to mark the cut.

    Parameters:
        html: the markup to shorten; ``None`` gives ``''``.
        maxwords: number of words after which the text is cut.

    Returns:
        The shortened markup with every run of whitespace collapsed to one
        space.  If ``html`` holds fewer than ``maxwords`` words it is returned
        whole, only whitespace-normalized.

    Raises:
        ValueError: if a closing tag appears while no tag is open, or if it
            does not match the innermost open tag.

    Special cases:
        * The text is never cut while a ``<tr>`` is open, so table rows are
          always finished.  No ``...`` is added when the last closing tag
          seen before the cut was ``</tr>``.
        * Tags starting with ``<!`` are copied through but never have to be
          closed.
        * ``...`` is added whenever any piece, even a lone closing tag, is
          still unprocessed once the limit has been reached.

    Example:
        summarize_html('<p>one two three four</p>', 2) gives
        '<p>one two...</p>'.
    '''
    if html is None:
        return ''
    taglist = []    # original text of each tag; a placeholder's number is its index here

    def stash(match, template):
        # Remember the matched tag text and hand back its numbered placeholder.
        taglist.append(match.group())
        return template % (len(taglist) - 1)

    def tagopen_sub(match):
        # _tagopenre also matches self-closing tags, so tell the two apart here.
        if _selfclosere.search(match.group()):
            return stash(match, '<%d/>')
        return stash(match, '<%d>')

    def tagclose_sub(match):
        return stash(match, '</%d>')

    def tagself_sub(match):
        return stash(match, '<%d/>')

    # preprocess text, fill taglist
    # Non-breaking spaces become plain spaces so they separate words like any
    # other whitespace.
    # NOTE(review): the first literal below should be a single U+00A0
    # character -- confirm it survived copy/paste.
    tagged = html.replace(' ', ' ')
    tagged = _tagopenre.sub(tagopen_sub, tagged)
    tagged = _tagclosere.sub(tagclose_sub, tagged)
    tagged = _tagselfre.sub(tagself_sub, tagged)
    # pad the placeholders so that split() isolates each of them
    tagged = _tagre.sub(r' <\1> ', tagged)

    # setup for processing
    kept = []               # output pieces, in order
    words = 0
    tagstack = []           # text of the tags that are currently open
    addspace = False        # True when the previous piece was a word
    open_rows = 0           # number of <tr> tags currently open
    was_table_row = False   # True when the last closing tag seen was </tr>
    truncated = False       # True when the loop stopped before the last piece
    for elem in tagged.split():
        # end condition; never stop in the middle of a table row
        if words >= maxwords and not open_rows:
            truncated = True
            break
        # an usual word
        if not elem.startswith('<'):
            words += 1
            if addspace:
                elem = ' ' + elem
            addspace = True
            kept.append(elem)
            continue
        addspace = False
        # a self-closing tag
        if elem.endswith('/>'):
            kept.append(taglist[int(elem[1:-2])])
        # a closing tag
        elif elem.startswith('</'):
            fromlist = taglist[int(elem[2:-1])]
            if not tagstack:
                # report the real tag rather than the internal placeholder
                raise ValueError('tag not opened: %s' % fromlist.strip())
            # extract the tagname from top of the stack
            top = tagstack[-1]
            top = top[top.find('<') + 1:top.find('>')]
            cut = _find_ws(top)
            if cut > 0:
                top = top[:cut]
            # extract the tagname from the closing tag itself
            tag = fromlist[fromlist.find('/') + 1:fromlist.find('>')]
            if top != tag:
                raise ValueError('tag not closed properly: %s, got %s' % (top, tag))
            # special case: tables
            # some other tags could use special-casing, like dt/dd
            if top == 'tr':
                open_rows = max(open_rows - 1, 0)
                was_table_row = True
            else:
                was_table_row = False
            # close the tag
            tagstack.pop()
            kept.append(fromlist)
        # an opening tag
        else:
            tag = taglist[int(elem[1:-1])]
            tested = tag.strip()
            # Compare the complete tag name, so that only <tr> itself counts
            # as a table row and not every tag whose name begins with "tr".
            if tested[1:_find_ws_or_gt(tested, 1)] == 'tr':
                open_rows += 1
            # comment tags need not be closed
            if not tested.startswith('<!'):
                tagstack.append(tag)
            kept.append(tag)

    if words < maxwords:
        # normalize whitespace (actually not needed... makes the tests pass, though)
        return _wsre.sub(' ', html)

    add_dots = truncated and not was_table_row
    # take care so no monstrosities like '......' appear at the end of output;
    # punctuation is only dropped when an ellipsis really follows it
    if add_dots and kept:
        kept[-1] = kept[-1].rstrip('.,:?!')
    # close remaining open tags, innermost first
    tagstack.reverse()
    closers = ['</%s>' % x[x.find('<') + 1:_find_ws_or_gt(x, 1)] for x in tagstack]
    if add_dots:
        result = kept + ['...'] + closers
    else:
        result = kept + closers
    # normalize whitespace...
    return _wsre.sub(' ', ''.join(result))
 | 
This recipe was created to summarize html articles written in tinymce embedded in a django website. Approach taken is probably suboptimal, but didn't require an advanced, stateful lexer.
Known issues:
- There's a special case for tables, so rows don't end abruptly.
 - Comments are not handled correctly in all cases - whitespace before and/or after a comment may not be retained. Empty tags may also have whitespace issues.
 
Download
Copy to clipboard
This horrible expression, which appears 3 times:
match.string[match.start():match.end()] should be replaced with just match.group()