Welcome, guest | Sign In | My Account | Store | Cart

This module provides a function to summarize an XHTML string - that is, to shorten it so that it has no more than a given number of words while keeping XHTML (or, hopefully, any XML) tags intact.

No third-party packages are required (as of Python 2.4; it should also work in earlier versions).

Python, 161 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# -*- encoding: utf-8 -*-
import re

# Precompiled patterns used by the helpers and summarize_html below.
# All pattern literals are raw strings so that "\s" reaches the regex engine
# verbatim instead of being treated as an (invalid) string escape.

# An opening tag plus at most one whitespace character on either side. Because
# the tag body may end in "/", this also matches self-closing tags, and it
# matches "<!...>" comments/declarations as well.
# NOTE: the first group is spelled "starws" (sic); nothing visible in this
# file reads it by name.
_tagopenre = re.compile(r'(?P<starws>\s?)<(?P<tagname>[^/][^> /]*)(?P<tagcontents>[^>]*)>(?P<endws>\s?)',re.MULTILINE)
# A closing tag ("</name>") plus at most one whitespace character on either side.
_tagclosere = re.compile(r'(?P<startws>\s?)</(?P<tagname>[^>]+)>(?P<endws>\s?)')
# A self-closing tag ("<name ... />"). The first character of the tag may not
# be a digit, so numeric placeholders such as "<12/>" are never matched.
_tagselfre = re.compile(r'(?P<startws>\s?)<(?P<tag>[^\d/>][^/>]*)/>(?P<endws>\s?)',re.MULTILINE)
# Anything between "<" and ">"; group 1 is the text inside the brackets.
_tagre = re.compile(r'<([^>]+)>', re.MULTILINE)
# Detects a tag text that ends in "/>" (optionally followed by whitespace).
_selfclosere = re.compile(r'/\s*>\s*$')
# A run of whitespace.
_wsre = re.compile(r'\s+', re.MULTILINE)
# A run of whitespace, or a single ">".
_wsgtre = re.compile(r'\s+|>', re.MULTILINE)

def _find_ws(s, start=0, end=None):
    '''Find whitespace. Interface similar to str.find.

    Returns the index in ``s`` of the first whitespace character located
    within ``s[start:end]``, or -1 when there is none. ``start`` and ``end``
    follow slice semantics, so negative and out-of-range values are accepted.
    '''
    # Turn the bounds into absolute, in-range offsets first. Adding a raw
    # negative ``start`` to the match position would give a wrong index.
    lo, hi, _ = slice(start, end).indices(len(s))
    # Searching with pos/endpos avoids copying the string and yields an
    # index that is already relative to the whole of ``s``.
    found = _wsre.search(s, lo, hi)
    if found:
        return found.start()
    return -1

def _find_ws_or_gt(s, start=0, end=None):
    '''Find whitespace or greater than ('>') sign. Interface similar to str.find.

    Returns the index in ``s`` of the first whitespace character or '>'
    located within ``s[start:end]``, or -1 when there is none. ``start`` and
    ``end`` follow slice semantics, so negative and out-of-range values are
    accepted.
    '''
    # Turn the bounds into absolute, in-range offsets first. Adding a raw
    # negative ``start`` to the match position would give a wrong index.
    lo, hi, _ = slice(start, end).indices(len(s))
    # Searching with pos/endpos avoids copying the string and yields an
    # index that is already relative to the whole of ``s``.
    found = _wsgtre.search(s, lo, hi)
    if found:
        return found.start()
    return -1

def summarize_html(html, maxwords = 25):
    '''Shorten an XHTML string to about ``maxwords`` words, keeping tags balanced.

    Every tag is first swapped for a numbered placeholder ("<3>", "</4>",
    "<5/>") so the text can be split on whitespace; the tokens are then
    walked, counting words and tracking open tags, until the limit is hit.

    html     -- the markup to shorten; None yields ''.
    maxwords -- the number of words to keep (default 25).

    Returns the shortened markup with whitespace runs collapsed to single
    spaces. When the input holds fewer than ``maxwords`` words it is returned
    whole (whitespace-normalized). Otherwise the output is cut after
    ``maxwords`` words, '...' is appended, and every tag still open is
    closed. A table row that is open when the limit is reached is completed
    first, and no '...' is added after it.

    Raises ValueError when a closing tag has no matching opening tag or
    closes a different tag than the innermost open one.

    NOTE: when the input holds exactly ``maxwords`` words followed by further
    tags, '...' is still appended although no word was dropped.
    '''
    if html is None:
        return ''

    # taglist[n] is the original text of the tag that placeholder n stands for
    taglist = []

    def tagopen_sub(match):
        tag = match.group()
        taglist.append(tag)
        # the opening-tag pattern also catches "<br/>"-style tags
        if _selfclosere.search(tag):
            return '<%d/>' % (len(taglist) - 1)
        return '<%d>' % (len(taglist) - 1)

    def tagclose_sub(match):
        taglist.append(match.group())
        return '</%d>' % (len(taglist) - 1)

    def tagself_sub(match):
        taglist.append(match.group())
        return '<%d/>' % (len(taglist) - 1)

    def opening_name(tag):
        # "<p class='x'>" -> "p"
        inner = tag[tag.find('<')+1:tag.find('>')]
        ws = _find_ws(inner)
        if ws > 0:
            inner = inner[:ws]
        return inner

    # preprocess text, fill taglist
    tagged = html.replace('&nbsp;', ' ')
    tagged = _tagopenre.sub(tagopen_sub, tagged)
    tagged = _tagclosere.sub(tagclose_sub, tagged)
    tagged = _tagselfre.sub(tagself_sub, tagged)
    # pad every placeholder with spaces so split() isolates it
    tagged = _tagre.sub(r' <\1> ', tagged)

    out = []                # output pieces accepted so far
    words = 0
    tagstack = []           # original text of the currently open tags
    addspace = False        # does the next word need a leading space?
    add_dots = False        # becomes True when the text is cut mid-flow
    open_rows = 0           # nesting depth of open <tr> elements
    was_table_row = False   # was the last closed tag a </tr>?
    for elem in tagged.split():
        # end condition; never stop in the middle of a table row
        if words >= maxwords and not open_rows:
            # special case: no dots right after a completed table row
            add_dots = not was_table_row
            break
        if not elem.startswith('<'):
            # an usual word
            words += 1
            if addspace:
                out.append(' '+elem)
            else:
                out.append(elem)
            addspace = True
        elif elem.endswith('/>'):
            # a self-closing tag
            out.append(taglist[int(elem[1:-2])])
            addspace = False
        elif elem.startswith('</'):
            # a closing tag
            addspace = False
            if not tagstack:
                raise ValueError('tag not opened: %s'%elem)
            top = opening_name(tagstack[-1])
            fromlist = taglist[int(elem[2:-1])]
            # strip() tolerates the legal "</p >" spelling
            tag = fromlist[fromlist.find('/')+1:fromlist.find('>')].strip()
            if top != tag:
                raise ValueError('tag not closed properly: %s, got %s'%(top, tag))
            # special case: tables
            # some other tags could use special-casing, like dt/dd
            if top == 'tr':
                open_rows = max(open_rows - 1, 0)
                was_table_row = True
            else:
                was_table_row = False
            tagstack.pop()
            out.append(fromlist)
        else:
            # an opening tag
            tag = taglist[int(elem[1:-1])]
            # compare the whole tag name, so "<track>" is not taken for a row
            if opening_name(tag) == 'tr':
                open_rows += 1
            # comment tags need not be closed
            if not tag.strip().startswith('<!'):
                tagstack.append(tag)
            out.append(tag)
            addspace = False

    if words < maxwords:
        # normalize whitespace (actually not needed... makes the tests pass, though)
        return _wsre.sub(' ', html)

    if add_dots:
        # take care so no monstrousities like '......' appear at the end of
        # output; punctuation is only dropped when dots really follow it
        if out and out[-1][-1] in ('.', ',', ':', '?', '!'):
            out[-1] = re.sub(r'[\.,:?!]+$', '', out[-1])
        out.append('...')

    # close remaining open tags, innermost first
    closers = ['</%s>'%x[x.find('<')+1:_find_ws_or_gt(x,1)] for x in tagstack[::-1]]
    # normalize whitespace...
    return _wsre.sub(' ', ''.join(out + closers))

This recipe was created to summarize HTML articles written in TinyMCE embedded in a Django website. The approach taken is probably suboptimal, but it didn't require an advanced, stateful lexer.

Known issues:

  • There's a special case for tables, so rows don't end abruptly.
  • Comments are not handled correctly in all cases - whitespace before and/or after a comment may not be retained. Empty tags may also have whitespace issues.

1 comment

a 11 years, 7 months ago  # | flag

This horrible expression, which appears 3 times: match.string[match.start():match.end()] should be replaced with just match.group()

Created by Marek Baczynski on Tue, 19 Dec 2006 (PSF)
Python recipes (4591)
Marek Baczynski's recipes (2)

Required Modules

Other Information and Tasks