This module provides a function to summarize an XHTML string - that is, to shorten it so that it has no more than a given number of words while keeping XHTML (or, hopefully, any XML) tags intact.
No third-party packages are required (as of Python 2.4, but should work in earlier versions.)
# -*- encoding: utf-8 -*-
import re
_tagopenre = re.compile(r'(?P<starws>\s?)<(?P<tagname>[^/][^> /]*)(?P<tagcontents>[^>]*)>(?P<endws>\s?)',re.MULTILINE)
_tagclosere = re.compile(r'(?P<startws>\s?)</(?P<tagname>[^>]+)>(?P<endws>\s?)')
_tagselfre = re.compile(r'(?P<startws>\s?)<(?P<tag>[^\d/>][^/>]*)/>(?P<endws>\s?)',re.MULTILINE)
_tagre = re.compile(r'<([^>]+)>', re.MULTILINE)
_selfclosere = re.compile('/\s*>\s*$')
_wsre = re.compile('\s+', re.MULTILINE)
_wsgtre = re.compile('\s+|>', re.MULTILINE)
def _find_ws(s, start=0, end=None):
    '''Return the index of the first whitespace character in s[start:end].

    The interface mirrors str.find: start and end are interpreted as in
    slice notation (negative values count from the end of s), the returned
    index is relative to the whole of s, and -1 is returned when no
    whitespace is found.
    '''
    # Resolve negative or out-of-range bounds the way slicing does, so the
    # result is always an absolute, non-negative position in s.  Searching
    # with pos/endpos also avoids copying the string.
    lo, hi, _ = slice(start, end).indices(len(s))
    found = _wsre.search(s, lo, hi)
    if found is None:
        return -1
    return found.start()
def _find_ws_or_gt(s, start=0, end=None):
    '''Return the index of the first whitespace or '>' in s[start:end].

    The interface mirrors str.find: start and end are interpreted as in
    slice notation (negative values count from the end of s), the returned
    index is relative to the whole of s, and -1 is returned when neither
    whitespace nor a greater-than sign is found.
    '''
    # Resolve negative or out-of-range bounds the way slicing does, so the
    # result is always an absolute, non-negative position in s.  Searching
    # with pos/endpos also avoids copying the string.
    lo, hi, _ = slice(start, end).indices(len(s))
    found = _wsgtre.search(s, lo, hi)
    if found is None:
        return -1
    return found.start()
def _opening_tag_name(tag):
    '''Return the element name of a stored opening tag, e.g. 'p' for ' <p class="x">'.'''
    lt = tag.find('<')
    return tag[lt + 1:_find_ws_or_gt(tag, lt + 1)]


def summarize_html(html, maxwords = 25):
    '''Shorten an XHTML string to about maxwords words, keeping tags balanced.

    Every tag is first replaced by a numbered placeholder (the original tag
    text, with up to one adjacent whitespace character on each side, is kept
    in a list).  The text is then split on whitespace and walked token by
    token, counting words and tracking the stack of open tags.  When the word
    limit is reached the output is cut, '...' is appended and every tag that
    is still open is closed.

    Table rows are special-cased: the cut is postponed until the row that is
    currently open has been closed, and no '...' is appended after a row.

    html     -- the markup to summarize; None yields ''.
    maxwords -- maximum number of words to keep (default 25).

    Returns the input with whitespace runs collapsed to single spaces when it
    contains fewer than maxwords words, otherwise the shortened markup.

    Raises ValueError when a closing tag has no matching opening tag or does
    not match the innermost open tag.
    '''
    if html is None:
        return ''

    # Original text of every tag, indexed by its placeholder number.
    taglist = []

    def stash(match, template):
        # Remember the matched tag and return its numbered placeholder.
        taglist.append(match.group())
        return template % (len(taglist) - 1)

    def tagopen_sub(match):
        # The "opening tag" pattern also matches '<br />'-style tags; give
        # those a self-closing placeholder.
        if _selfclosere.search(match.group()):
            return stash(match, '<%d/>')
        return stash(match, '<%d>')

    def tagclose_sub(match):
        return stash(match, '</%d>')

    def tagself_sub(match):
        return stash(match, '<%d/>')

    # preprocess text, fill taglist
    # NOTE(review): as written this replaces a space with a space (a no-op);
    # presumably the first argument was meant to be a non-breaking space or
    # its entity -- confirm against the original recipe.
    tagged = html.replace(' ', ' ')
    tagged = _tagopenre.sub(tagopen_sub, tagged)
    tagged = _tagclosere.sub(tagclose_sub, tagged)
    tagged = _tagselfre.sub(tagself_sub, tagged)
    # pad the placeholders with spaces so split() isolates them from words
    tagged = _tagre.sub(r' <\1> ', tagged)

    pieces = []             # output fragments accepted so far
    words = 0
    tagstack = []           # opening tags that have not been closed yet
    addspace = False        # True when the previous token was a word
    add_dots = False        # True when the text was cut outside a table row
    open_rows = 0           # number of currently open (nested) table rows
    was_table_row = False   # True when the last closed tag was a table row

    for elem in tagged.split():
        # end condition: enough words and not in the middle of a table row
        if words >= maxwords and not open_rows:
            # no dots right after a completed table row
            add_dots = not was_table_row
            break

        if not elem.startswith('<'):
            # an usual word
            words += 1
            if addspace:
                elem = ' ' + elem
            pieces.append(elem)
            addspace = True
        elif elem.endswith('/>'):
            # a self-closing tag
            pieces.append(taglist[int(elem[1:-2])])
            addspace = False
        elif elem.startswith('</'):
            # a closing tag
            addspace = False
            if not tagstack:
                raise ValueError('tag not opened: %s'%elem)
            top = _opening_tag_name(tagstack[-1])
            fromlist = taglist[int(elem[2:-1])]
            tag = fromlist[fromlist.find('/')+1:fromlist.find('>')]
            if top != tag:
                raise ValueError('tag not closed properly: %s, got %s'%(top, tag))
            # special case: tables
            # some other tags could use special-casing, like dt/dd
            if top == 'tr':
                open_rows = max(open_rows - 1, 0)
                was_table_row = True
            else:
                was_table_row = False
            tagstack.pop()
            pieces.append(fromlist)
        else:
            # an opening tag
            tag = taglist[int(elem[1:-1])]
            # compare the whole element name, so that tags merely starting
            # with 'tr' are not counted as table rows
            if _opening_tag_name(tag) == 'tr':
                open_rows += 1
            # comment tags need not be closed
            if not tag.strip().startswith('<!'):
                tagstack.append(tag)
            pieces.append(tag)
            addspace = False

    if words < maxwords:
        # normalize whitespace (actually not needed... makes the tests pass, though)
        return _wsre.sub(' ', html)

    if add_dots:
        # take care so no monstrousities like '......' appear at the end of
        # output; punctuation is only touched when dots really are appended
        if pieces:
            pieces[-1] = re.sub(r'[\.,:?!]+$', '', pieces[-1])
        pieces.append('...')

    # close remaining open tags, innermost first
    closers = ['</%s>' % _opening_tag_name(x) for x in tagstack[::-1]]

    # normalize whitespace...
    return _wsre.sub(' ', ''.join(pieces + closers))
|
This recipe was created to summarize HTML articles written in TinyMCE embedded in a Django website. The approach taken is probably suboptimal, but it didn't require an advanced, stateful lexer.
Known issues:
- There's a special case for tables, so rows don't end abruptly.
- Comments are not handled correctly in all cases - whitespace before and/or after a comment may not be retained. Empty tags may also have whitespace issues.
This horrible expression, which appears 3 times:
match.string[match.start():match.end()]
should be replaced with just match.group()