# Recipe 303227 revision 2 « ActiveState Code

import email
import email.FeedParser
import re
import sys
import sgmllib

# Fraction of the text that must consist of binary-looking
# characters before we guess that it's a binary part. Threshold
# picked almost at random.
kGuessBinaryThreshold=0.2

# Characters counted as binary-looking: control characters up to
# 0x15 and everything from 0x80 to 0xFF. Tab, LF and CR are left
# out on purpose, since they occur in any ordinary text and would
# otherwise make short-lined plain text look binary.
#
# Hex escapes are used because `re` reads at most three octal
# digits after a backslash: a four-digit form such as "\0000" is
# taken as NUL followed by a literal "0", which turns
# "\0000-\0025" into the reversed range "0"-"\002" and makes the
# pattern fail to compile.
kGuessBinaryRE=re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x15\x80-\xff]")

# How much of the text must be HTML tags before we guess
# that it's HTML. Threshold picked almost at random.
kGuessHTMLThreshold=0.05


# For stripping HTML tags. Very slightly modified from
# Alex Martelli's news post <9cpm4202cv1@news1.newsguy.com>
# of May 2, 2001, Subject: Stripping HTML tags from a string

class Cleaner(sgmllib.SGMLParser):
  """SGML parser that discards markup and collects the text in between.

  Feed it markup with feed() (and close() when done), then read the
  accumulated plain text with cleaned_text(). <p> and <br> tags are
  turned into newlines; every other tag is dropped.
  """

  # Extend the parser's built-in entity table instead of replacing it,
  # so the standard entities (&amp;, &lt;, &gt;, &quot;, &apos;) are still
  # translated rather than being treated as unknown and lost.
  entitydefs=dict(sgmllib.SGMLParser.entitydefs, nbsp=" ") # I'll break if I want to

  def __init__(self):
    sgmllib.SGMLParser.__init__(self)
    # Text fragments in document order; joined by cleaned_text().
    self.result = []

  def do_p(self, *junk):
    """Handle <p>: emit a line break (tag attributes are ignored)."""
    self.result.append('\n')

  def do_br(self, *junk):
    """Handle <br>: emit a line break (tag attributes are ignored)."""
    self.result.append('\n')

  def handle_data(self, data):
    """Collect a run of character data found between tags."""
    self.result.append(data)

  def cleaned_text(self):
    """Return all text collected so far as a single string."""
    return ''.join(self.result)

def stripHTML(text):
  """Return text with its HTML/SGML markup removed.

  The text is run through a Cleaner parser. If the parser rejects the
  input with SGMLParseError, the original text is returned unchanged.
  """
  c=Cleaner()
  try:
    c.feed(text)
    # Flush whatever the parser is still buffering (for instance the
    # tail after an unterminated "<tag" at the end of the input);
    # without close() that text would silently go missing.
    c.close()
  except sgmllib.SGMLParseError:
    return text
  return c.cleaned_text()


def guessIsBinary(text):
  """Guess whether text is binary data rather than readable text.

  Returns True when the share of characters matched by kGuessBinaryRE
  reaches kGuessBinaryThreshold. Empty input is never binary.
  """
  totalChars=len(text)
  if totalChars==0:
    return False
  suspectChars=sum(1 for _ in kGuessBinaryRE.finditer(text))
  return float(suspectChars)/totalChars>=kGuessBinaryThreshold

# This does some relatively expensive parsing to
# try to figure out if the text is HTML. In cases
# in which it's used often, a simple regular
# expression would be faster and might be
# sufficiently accurate.
def guessIsHTML(text):
  """Guess whether text is HTML.

  The text is stripped of markup with stripHTML(); the number of
  characters that disappear is taken as the amount of markup. Returns
  True when that markup makes up at least kGuessHTMLThreshold of the
  whole text. Empty input, and input from which nothing is stripped,
  is not HTML.
  """
  lt=len(text)
  if lt==0:
    return False
  tagsChars=lt-len(stripHTML(text))
  if tagsChars<=0:
    return False
  # Share of the text that was markup. The ratio must be markup/total:
  # the inverse (total/markup) is always >= 1 and would classify any
  # text containing a single tag as HTML.
  return float(tagsChars)/lt>=kGuessHTMLThreshold

def getMungedMessage(openFile):
  """Parse an e-mail message from openFile and repair a bogus multipart type.

  The file is rewound and read in full. If the message declares a
  multipart content type but was not actually parsed as multipart, its
  content type is replaced by a guess based on the decoded payload:
  application/octet-stream, text/html or text/plain.

  Returns the (possibly modified) message object.
  """
  openFile.seek(0)
  parser=email.FeedParser.FeedParser()
  parser.feed(openFile.read())
  message=parser.close()

  # Nothing to fix unless the header claims multipart while the body
  # did not parse as such.
  if message.get_content_maintype()!="multipart" or message.is_multipart():
    return message

  payload=message.get_payload(decode=1)
  if guessIsBinary(payload):
    # Use generic "opaque" type
    guessedType="application/octet-stream"
  elif guessIsHTML(payload):
    guessedType="text/html"
  else:
    guessedType="text/plain"
  message.set_type(guessedType)

  return message
# Recipe 303227 revision 2
#
# History
#
# Accounts
#
# Code Recipes
#
# Feedback & Information
#
# ActiveState