Welcome, guest | Sign In | My Account | Store | Cart

bier-soup.py reads html tables like those in http://www.bier1.de and writes plain text files, as a small example of BeautifulSoup

Python, 101 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
#!/usr/bin/env python
""" bier-soup.py: html tables -> text, a small example of BeautifulSoup
in: url or file like those in www.bier1.de
out:
    1   Adldorf     Graf Arco Pils      Graf ?
    ...
"""
# Schierlinger Pils: Think global, drink regional
# (A general html table <-> py table converter would be nice, and tough; see
# http://stackoverflow.com/questions/796490/python-method-to-extract-content-excluding-navigation-from-an-html-page )


import re
from BeautifulSoup import BeautifulSoup
from collections import defaultdict

html = "http://www.bier1.de/L%E4nder%20gesamt/Bayern%20(Bavaria).htm"
# html = "Bier.html"
__date__ = "16jul 2009"
__author_email__ = "denis-bz-py@t-online.de"
Test = 1


#...............................................................................
def leaf( tag ):
    """ first non-blank text inside <tag><tag>... text, stripped

    Walks the strings given by tag.findAll( text=True ) in order and
    returns the first one that is not empty after .strip();
    returns "" if every string is blank or there are none.
    """
    # typical cells this has to dig through:
    # <td>  <div><font size="2"><i><font color="#FFFFFF"> 
    #   + + + + +</font></i></font></div></td>
    # <td height="10"><font size="2"> Bierfeuerwerk <strong></strong> </font></td>

    stripped = (piece.strip() for piece in tag.findAll( text=True ))
    nonblank = (piece for piece in stripped if piece)
    return next( nonblank, "" )

def trow_cols( trow, td="td" ):
    """ soup.table.tr -> <td> leaf strings
    """
    if Test >= 2:
        print "test trow_cols:", trow.prettify()
    cols = []
    for col in trow( td ):
        text = leaf( col )
        text = re.sub( r"\s\s+", " ",  text.strip() )
        cols.append( text )
    return cols

def plus_num( s ):
    """ + + + + -  ->  5-

    Whitespace is removed first, then:
        nothing left            -> "- "
        doesn't start with "+"  -> the squeezed string, unchanged
        otherwise               -> number of marks (a trailing "-" counts too),
                                   followed by "-" if the last mark is "-",
                                   else by a space
    """
    marks = re.sub( r"\s+", "", s )
    if marks == "":
        return "- "
    if not marks.startswith( "+" ):
        return marks
    suffix = "-" if marks.endswith( "-" ) else " "
    return str( len( marks )) + suffix

def biertext( row ):
    """ one table row -> one line of text: "note  town \tname \tcomment"

    The row must have exactly 4 cells: name, plus marks, town, comment, e.g.
    [u'Kuchlbauer Helles Bier', u'+ + + + -', u'BY/Abensberg', u'Ein Schippe ...
    (any other cell count raises ValueError from the unpacking).
    A leading "BY/" is dropped from the town, and the plus marks are
    shortened by plus_num().
    """
    name, rating, town, comment = trow_cols( row )
    fields = (plus_num( rating ), re.sub( "^BY/", "", town ), name, comment)
    return "%s  %s \t%s \t%s" % fields


#...............................................................................
if __name__ == "__main__":
    import codecs
    import sys
    import urllib2
    try:
        import bz.util as ut
        print ut.From()
        ut.ptime()
        jarg = ut.scan_eq_args( globals() )  # Test= ...
    except ImportError:
        ut = None
        jarg = 1
    if sys.argv[jarg:]:
        html = sys.argv[jarg]
    sys.stdout = codecs.getwriter("utf-8")(sys.__stdout__)

    if html.startswith( "http:" ):
        htmltext = urllib2.urlopen( html ) .read()
    else:
        htmltext = open( html ) .read()  # ut.openplus

    soup = BeautifulSoup( htmltext, convertEntities="html" )
    if ut:
        ut.ptime( "BeautifulSoup read %d bytes" % len(htmltext) )
            # 1m Bier.html ~ 20 sec mac g4 ppc

    table = soup.findAll( "table" )[1]  # skip table[0]
    rows = table( "tr" )
        # row 0 usually has <th> table headers, but not bier1.de
    th = trow_cols( rows[0], "th" ) or trow_cols( rows[0] )
    print "# table headers:", th
    for row in rows[1:]:
        print biertext( row )
    # | sort -k1rn

# end bier-soup.py

1 comment

Daniel Lepage 14 years, 8 months ago  # | flag

What's bz.util?