#!/usr/bin/env python
""" bier-soup.py: html tables -> text, a small example of BeautifulSoup
in: url or file like those in www.bier1.de
out:
1 Adldorf Graf Arco Pils Graf ?
...
"""
# Schierlinger Pils: Think global, drink regional
# (a general html table <-> python table converter would be nice, and tough; see
# http://stackoverflow.com/questions/796490/python-method-to-extract-content-excluding-navigation-from-an-html-page
import re
from BeautifulSoup import BeautifulSoup
html = "http://www.bier1.de/L%E4nder%20gesamt/Bayern%20(Bavaria).htm"
# html = "Bier.html"
__date__ = "16jul 2009"
__author_email__ = "denis-bz-py@t-online.de"
Test = 1
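    # Test >= 2: trow_cols() also prints each table row's html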
#...............................................................................
def leaf( tag ):
    """ get the first non-blank text inside a tag, e.g. <td> ... </td> """
    # example <td> contents:
    #   + + + + +
    #   Bierfeuerwerk
    for text in tag.findAll( text=True ):
        text = text.strip()
        if text: return text
    return ""
def trow_cols( trow, td="td" ):
    """ soup.table.tr -> leaf strings
    """
    if Test >= 2:
        print "test trow_cols:", trow.prettify()
    cols = []
    for col in trow( td ):
        text = leaf( col )
        text = re.sub( r"\s\s+", " ", text.strip() )
        cols.append( text )
    return cols
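# roughly: trow_cols( tr ) on a 4-column bier1.de row gives
#   [name, plus-string, "BY/stadt", kommentar]  with runs of whitespace squeezed to single blanks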
def plus_num( s ):
    """ + + + + - -> 5- """
    s = re.sub( r"\s+", "", s )
    if not s:
        return "- "
    if s[0] != "+":
        return s
    return "%d%s" % (len(s), "-" if s[-1] == "-" else " ")
def biertext( row ):
    name, plus, stadt, kommentar = trow_cols( row )
        # [u'Kuchlbauer Helles Bier', u'+ + + + -', u'BY/Abensberg', u'Ein Schippe ...
    stadt = re.sub( "^BY/", "", stadt )
    note = plus_num( plus )
    return "%s %s \t%s \t%s" % (note, stadt, name, kommentar)
#...............................................................................
if __name__ == "__main__":
    import codecs
    import sys
    import urllib2
    try:
        import bz.util as ut
        print ut.From()
        ut.ptime()
        jarg = ut.scan_eq_args( globals() )  # Test= ...
    except ImportError:
        ut = None
        jarg = 1
    if sys.argv[jarg:]:
        html = sys.argv[jarg]
    sys.stdout = codecs.getwriter("utf-8")(sys.__stdout__)

    if html.startswith( "http:" ):
        htmltext = urllib2.urlopen( html ).read()
    else:
        htmltext = open( html ).read()  # ut.openplus
    soup = BeautifulSoup( htmltext, convertEntities="html" )
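    # convertEntities="html" asks BeautifulSoup 3 to turn entities such as &auml; into unicode characters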
    if ut:
        ut.ptime( "BeautifulSoup read %d bytes" % len(htmltext) )
        # 1m Bier.html ~ 20 sec mac g4 ppc

    table = soup.findAll( "table" )[1]  # skip table[0]
    rows = table( "tr" )
        # row 0 usually has table headers, but not bier1.de
    th = trow_cols( rows[0], "th" ) or trow_cols( rows[0] )
    print "# table headers:", th
    for row in rows[1:]:
        print biertext( row )
# | sort -k1rn
# end bier-soup.py