把html转化为纯文本
#coding:utf-8
import HTMLParser
# Short alias for the HTMLParser base class; MyHtmlparser below subclasses it
# and calls its __init__ through this name.
html=HTMLParser.HTMLParser
class MyHtmlparser(html):
    """Collect the text chunks and start tags seen while parsing HTML.

    After ``feed()`` has been called:

    * ``lidata`` is a list of every text chunk passed to ``handle_data``,
      in document order.
    * ``dic`` maps a tag name to the attribute list of the most recent
      start tag with that name (earlier occurrences are overwritten).
    """

    def __init__(self):
        # Explicit base-class call rather than super(), so this also works
        # when the base parser is an old-style class.
        html.__init__(self)
        self.lidata = []
        # Records each tag encountered together with its attributes.
        self.dic = {}

    def handle_data(self, data):
        """Append a chunk of text found between tags to ``lidata``."""
        self.lidata.append(data)

    def handle_starttag(self, tag, attrs):
        """Store ``attrs`` under ``tag``; a repeated tag replaces the earlier entry."""
        self.dic[tag] = attrs

    def handle_endtag(self, tag):
        """End tags are deliberately ignored."""
        pass
# Script section: read one HTML file, parse it, then print the collected
# text chunks followed by the tag names that were recorded.
mydir = "c://html//"
# The with-block closes the file even if read() raises.
with open(mydir + "1.htm") as f:
    in_data = f.read()
my = MyHtmlparser()
my.feed(in_data)
# Text chunks, in the order the parser reported them.
for i in my.lidata:
    print(i)
# Tag names seen in start tags (the keys of my.dic).
for i in my.dic:
    print(i)