Grab a part of a web page and generate a new page with a base href pointing to the source server, so that relative links in the extracted fragment still work.
#Python Version 2.1
#
#
#we need the following module
import httplib
# request the page
def GetUrl(ServerAdr, PagePath):
    http = httplib.HTTP(ServerAdr)
    http.putrequest('GET', PagePath)
    http.putheader('Accept', 'text/html')
    http.putheader('Accept', 'text/plain')
    http.endheaders()
    httpcode, httpmsg, headers = http.getreply()
    if httpcode != 200:
        raise "Could not get document: Check URL and Path."
    doc = http.getfile()
    data = doc.read()  # read file
    doc.close()
    return data
#parse the page and return the part between the start and end token
def ExtractData(in_string, start_line, end_line):
    lstr = in_string.splitlines()  # split into lines
    j = 0                          # line counter
    for i in lstr:
        j = j + 1
        if i.strip() == start_line:
            slice_start = j        # first line after the start token
        elif i.strip() == end_line:
            slice_end = j - 1      # last line before the end token
    return lstr[slice_start:slice_end]  # return the lines in between
#handle the returned stuff and generate a new page
def main():
    # parameters and constants
    ServerAdr='www.heise.de'
    PagePath='/'
    StartLine='<!-- MITTE (NEWS) -->'
    EndLine='<!-- MITTE (NEWS-UEBERBLICK) -->'
    Head1='<html><head><base href="http://'
    Head2='"></head><body>'
    Foot='</body></html>'
    # call functions
    RawData=GetUrl(ServerAdr, PagePath)
    v=ExtractData(RawData, StartLine, EndLine)
    #
    # construct the result page and print it
    print Head1.strip()+ServerAdr.strip()+Head2.strip()
    for i in v:
        print i.strip()
    print Foot.strip()
#call main function
main()
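Since httplib and the print statement no longer exist in current Python, here is a minimal sketch of the same idea for Python 3. It is a modernization, not part of the original hack: it assumes the standard library's urllib.request and guesses Latin-1 as the page encoding; host, path, and marker comments are the ones configured above.

#Python 3 sketch (assumptions: urllib.request, Latin-1 page encoding)
from urllib.request import urlopen

def get_url(server_adr, page_path):
    # fetch the page; urlopen raises an HTTPError on non-200 replies
    with urlopen('http://' + server_adr + page_path) as doc:
        # the encoding is an assumption; a real script should take the
        # charset from the Content-Type header instead
        return doc.read().decode('iso-8859-1', 'replace')

def extract_data(in_string, start_line, end_line):
    # return the lines strictly between the two marker lines
    lstr = [line.strip() for line in in_string.splitlines()]
    slice_start = lstr.index(start_line) + 1
    slice_end = lstr.index(end_line)
    return lstr[slice_start:slice_end]

def main():
    server_adr = 'www.heise.de'
    raw = get_url(server_adr, '/')
    v = extract_data(raw, '<!-- MITTE (NEWS) -->',
                     '<!-- MITTE (NEWS-UEBERBLICK) -->')
    print('<html><head><base href="http://%s"></head><body>' % server_adr)
    print('\n'.join(v))
    print('</body></html>')

if __name__ == '__main__':
    main()

Either version writes the page to standard output, so redirect it to a file (for example python grab.py > news.html, where grab.py is whatever you named the script) to get something you can open in a browser.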
This was only a quick test hack, so there is no real error handling, the parameters are hard-coded in the main function, and the result is written to standard output - but it might help somebody.
Tags: web