This recipe shows how to build a generic logger for the resources requested by web pages, using PyGtkWebkit. The demo code implements a generic downloader for flash-video sites.
#!/usr/bin/python
"""
Log resources requested by a webpage using WebKit.
Originally designed to download video files requested by Adobe Flash videos.
"""
import sys
import re
# Third-party modules
import webkit
import gtk
# Supported sites (key: URL regexp, value: video URL regexp).
# Raw strings keep the regexp backslashes literal: "\." inside a plain string
# literal is an invalid escape sequence that newer interpreters warn about.
SITES = {
    r"youtube\.com/": r"youtube\.com/videoplayback",
    r"blip\.tv/": r"blip\.tv/file/get/",
}
def debug(line):
    """Emit *line* on standard error, prefixed with a ``--- `` marker."""
    message = "--- %s\n" % line
    sys.stderr.write(message)
def first(it):
    """Return the first element of an iterable (None if it is empty).

    Accepts any iterable, not only iterators: ``iter()`` returns an iterator
    unchanged, so passing a generator or other iterator behaves as before and
    still consumes one element from it.
    """
    return next(iter(it), None)
def on_request(view, frame, resource, request, response,
               resource_regexp, skip_regexp=None):
    """Inspect a request reported by the "resource-request-starting" signal.

    The handler logs every request it lets through and, when the URL matches
    resource_regexp, writes that URL to standard output and stops the GTK
    main loop.

    view, frame, resource, response -- signal arguments; unused here.
    request -- request object; its URI and "message" property are read, and
        its URI is replaced with "about:blank" to cancel a skipped request.
    resource_regexp -- regexp (as accepted by re.search) identifying the
        wanted resource; a false value disables the match check.
    skip_regexp -- optional compiled regexp; requests whose URL matches it
        are cancelled without being logged.
    """
    url = request.get_uri()
    message = request.get_property("message")
    if not message:
        # Nothing to inspect for requests that carry no message object.
        return
    if skip_regexp and skip_regexp.search(url):
        # Cancel the request by pointing it at a blank page.
        request.set_uri("about:blank")
        return
    # The method is only needed for the log line, so look it up after the
    # early returns above.
    method = message.get_property("method")
    debug("request: %s %s" % (method, url))
    if resource_regexp and re.search(resource_regexp, url):
        debug("videofile match: %s" % url)
        # Written with sys.stdout.write (like debug() does for stderr) rather
        # than the Python-2-only print statement.
        sys.stdout.write("%s\n" % url)
        gtk.main_quit()
def create_webview():
    """Build a gtk.Window that holds a WebKit webview inside a scrolled pane.

    Returns the (window, webview) pair; the window is not shown here.
    """
    webview = webkit.WebView()
    toplevel = gtk.Window()
    pane = gtk.ScrolledWindow()
    # Nest the widgets: window -> scrolled pane -> webview.
    pane.add(webview)
    toplevel.add(pane)
    return toplevel, webview
def main(args):
    """Load the URL given in args and log the resources the page requests.

    args -- command-line arguments without the program name; besides the
        options they must contain exactly one URL.

    Returns 1 when no SITES entry matches the URL and test mode is off.
    Otherwise runs the GTK main loop and returns None once it ends.
    Exits through parser.error() (SystemExit, status 2) when the number of
    URL arguments is not exactly one.
    """
    import optparse

    # __doc__ is None when docstrings are stripped (python -OO); fall back to
    # an empty description instead of failing on None.strip().
    usage = "usage: %%prog [Options] URL\n\n%s" % (__doc__ or "").strip()
    parser = optparse.OptionParser(usage)
    parser.add_option('-t', '--test', dest='test', action="store_true",
                      default=False, help="Run in test mode (show webview)")
    options, args0 = parser.parse_args(args)
    if len(args0) != 1:
        # Report a usage error rather than an unpacking traceback.
        parser.error("expected exactly one URL argument, got %d" % len(args0))
    url = args0[0]

    # Pick the resource regexp of the first supported site matching the URL.
    resource_regexp = first(pattern for (urlre, pattern) in SITES.items()
                            if re.search(urlre, url))
    if not resource_regexp and not options.test:
        debug("No module found for URL: %s" % url)
        return 1

    window, webview = create_webview()
    # Requests for images and stylesheets are cancelled by the handler.
    skip_regexp = re.compile(r"\.(jpg|png|gif|css)(\?|$)", re.I)
    webview.connect("resource-request-starting", on_request,
                    resource_regexp, skip_regexp)
    webview.load_uri(url)
    if options.test:
        # The window is only displayed in test mode.
        window.resize(640, 480)
        window.show_all()
    gtk.main()
# Script entry point: hand main()'s return value to sys.exit as the status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
|
Resource downloaders (in this example the reference would be http://code.google.com/p/get-flash-videos/) are difficult to develop and maintain for some websites, as they must replicate the site's HTTP requests (and follow any change to them), and maybe deal with JavaScript. The alternative approach shown here, based on programmatic browsers, does indeed have many more dependencies and uses plenty of resources, but the code is usually straightforward. Here we could practically add support for a new website in a matter of minutes (we would just need the regular expression that matches the video file), no matter how complicated or obfuscated the webpage code is.
Note that if the webpage requires any user action to download the resource it may get complicated. While interacting with HTML elements in a page (forms, buttons) is very easy, firing events inside Flash objects requires extra work.
nice - I made something similar using qt-webkit and will borrow some of your ideas.
Richard, I maintain a Python wrapper around QtWebKit, you might find something interesting there: http://code.google.com/p/spynner/