comixGetter « Python recipes « ActiveState Code

Download daily comics from comics.com and ucomics.com/gocomics.com, e.g. Peanuts, Dilbert, Calvin & Hobbes, etc.

'''
comixGetter.py - a Python script to download daily comics from ucomics.com and comics.com
author: sami
date: mar 07
todo: a. check if file already exists, b. save with correct file extension
'''

import httplib, re

# Pattern applied to the HTTP "date" response header (e.g. "Wed, 07 Mar 2007 ...").
# Groups: 1 = day of month, 2 = month abbreviation (a key of `dates`), 3 = year.
# Raw strings keep the regex escapes (\s, \d, \/, \.) out of Python's own
# string-escape processing; the resulting pattern text is unchanged.
re_date = r'[a-zA-Z]+,\s+(\d+)\s+([a-zA-Z]+)\s+(\d+).+'

# Month abbreviation -> two-digit month number, used to build ucomics image names.
dates = {'Jan':'01', 'Feb':'02', 'Mar':'03', 'Apr':'04', 'May':'05', 'Jun':'06',
         'Jul':'07', 'Aug':'08', 'Sep':'09', 'Oct':'10', 'Nov':'11', 'Dec':'12'}

# One entry per comics.com strip:
#   (comic name, URL path prefix of the comic's page,
#    regex whose single group captures the image path found in that page)
comics_dot_com_info = (
              ('peanuts', '/comics/', r'.+(\/comics\/peanuts\/archive\/images\/peanuts\d+\.gif|\/comics\/peanuts\/archive\/images\/peanuts\d+\.jpg).+'),
              ('dilbert', '/comics/', r'.+(\/comics\/dilbert\/archive\/images\/dilbert\d+\.gif|\/comics\/dilbert\/archive\/images\/dilbert\d+\.jpg).+'),
              ('bignate', '/comics/', r'.+(\/comics\/bignate\/archive\/images\/bignate\d+\.gif|\/comics\/bignate\/archive\/images\/bignate\d+\.jpg).+'),
              ('drabble', '/comics/', r'.+(\/comics\/drabble\/archive\/images\/drabble\d+\.gif|\/comics\/drabble\/archive\/images\/drabble\d+\.jpg).+'),
              ('franknernest', '/comics/', r'.+(\/comics\/franknernest\/archive\/images\/franknernest\d+\.jpg).+'),
              ('monty', '/comics/', r'.+(\/comics\/monty\/archive\/images\/monty\d+\.gif|\/comics\/monty\/archive\/images\/monty\d+\.jpg).+'),
              ('wizardofid', '/creators/', r'.+(\/creators\/wizardofid\/archive\/images\/wizardofid\d+\.gif|\/creators\/wizardofid\/archive\/images\/wizardofid\d+\.jpg).+'),
             )

# One entry per ucomics.com strip:
#   (name used for the saved file, image directory path, short code used in the image file name)
ucomics_dot_com_info = (
              ('doonesbury', '/comics/db/', 'db'),
              ('calvin_and_hobbes', '/comics/ch/', 'ch')
             )

#===============================
#download_others - random comic downloads
#
#===============================
def download_others():
    """Download today's Archie strip from www.archiecomics.com.

    The day of the month is read from the "date" header of the server's
    response to ``GET /`` and used to build the image path
    ``/pops_shop/dailycomics/image<day>.gif``.  On success the image is
    written to ``archie_<day>_<month>_<year>.gif`` in the current directory.

    Progress is printed to stdout and nothing is returned.  When the date
    header cannot be parsed, or the image request does not answer with
    status 200, a message is printed and no file is written.
    """
    print("getting: Archie")

    # First request is only needed for its "date" header.
    cnxn = connect('www.archiecomics.com')
    try:
        cnxn.request("GET", '/')
        res = cnxn.getresponse()
        date_header = res.getheader("date")
    finally:
        cnxn.close()

    # getheader() returns None when the header is absent; match against ''
    # instead so the failure is reported rather than raised.
    p = re.match(re_date, date_header or '')
    if p is None:
        print('continuing to next comic since i got no usable date header')
        return

    day, month, year = p.group(1), p.group(2), p.group(3)
    comic_path = '/pops_shop/dailycomics/image' + day + '.gif'

    #reconnecting since archiecomics.com closes connection after sending response
    cnxn = connect('www.archiecomics.com')
    try:
        cnxn.request("GET", comic_path)
        res = cnxn.getresponse()
        igot = res.status, res.reason

        # res.status is an integer, so it must be compared with 200, not "200".
        if res.status != 200:
            print('continuing to next comic since i got: ')
            print(igot)
            return

        image = res.read()
    finally:
        cnxn.close()

    f = open('archie' + "_" + day + "_" + month + "_" + year + ".gif", "wb")
    try:
        f.write(image)
    finally:
        f.close()

    print("OK")

#===============================
#connect(server) - connect to server
#
#===============================             
def connect(server):
    """Return an httplib.HTTPConnection to *server* on port 80.

    connect() is called on the connection before it is returned, so the
    caller receives an already-opened connection.
    """
    connection = httplib.HTTPConnection(server, 80)
    connection.connect()
    return connection

#===============================
#download_ucomics_dot_com - download comics from ucomics/gocomics servers
#
#===============================
def download_ucomics_dot_com():
    """Download today's strips listed in ``ucomics_dot_com_info``.

    The date is read from the "date" header of images.ucomics.com's response
    to ``GET /``.  For each entry ``(name, directory, code)`` the image path
    ``<directory><year>/<code><yy><mm><dd>.gif`` is requested and, on status
    200, saved as ``<name>_<day>_<month>_<year>.gif`` in the current
    directory.

    Progress is printed to stdout and nothing is returned.  A comic whose
    request does not answer with status 200 is reported and skipped; if the
    date header cannot be parsed nothing is downloaded.
    """
    # First request is only needed for its "date" header.
    cnxn = connect('images.ucomics.com')
    try:
        cnxn.request("GET", '/')
        res = cnxn.getresponse()
        date_header = res.getheader("date")
    finally:
        cnxn.close()

    # getheader() returns None when the header is absent; match against ''
    # instead so the failure is reported rather than raised.
    p = re.match(re_date, date_header or '')
    if p is None:
        print('cannot download ucomics comics since i got no usable date header')
        return

    day, month, year = p.group(1), p.group(2), p.group(3)

    # Same for every comic: two-digit year + month number + day.
    stamp = year[2:4] + dates[month] + day

    for entry in ucomics_dot_com_info:
        name, directory, code = entry

        print("getting: " + name)

        comic_path = directory + year + '/' + code + stamp + '.gif'

        #for ucomics.com, we need to reconnect everytime, server closes connection after sending a response
        cnxn = connect('images.ucomics.com')
        try:
            cnxn.request("GET", comic_path)
            res = cnxn.getresponse()
            igot = res.status, res.reason

            # res.status is an integer, so it must be compared with 200, not "200".
            if res.status != 200:
                print('continuing to next comic since i got: ')
                print(igot)
                continue

            image = res.read()
        finally:
            # Runs on the `continue` path too, so no connection is left open.
            cnxn.close()

        f = open(name + "_" + day + "_" + month + "_" + year + ".gif", "wb")
        try:
            f.write(image)
        finally:
            f.close()

        print("OK")

#===============================
#download_comics_dot_com - download comics from comics.com servers
#
#===============================
def download_comics_dot_com():
    """Download today's strips listed in ``comics_dot_com_info``.

    For each entry ``(name, prefix, pattern)`` the page ``<prefix><name>/``
    is fetched from comics.com, the first image path matching ``pattern`` is
    extracted from it and requested, and the image is saved as
    ``<name>_<day>_<month>_<year>.gif`` in the current directory.  The date
    parts come from the page response's "date" header.

    Progress is printed to stdout and nothing is returned.  A comic is
    reported and skipped when either request does not answer with status
    200, when the date header cannot be parsed, or when no image path is
    found in the page.
    """
    #Calling connect to url directly since for comics.com, the server does not close the connection itself after
    #sending a response
    cnxn = connect('comics.com')
    try:
        for entry in comics_dot_com_info:
            name, prefix, pattern = entry

            print("getting: " + name)

            cnxn.request("GET", prefix + name + '/')
            res = cnxn.getresponse()
            igot = res.status, res.reason
            date_header = res.getheader("date")

            # Always read the body, even for a response we are about to
            # skip: the connection is reused, and httplib will not send the
            # next request until the previous response has been read.
            page = res.read()

            # res.status is an integer, so it must be compared with 200, not "200".
            if res.status != 200:
                print('continuing to next comic since i got: ')
                print(igot)
                continue

            # getheader() returns None when the header is absent.
            p = re.match(re_date, date_header or '')
            if p is None:
                print('continuing to next comic since i got no usable date header')
                continue

            found = re.findall(pattern, page)
            if not found:
                print('continuing to next comic since i found no image link in the page')
                continue

            cnxn.request("GET", found[0])
            res = cnxn.getresponse()
            igot = res.status, res.reason
            image = res.read()

            # Do not save an error page under an image file name.
            if res.status != 200:
                print('continuing to next comic since i got: ')
                print(igot)
                continue

            f = open(name + "_" + p.group(1) + "_" + p.group(2) + "_" + p.group(3) + ".gif", "wb")
            try:
                f.write(image)
            finally:
                f.close()

            print("OK")
    finally:
        cnxn.close()

#===============================
#execution
#
#===============================
# Run every downloader in turn: comics.com first, then ucomics.com, then the
# remaining one-off sites.
for fetch in (download_comics_dot_com, download_ucomics_dot_com, download_others):
    fetch()

      

Why use this: Instead of going to each website and reading comics one by one, use this script to download today's comics. To add a comic of your own, add an entry for it to "comics_dot_com_info" (name, path prefix and the regular expression for its image) or to "ucomics_dot_com_info" (name, image directory and short code).

Known issues: Always saves files with a .gif extension. Will not work if the server changes its comic image paths, because the regular expressions will no longer match. Does not check whether a file already exists.

Tags: web

◄	Python recipes (4591)	►
◄	sami jan's recipes (2)	►

comixGetter (Python recipe) by sami jan
ActiveState Code (http://code.activestate.com/recipes/522983/)

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

comixGetter (Python recipe) by sami jan ActiveState Code (http://code.activestate.com/recipes/522983/)