#!/usr/bin/env python
"""
Present new heise newsticker articles one by one and offer their mp3 audio.

The script fetches the heise RSS feed, compares the article links with the
links remembered from the previous run, prints the title of every new article
and asks whether the corresponding mp3 file should be downloaded.  Accepted
downloads are fetched in the background by a pool of worker threads.

Written for Python 2: it relies on ``Queue``, ``raw_input`` and
``urllib.urlretrieve``.
"""

import os
import sys
import threading
import Queue
from urllib import urlretrieve

import feedparser

# Number of worker threads downloading in parallel.
n_threads = 10
feed_url = "http://www.heise.de/newsticker/heise.rdf"
# Prefix of the mp3 download URL; the article id is appended to it.
left_link = "http://www.heise.de/fastbin/audio_download" \
            "?meldung=http://www.heise.de/newsticker/meldung/"
# File that remembers the article links seen in the previous run.
try:
    archive_filename = "%s/.heise" % os.environ["HOME"]
except KeyError:
    # No HOME in the environment: fall back to HOMEPATH.
    archive_filename = "%s%sheise_archive" % (os.environ["HOMEPATH"], os.sep)

# (bad, good) replacements applied to text that becomes part of a filename.
_FILENAME_REPLACEMENTS = (
    ("/", ""),
    (":", ""),
    (" ", "_"),
    ('"', ""),
    ("?", ""),
)


def _sanitize_for_filename(text):
    """Return *text* with the characters unwanted in filenames replaced."""
    for bad, good in _FILENAME_REPLACEMENTS:
        text = text.replace(bad, good)
    return text


class Downloader(threading.Thread):
    """
    Worker thread that downloads files taken from a shared queue.

    The thread is marked as daemon and starts itself on construction.

    links_filenames -- queue of ``(link, filename)`` tuples to download
    """

    def __init__(self, links_filenames):
        threading.Thread.__init__(self)
        self.setDaemon(True)
        self.links_filenames = links_filenames
        self.start()

    def run(self):
        """Download queued files forever, one ``(link, filename)`` at a time."""
        while True:
            link, filename = self.links_filenames.get()
            try:
                urlretrieve(link, filename)
            except Exception as error:
                # A failed download must not kill the worker: report it and
                # carry on with the next queue item.
                sys.stderr.write(
                    "Download of %s failed: %s\n" % (link, error))
            finally:
                # Always acknowledge the item, otherwise Queue.join() in the
                # main thread would wait forever after a failed download.
                self.links_filenames.task_done()


class Archive(object):
    """
    The current feed together with the links remembered from the last run.

    Attributes:
    feed       -- the parsed feed as returned by ``feedparser.parse``
    feed_links -- utf-8 encoded links of all entries of the feed
    entries_i  -- indices of the feed entries
    old_links  -- links read from the archive file (empty if unreadable)
    """

    def __init__(self):
        feed = feedparser.parse(feed_url)
        try:
            with open(archive_filename) as archive_file:
                self.old_links = [line.strip() for line in archive_file]
        except IOError:
            # No readable archive (e.g. first run): every entry is new.
            self.old_links = []
        self.entries_i = range(len(feed["entries"]))
        self.feed_links = [entry["link"].encode("utf-8")
                           for entry in feed["entries"]]
        self.feed = feed

    def get_new_entries(self):
        """
        Return the feed entries whose links are not in the archive.

        Returns a tuple ``(new_links, titles, article_ids)`` of three lists
        of equal length, in feed order.  Entries whose link does not contain
        "meldung/" are skipped because no article id can be derived from
        them.
        """
        known_links = set(self.old_links)
        new_links, titles, article_ids = [], [], []
        for link, entry in zip(self.feed_links, self.feed["entries"]):
            if link in known_links:
                continue
            # the article_id is in the link between "meldung/" and "/from"
            parts = link.split("meldung/")
            if len(parts) < 2:
                continue
            new_links.append(link)
            titles.append(entry["title"].encode("utf-8"))
            article_ids.append(parts[1].split("/from")[0])
        return new_links, titles, article_ids

    def store(self):
        """
        Write the links of the current feed to the archive file.

        Nothing is written when the feed has no entries, so that an empty
        feed does not wipe the links remembered so far.
        """
        if not self.feed_links:
            return
        with open(archive_filename, "w") as archive_file:
            archive_file.write("\n".join(self.feed_links))


def prepare_workers():
    """
    Start ``n_threads`` download workers sharing one queue.

    Returns one of the workers; its ``links_filenames`` attribute is the
    queue shared by all of them.
    """
    links_filenames = Queue.Queue()
    workers = [Downloader(links_filenames) for _ in range(n_threads)]
    return workers[0]


def start_download(link, title, id, downloader):
    """
    Queue the mp3 file of an article for download.

    link       -- link of the article (not used)
    title      -- title of the article, used to build the filename
    id         -- article id, appended to ``left_link`` to form the mp3 URL
    downloader -- object whose ``links_filenames`` queue takes the job

    The parameter name ``id`` shadows the builtin but is kept so that
    existing callers keep working.
    """
    # Both parts come from the feed, so both are cleaned before they end up
    # in a filename; the download URL uses the unmodified id.
    filename = "heise_%s_%s.mp3" % (_sanitize_for_filename(id),
                                    _sanitize_for_filename(title))
    mp3_link = left_link + id
    downloader.links_filenames.put((mp3_link, filename))


def main():
    """
    Ask for every new article whether its mp3 should be downloaded.

    Answers: "y" queues the download, "n" (or just Enter) skips the article
    and "c" stops asking.  Afterwards the function waits for the queued
    downloads and stores the links of the current feed.
    """
    downloader = prepare_workers()
    feed_archive = Archive()
    links, titles, ids = feed_archive.get_new_entries()
    for link, title, article_id in zip(links, titles, ids):
        answer = None
        while answer not in ("y", "n", "c"):
            print(title)
            answer = raw_input('Download mp3? (y/[n]/c)')
            if answer == "":
                answer = "n"
        if answer == "y":
            start_download(link, title, article_id, downloader)
        if answer == "c":
            break
    if links:
        print("Waiting for downloads to end...")
        downloader.links_filenames.join()
    feed_archive.store()


if __name__ == "__main__":
    main()