This little script presents new Heise news articles one at a time by title and asks whether it should download the corresponding MP3 file.
#!/usr/bin/env python
"""
This little script presents new heise-news-articles individually by title
and asks if it should download the corresponding mp3-file.
"""
import threading
import Queue
import os
import feedparser
from urllib import urlretrieve
#-----------------------------------------------------------------------------#
n_threads = 10
feed_url = "http://www.heise.de/newsticker/heise.rdf"
left_link = "http://www.heise.de/fastbin/audio_download" \
"?meldung=http://www.heise.de/newsticker/meldung/"
try:
    archive_filename = "%s/.heise" % os.environ["HOME"]
except KeyError:
    archive_filename = "%s%sheise_archive" % (os.environ["HOMEPATH"], os.sep)
#-----------------------------------------------------------------------------#
class Downloader(threading.Thread):
    """ Class for worker-threads that download files. Don't tell Marx! """
    def __init__(self, links_filenames):
        threading.Thread.__init__(self)
        # daemon threads don't keep the program alive on their own
        self.setDaemon(True)
        self.links_filenames = links_filenames
        self.start()
    #-------------------------------------------------------------------------#
    def run(self):
        while True:
            link, filename = self.links_filenames.get()
            urlretrieve(link, filename)
            self.links_filenames.task_done()
#-----------------------------------------------------------------------------#
class Archive(object):
    """ Keeps track of which feed links were already seen in earlier runs. """
    def __init__(self):
        feed = feedparser.parse(feed_url)
        try:
            archive_file = open(archive_filename)
            old_links = archive_file.readlines()
            self.old_links = [link.strip() for link in old_links]
            archive_file.close()
        except IOError:
            self.old_links = []
        self.entries_i = range(len(feed["entries"]))
        self.feed_links = [feed["entries"][entry_i]["link"].encode("utf-8")
                           for entry_i in self.entries_i]
        self.feed = feed
    #-------------------------------------------------------------------------#
    def get_new_entries(self):
        new_links = [link for link in self.feed_links
                     if link not in self.old_links]
        titles = [self.feed["entries"][entry_i]["title"].encode("utf-8")
                  for entry_i in self.entries_i
                  if self.feed["entries"][entry_i]["link"].encode("utf-8")
                  in new_links]
        # the article_id is in the link between "meldung/" and "/from"
        article_ids = [link.split("meldung/")[1].split("/from")[0]
                       for link in new_links]
        return new_links, titles, article_ids
    #-------------------------------------------------------------------------#
    def store(self):
        archive_file = open(archive_filename, "w")
        archive_file.writelines("\n".join(self.feed_links))
        archive_file.close()
#-----------------------------------------------------------------------------#
def prepare_workers():
    links_filenames = Queue.Queue()
    # all workers share the same queue; returning the first thread is enough
    # to give the caller access to that queue
    return [Downloader(links_filenames) for ii in range(n_threads)][0]
#-----------------------------------------------------------------------------#
def start_download(link, title, article_id, downloader):
    # strip or replace characters that are problematic in filenames
    for bad, good in zip(("/", ":", " ", '"', "?"), ("", "", "_", "", "")):
        title = title.replace(bad, good)
    filename = "heise_%s_%s.mp3" % (article_id, title)
    mp3_link = left_link + article_id
    downloader.links_filenames.put((mp3_link, filename))
#-----------------------------------------------------------------------------#
if __name__ == "__main__":
    downloader = prepare_workers()
    feed_archive = Archive()
    links, titles, ids = feed_archive.get_new_entries()
    for link, title, article_id in zip(links, titles, ids):
        download_yn = None
        while download_yn not in ("y", "n", "c"):
            print title
            download_yn = raw_input('Download mp3? (y/[n]/c)')
            if download_yn == "":
                download_yn = "n"
        if download_yn == "y":
            start_download(link, title, article_id, downloader)
        elif download_yn == "c":
            break
    if links:
        print "Waiting for downloads to end..."
        downloader.links_filenames.join()
    feed_archive.store()
Heise is a German technology news site that offers an MP3 file for each article. The audio files are generated by a text-to-speech system and are quite intelligible. Downloading them manually usually ends in a clicking orgy, since there does not seem to be an RSS feed that contains the MP3s.
I tried to keep the script short (under 100 lines) while maintaining the kind of usability that is good enough for daily use. I'm not sure whether the handling of UTF-8 in the Archive class and the filtering of bad characters in the start_download function are conventional; a possible alternative for the latter is sketched below. I have to admit that I did not try to run the script on Windows. Suggestions on how to do it better are appreciated. The performance seems good enough for the 200 items in the feed; for bigger feeds, one would surely use a more intelligent approach to determine which items are new (see the second sketch below).
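Regarding the bad characters: a common alternative is to whitelist the characters you allow instead of blacklisting the ones you know are bad, which also catches Windows-reserved characters such as <, > and |. A minimal sketch (the sanitize_title name is mine, and note that a strict ASCII whitelist would also drop umlauts from the UTF-8 titles):

import re

def sanitize_title(title):
    """ Reduce an article title to a safe filename component. """
    title = re.sub(r"\s+", "_", title)             # whitespace -> underscores
    return re.sub(r"[^A-Za-z0-9_.-]", "", title)   # drop everything else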
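Regarding the new-item detection: the "link not in self.old_links" test in get_new_entries scans a list, so the method is effectively quadratic in the feed size. Converting the archived links to a set makes each membership test a constant-time lookup. A sketch of how the method could look under that change, assuming the feed links are unique (it also avoids the second pass over the entries):

def get_new_entries(self):
    old_links = set(self.old_links)  # O(1) membership tests
    new_entries = [entry for entry in self.feed["entries"]
                   if entry["link"].encode("utf-8") not in old_links]
    new_links = [entry["link"].encode("utf-8") for entry in new_entries]
    titles = [entry["title"].encode("utf-8") for entry in new_entries]
    # the article id sits between "meldung/" and "/from" in the link
    article_ids = [link.split("meldung/")[1].split("/from")[0]
                   for link in new_links]
    return new_links, titles, article_ids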