import datetime
import string, os, sys, getopt
from xml.dom import minidom
__author__ = 'Luis Rei '
__homepage__ = 'http://luisrei.com'
__version__ = '1.0'
__date__ = '2008/03/23'
def convert(infile, outdir, authorDirs, categoryDirs):
"""Convert Wordpress Export File to multiple html files.
Keyword arguments:
infile -- the location of the Wordpress Export File
outdir -- the directory where the files will be created
authorDirs -- if true, create different directories for each author
categoryDirs -- if true, create directories for each category
"""
# First we parse the XML file into a list of posts.
# Each post is a dictionary
dom = minidom.parse(infile)
blog = [] # list that will contain all posts
for node in dom.getElementsByTagName('item'):
post = dict()
try:
post["title"] = node.getElementsByTagName('title')[0].firstChild.data
except AttributeError:
post['title'] = ''
post["date"] = node.getElementsByTagName('pubDate')[0].firstChild.data
post["author"] = node.getElementsByTagName(
'dc:creator')[0].firstChild.data
post["id"] = node.getElementsByTagName('wp:post_id')[0].firstChild.data
if node.getElementsByTagName('content:encoded')[0].firstChild != None:
post["text"] = node.getElementsByTagName(
'content:encoded')[0].firstChild.data
else:
post["text"] = ""
# wp:attachment_url could be use to download attachments
# Get the categories
tempCategories = []
for subnode in node.getElementsByTagName('category'):
tempCategories.append(subnode.getAttribute('nicename'))
categories = [x for x in tempCategories if x != '']
post["categories"] = categories
# Add post to the list of all posts
blog.append(post)
# Then we create the directories and HTML files from the list of posts.
last_date = None
for post in blog:
date = post['date']
if last_date and '-0001' in date:
date = date.replace('-0001', str(last_date.year))
date = datetime.datetime.strptime(date, '%a, %d %b %Y %H:%M:%S +0000')
last_date = date
# The "category" directories
path = ""
if authorDirs == True:
path += post["author"].encode('utf-8') + "/"
# This creates a path for the file in the format
# category1/category2/category3/file. Note that the category list was
# sorted.
if categoryDirs == True:
if (post["categories"] != None):
path += string.join(post["categories"],"/")
if os.path.exists(path) == False and path != "":
os.makedirs(path)
# And finally the file itself
path = outdir + path
title = post["title"].encode('utf-8')
fn_date = date.strftime('%Y%m%d-%H%M%S')
filename = os.path.join(path, '{}-{}.html'.format(fn_date, title)).replace(' ', '_').strip().lower()
# Add a meta tag to specify charset (UTF-8) in the HTML file
meta = """"""
# Convert the unicode object to a string that can be written to a file
# with the proper encoding (UTF-8)
text = post["text"].encode('utf-8')
# Replace simple newlines with
+ newline so that the HTML file
# represents the original post more accuratelly
text = text.replace("\n", "
\n")
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
print 'creating {}'.format(dirname)
os.makedirs(dirname)
with open(filename, 'wb') as f:
f.write(meta+"\n")
# Add "HTML header"
start = "\n\n"+ title +"\n\n\n"
f.write(start)
f.write(text)
# Finalize HTML
end = "\n\n"
f.write(end)
def usage(pname):
"""Displays usage information
keyword arguments:
pname -- program name (e.g. obtained as argv[0])
"""
print """python %s [-hac] [-o outdir] infile
Converts a Wordpress Export File to multiple html files.
Options:
-h,--help\tDisplays this information.
-a,--authors\tCreate different directories for each author.
-c,--categories\tCreate directory structure from post categories.
-o,--outdir\tSpecify a directory for the output.
Example:
python %s -c -o ~/TEMP ~/wordpress.2008-03-20.xml
""" % (pname, pname)
def main(argv):
outdir = ""
authors = False
categories = False
try:
opts, args = getopt.getopt(
argv[1:], "ha:o:c", ["help", "authors", "outdir", "categories"])
except getopt.GetoptError, err:
print str(err)
usage(argv[0])
sys.exit(2)
for opt, arg in opts:
if opt in ("-h", "--help"):
usage(argv[0])
sys.exit()
elif opt in ("-a", "--authors"):
authors = True
elif opt in ("-c", "--categories"):
categories = True
elif opt in ("-o", "--outdir"):
outdir = arg
infile = "".join(args)
if infile == "":
print "Error: Missing Argument: missing wordpress export file."
usage(argv[0])
sys.exit(3)
if outdir == "":
# Use the current directory
outdir = os.getcwd()
convert(infile, outdir, authors, categories)
if __name__ == "__main__":
main(sys.argv)