Converts WordPress export files (XML) to multiple HTML files and optionally uses categories and authors to create a directory structure.
import datetime
import string, os, sys, getopt
from xml.dom import minidom
__author__ = 'Luis Rei <luis.rei@gmail.com>'
__homepage__ = 'http://luisrei.com'
__version__ = '1.0'
__date__ = '2008/03/23'
def _element_text(node, tag, default=u''):
    """Return the text of the first ``tag`` element found below ``node``.

    All text and CDATA children are concatenated, so content that was split
    over several CDATA sections is returned whole. ``default`` is returned
    when the element is missing or holds no text at all.
    """
    elements = node.getElementsByTagName(tag)
    if not elements:
        return default
    text_types = (minidom.Node.TEXT_NODE, minidom.Node.CDATA_SECTION_NODE)
    chunks = [child.data for child in elements[0].childNodes
              if child.nodeType in text_types]
    if not chunks:
        return default
    return u''.join(chunks)


def _parse_posts(infile):
    """Parse the export file into a list of post dictionaries."""
    dom = minidom.parse(infile)
    posts = []
    for node in dom.getElementsByTagName('item'):
        nicenames = [category.getAttribute('nicename')
                     for category in node.getElementsByTagName('category')]
        posts.append({
            'title': _element_text(node, 'title', u'<no title>'),
            'date': _element_text(node, 'pubDate'),
            'author': _element_text(node, 'dc:creator'),
            'id': _element_text(node, 'wp:post_id'),
            'text': _element_text(node, 'content:encoded'),
            # <category> elements without a nicename attribute are skipped.
            'categories': [name for name in nicenames if name],
        })
    return posts


def _path_component(name):
    """Return ``name`` with path separators replaced by '-'.

    Keeps a title or author name containing '/' from silently turning into
    extra directory levels.
    """
    for separator in (os.sep, os.altsep):
        if separator:
            name = name.replace(separator, u'-')
    return name


def convert(infile, outdir, authorDirs, categoryDirs):
    """Convert Wordpress Export File to multiple html files.

    One file named ``<YYYYmmdd-HHMMSS>-<title>.html`` is written per <item>
    of the export. The generated part of the path (directories and file
    name) is lower-cased and has spaces replaced by underscores; ``outdir``
    itself is used exactly as given.

    Keyword arguments:
    infile -- the location of the Wordpress Export File
    outdir -- the directory where the files will be created
    authorDirs -- if true, create different directories for each author
    categoryDirs -- if true, create directories for each category

    Raises ValueError when a post's pubDate does not match the
    'Sun, 23 Mar 2008 10:11:12 +0000' layout (this includes a missing date).
    """
    # Imported here because the file-level imports do not provide it.
    from xml.sax.saxutils import escape

    posts = _parse_posts(infile)

    # Python 2 passes byte-string paths (sys.argv, os.getcwd()); decode them
    # so they can be joined with the unicode titles taken from the XML.
    if isinstance(outdir, bytes):
        outdir = outdir.decode(sys.getfilesystemencoding() or 'utf-8')

    # Meta tag declaring the charset the file is written in (UTF-8).
    meta = (u'<META http-equiv="Content-Type" '
            u'content="text/html; charset=UTF-8">')

    last_date = None
    for post in posts:
        date = post['date']
        # A '-0001' placeholder year (presumably an unpublished draft --
        # TODO confirm) borrows the year of the previously processed post.
        if last_date and '-0001' in date:
            date = date.replace('-0001', str(last_date.year))
        date = datetime.datetime.strptime(date, '%a, %d %b %Y %H:%M:%S +0000')
        last_date = date

        # Path below outdir: [author/][category1/category2/...]file
        parts = []
        if authorDirs:
            parts.append(_path_component(post['author']))
        if categoryDirs:
            parts.extend(_path_component(name)
                         for name in post['categories'])
        fn_date = date.strftime('%Y%m%d-%H%M%S')
        parts.append(u'{}-{}.html'.format(
            fn_date, _path_component(post['title'])))

        # Normalise only the generated part; the caller's outdir stays as is.
        relative = os.path.join(*[part for part in parts if part])
        relative = relative.replace(' ', '_').strip().lower()
        filename = os.path.join(outdir, relative)

        # dirname is empty when writing straight into the current directory.
        dirname = os.path.dirname(filename)
        if dirname and not os.path.exists(dirname):
            print(u'creating {}'.format(dirname))
            os.makedirs(dirname)

        # Replace simple newlines with <br/> + newline so that the HTML file
        # represents the original post more accurately.
        body = post['text'].replace('\n', '<br/>\n')
        document = u''.join([
            meta, u'\n',
            # The title is plain text and must be escaped; the post body is
            # already HTML and is written unchanged.
            u'<html>\n<head>\n<title>', escape(post['title']),
            u'</title>\n</head>\n<body>\n',
            body,
            u'\n</body>\n</html>',
        ])
        # Encode once at the I/O boundary; works on Python 2 and 3 alike.
        with open(filename, 'wb') as f:
            f.write(document.encode('utf-8'))
def usage(pname):
    """Print usage information to standard output.

    Keyword arguments:
    pname -- program name (e.g. obtained as argv[0])
    """
    template = (
        "python %s [-hac] [-o outdir] infile\n"
        "Converts a Wordpress Export File to multiple html files.\n"
        "Options:\n"
        "-h,--help\tDisplays this information.\n"
        "-a,--authors\tCreate different directories for each author.\n"
        "-c,--categories\tCreate directory structure from post categories.\n"
        "-o,--outdir\tSpecify a directory for the output.\n"
        "Example:\n"
        "python %s -c -o ~/TEMP ~/wordpress.2008-03-20.xml\n"
    )
    # A single parenthesised argument is valid both as the Python 2 print
    # statement and as the Python 3 print function.
    print(template % (pname, pname))
def main(argv):
    """Parse the command line and run the conversion.

    Keyword arguments:
    argv -- the full argument vector, program name first (e.g. sys.argv)

    Exits with status 2 on an unknown or malformed option and with status 3
    when the export file argument is missing or more than one positional
    argument is given. -h/--help prints the usage text and exits normally.
    """
    outdir = ""
    authors = False
    categories = False
    try:
        # Only -o/--outdir takes a value (hence "o:" and "outdir=");
        # -h, -a and -c are plain flags.
        opts, args = getopt.getopt(
            argv[1:], "hao:c", ["help", "authors", "outdir=", "categories"])
    except getopt.GetoptError as err:
        print(str(err))
        usage(argv[0])
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage(argv[0])
            sys.exit()
        elif opt in ("-a", "--authors"):
            authors = True
        elif opt in ("-c", "--categories"):
            categories = True
        elif opt in ("-o", "--outdir"):
            outdir = arg
    if len(args) > 1:
        # Gluing several arguments together would only yield a bogus path.
        print("Error: Too many arguments: expected a single wordpress "
              "export file.")
        usage(argv[0])
        sys.exit(3)
    infile = args[0] if args else ""
    if infile == "":
        print("Error: Missing Argument: missing wordpress export file.")
        usage(argv[0])
        sys.exit(3)
    if outdir == "":
        # Use the current directory
        outdir = os.getcwd()
    convert(infile, outdir, authors, categories)
# Run the converter only when this file is executed as a script, so that
# importing it as a module has no side effects.
if __name__ == "__main__":
    main(sys.argv)
|
I used this to export my notes out of a wordpress blog (and into evernote.com).
Issues/notes: It currently does not handle images, attachments, or comments. It was only tested on Mac OS X (10.5). It was not "carefully" developed, e.g. poor exception handling, little testing, ...
To get a WordPress export file, see http://wordpress.com/blog/2006/06/12/xml-import-export/