Convert Wordpress Export File to multiple html files « Python recipes

Converts Wordpress Export Files (XML) to multiple html files and optionally uses tags and authors to create a directory structure.

import datetime
import getopt
import io
import os
import string
import sys
from xml.dom import minidom


# Module metadata: author contact, home page, release version and release date.
__author__ = 'Luis Rei <luis.rei@gmail.com>'
__homepage__ = 'http://luisrei.com'
__version__ = '1.0'
__date__ = '2008/03/23'


def _first(node, tag):
    """Return the first descendant element of *node* named *tag*."""
    return node.getElementsByTagName(tag)[0]


def _node_text(element):
    """Return the concatenated text and CDATA children of *element*.

    An element whose content is split over several CDATA sections (which
    is how a literal "]]>" has to be written in XML) is joined back into
    one string instead of being cut off after the first section.  Returns
    an empty string when the element has no text at all.
    """
    text_types = (minidom.Node.TEXT_NODE, minidom.Node.CDATA_SECTION_NODE)
    return u''.join(child.data for child in element.childNodes
                    if child.nodeType in text_types)


def _parse_posts(dom):
    """Return every <item> element of *dom* as a post dictionary.

    Each dictionary has the keys "title", "date", "author", "id", "text"
    and "categories" (a list of the non-empty "nicename" attributes).
    """
    blog = []
    for node in dom.getElementsByTagName('item'):
        nicenames = [category.getAttribute('nicename')
                     for category in node.getElementsByTagName('category')]
        blog.append({
            # Posts without a title get a placeholder.
            'title': _node_text(_first(node, 'title')) or u'<no title>',
            'date': _first(node, 'pubDate').firstChild.data,
            'author': _first(node, 'dc:creator').firstChild.data,
            'id': _first(node, 'wp:post_id').firstChild.data,
            'text': _node_text(_first(node, 'content:encoded')),
            # <category> elements without a nicename are ignored.
            'categories': [name for name in nicenames if name != ''],
        })
        # wp:attachment_url could be used to download attachments.
    return blog


def _path_component(name):
    """Normalise *name* into a single lower-case path component.

    Path separators are replaced so that a title such as "a/b" cannot
    silently create extra directories; spaces become underscores.
    """
    for separator in ('/', os.sep, os.altsep):
        if separator:
            name = name.replace(separator, '_')
    return name.replace(' ', '_').strip().lower()


def _render_html(title, text):
    """Return the complete HTML document for one post as a text string."""
    # The run of spaces before "charset" is kept on purpose so the generated
    # files stay byte-for-byte identical to those of earlier versions.
    meta = (u'<META http-equiv="Content-Type" content="text/html; '
            u'        charset=UTF-8">')
    # The title is plain text and must be escaped; the post body already is
    # HTML and is passed through untouched.
    safe_title = (title.replace(u'&', u'&amp;')
                       .replace(u'<', u'&lt;')
                       .replace(u'>', u'&gt;'))
    return u''.join([
        meta, u'\n',
        u'<html>\n<head>\n<title>', safe_title, u'</title>\n</head>\n<body>\n',
        # Replace simple newlines with <br/> + newline so that the HTML file
        # represents the original post more accurately.
        text.replace(u'\n', u'<br/>\n'),
        u'\n</body>\n</html>',
    ])


def convert(infile, outdir, authorDirs, categoryDirs):
    """Convert Wordpress Export File to multiple html files.

    Every post is written to
    <outdir>/[author/][category1/category2/...]<date>-<title>.html.
    The author, category and file name components are lower-cased and have
    their spaces replaced by underscores; outdir is used exactly as given.

    Keyword arguments:
    infile -- the location of the Wordpress Export File
    outdir -- the directory where the files will be created
    authorDirs -- if true, create different directories for each author
    categoryDirs -- if true, create directories for each category
    """
    # First we parse the XML file into a list of posts (dictionaries).
    blog = _parse_posts(minidom.parse(infile))

    # Work with text paths throughout so that non-ASCII authors, categories
    # and titles can be joined with the output directory.
    if isinstance(outdir, bytes):
        outdir = outdir.decode(sys.getfilesystemencoding() or 'utf-8')

    # Then we create the directories and HTML files from the list of posts.
    last_date = None
    for post in blog:
        stamp = post['date']
        # Some items carry the placeholder year "-0001" (presumably posts
        # that were never published), which strptime cannot parse; borrow
        # the year of the previous post.  Such an item appearing first
        # still raises ValueError.
        if last_date and '-0001' in stamp:
            stamp = stamp.replace('-0001', str(last_date.year))
        date = datetime.datetime.strptime(stamp, '%a, %d %b %Y %H:%M:%S +0000')
        last_date = date

        # Build the path relative to outdir: author/category.../file.
        components = []
        if authorDirs:
            components.append(post['author'])
        if categoryDirs:
            components.extend(post['categories'])
        components.append(u'{}-{}.html'.format(
            date.strftime('%Y%m%d-%H%M%S'), post['title']))
        components = [part for part in map(_path_component, components)
                      if part]

        filename = os.path.join(outdir, *components)

        # Directories are only ever created below outdir, never in the
        # current working directory.
        dirname = os.path.dirname(filename)
        if dirname and not os.path.isdir(dirname):
            print(u'creating {}'.format(dirname))
            os.makedirs(dirname)

        # Encode once, at the I/O boundary; newline='\n' disables newline
        # translation so the bytes are the same on every platform.
        with io.open(filename, 'w', encoding='utf-8', newline='\n') as f:
            f.write(_render_html(post['title'], post['text']))

def usage(pname):
    """Print command-line usage information to standard output.

    Keyword arguments:
    pname -- program name (e.g. obtained as argv[0])

    """
    # A single parenthesised argument keeps this line valid both as a
    # Python 2 print statement and as a Python 3 print() call.
    print("""python %s [-hac] [-o outdir] infile
    Converts a Wordpress Export File to multiple html files.
    
    Options:
        -h,--help\tDisplays this information.
        -a,--authors\tCreate different directories for each author.
        -c,--categories\tCreate directory structure from post categories.
        -o,--outdir\tSpecify a directory for the output.
        
    Example:
    python %s -c -o ~/TEMP ~/wordpress.2008-03-20.xml
        """ % (pname, pname))


def main(argv):
    """Parse the command line and run the conversion.

    Exits with status 2 on an invalid option, with status 3 when the
    export file argument is missing or given more than once, and with
    status 0 after printing the help text.

    Keyword arguments:
    argv -- the full argument vector, program name first (e.g. sys.argv)
    """
    outdir = ""
    authors = False
    categories = False

    try:
        # -a and -c are plain flags; only -o/--outdir takes a value (the
        # trailing ":" in the short spec and "=" in the long spec).
        opts, args = getopt.getopt(
            argv[1:], "haco:", ["help", "authors", "outdir=", "categories"])
    except getopt.GetoptError as err:
        print(str(err))
        usage(argv[0])
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage(argv[0])
            sys.exit()
        elif opt in ("-a", "--authors"):
            authors = True
        elif opt in ("-c", "--categories"):
            categories = True
        elif opt in ("-o", "--outdir"):
            outdir = arg

    if not args:
        print("Error: Missing Argument: missing wordpress export file.")
        usage(argv[0])
        sys.exit(3)

    if len(args) > 1:
        # Gluing several arguments together would only yield a bogus path.
        print("Error: Too many arguments: expected a single wordpress "
              "export file.")
        usage(argv[0])
        sys.exit(3)

    infile = args[0]

    if outdir == "":
        # Use the current directory
        outdir = os.getcwd()

    convert(infile, outdir, authors, categories)
	

# Script entry point: pass the raw command line to main() when this file is
# executed directly rather than imported.
if __name__ == '__main__':
    main(sys.argv)

      

I used this to export my notes out of a wordpress blog (and into evernote.com).

Issues/notes: Currently does not handle images, attachments or comments. Was only tested on Mac OS X (10.5). Was not "carefully" developed, e.g. poor exception handling, little testing, ...

For getting a Wordpress Export File see http://wordpress.com/blog/2006/06/12/xml-import-export/

Tags: xml

◄	Python recipes (4591)	►
◄	rreboto's recipes (1)	►

Convert Wordpress Export File to multiple html files (Python recipe) by rreboto
Forked from Recipe 551792 (Some cleanup, added support for posts without titles, changed output filename format)

ActiveState Code (http://code.activestate.com/recipes/578892/)

Tags

Required Modules

Other Information and Tasks

Accounts

Code Recipes

Feedback & Information

ActiveState

Convert Wordpress Export File to multiple html files (Python recipe) by rreboto Forked from Recipe 551792 (Some cleanup, added support for posts without titles, changed output filename format) ActiveState Code (http://code.activestate.com/recipes/578892/)