This recipe shows how to convert Microsoft Word .DOCX files to PDF format, using the python-docx library and my xtopdf toolkit for PDF creation.
Note: The recipe has some limitations. E.g. fonts, tables, etc. from the input DOCX file are not preserved in the output PDF file.
# DOCXtoPDF.py
# Author: Vasudev Ram - http://www.dancingbison.com
# Copyright 2012 Vasudev Ram, http://www.dancingbison.com
# This is open source code, released under the New BSD License -
# see http://www.opensource.org/licenses/bsd-license.php .
# This program uses the python-docx library, available at:
# https://github.com/mikemaccana/python-docx
import sys
import os
import os.path
import string
from textwrap import TextWrapper
from docx import opendocx, getdocumenttext
from PDFWriter import PDFWriter
def docx_to_pdf(infilename, outfilename):
    """Extract the text from a DOCX file and write it to a PDF file.

    The paragraphs returned by getdocumenttext() are wrapped to 70 columns
    and written through PDFWriter in 12-point Courier, with an empty line
    written after each paragraph.

    Args:
        infilename: name of the DOCX file to read.
        outfilename: name of the PDF file to create.

    If the input file cannot be opened, the error is printed and the
    process exits with status 1.
    """
    try:
        infil = opendocx(infilename)
    except Exception as e:
        # Report the actual file name that failed, not the variable's name.
        print("Error opening %s" % (infilename,))
        print("Exception: " + repr(e) + "\n")
        sys.exit(1)

    paragraphs = getdocumenttext(infil)

    pw = PDFWriter(outfilename)
    pw.setFont("Courier", 12)
    pw.setHeader("DOCXtoPDF - convert text in DOCX file to PDF")
    pw.setFooter("Generated by xtopdf and python-docx")
    wrapper = TextWrapper(width=70, drop_whitespace=False)

    for paragraph in paragraphs:
        # Wrap first, then encode each finished line to UTF-8.  Encoding
        # before wrapping would make the 70-column limit count bytes and
        # could break a long word in the middle of a multi-byte character.
        # NOTE(review): assumes getdocumenttext() yields Unicode strings,
        # as the original per-paragraph .encode("utf-8") implies - confirm.
        for line in wrapper.wrap(paragraph):
            pw.writeLine(line.encode("utf-8"))
        # Empty line between paragraphs.
        pw.writeLine("")

    pw.savePage()
    pw.close()
def usage():
    """Return the command-line usage message, ending with a newline.

    The output file is shown with a .pdf extension because main() rejects
    any output filename whose extension is not .PDF.
    """
    return "Usage: python DOCXtoPDF.py infile.docx outfile.pdf\n"
def main():
    """Validate the command-line arguments and run the conversion.

    Expects exactly two arguments: an input filename with a .DOCX
    extension and an output filename with a .PDF extension (compared
    case-insensitively).  On a bad argument the complaint and the usage
    message are printed and the process exits with status 1.  Any other
    exception is reported on stderr, also with exit status 1.
    """
    try:
        argv = sys.argv
        # Script name plus the two filenames.
        if len(argv) != 3:
            print("Wrong number of arguments")
            print(usage())
            sys.exit(1)

        infilename, outfilename = argv[1], argv[2]

        # (filename, required extension, complaint), checked in this order.
        extension_checks = (
            (infilename, ".DOCX", "Input filename extension should be .DOCX"),
            (outfilename, ".PDF", "Output filename extension should be .PDF"),
        )
        for filename, required_ext, complaint in extension_checks:
            actual_ext = os.path.splitext(filename)[1]
            if actual_ext.upper() != required_ext:
                print(complaint)
                print(usage())
                sys.exit(1)

        docx_to_pdf(infilename, outfilename)
    except Exception as e:
        sys.stderr.write("Error: " + repr(e) + "\n")
        sys.exit(1)
# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()
# EOF
Run the program with a command of the form:

    python DOCXtoPDF.py infilename.docx outfilename.pdf

To use this recipe, you have to install ReportLab, then xtopdf, and also python-docx.
Thanks to the author of the python-docx library.
More details are available at:
http://jugad2.blogspot.in/2013/10/convert-microsoft-word-files-to-pdf.html