Welcome, guest | Sign In | My Account | Store | Cart

The script converts all doc and docx files in a specified folder to pdf files. It checks whether the provided absolute path actually exists and whether the specified folder contains any doc or docx files. It does not traverse the directory recursively. The script is not portable and runs only on a Windows machine. Based on my experience, I recommend closing MS Word before running the script.

Python, 94 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from os import chdir, getcwd, listdir, path
from time import strftime
from win32com import client

def count_files(filetype, directory=None):
    ''' (str[, str]) -> int
    Returns the number of entries in a directory whose names end with the
    given file type. The match is case-sensitive and not recursive.

    filetype  -- suffix to look for, including the dot (e.g. ".docx").
    directory -- directory to inspect; when omitted, the module-level
                 "folder" chosen by the user is used.

    >>> count_files(".docx")
    11
    '''
    if directory is None:
        # Fall back to the folder the script was pointed at.
        directory = folder
    return sum(1 for name in listdir(directory) if name.endswith(filetype))

# Function "check_path" is used to check whether the path the user provided does
# actually exist. The user is prompted for a path until the existence of the
# provided path has been verified.

def check_path(prompt):
    ''' (str) -> str
    Verifies if the provided absolute path does exist.
    '''
    abs_path = raw_input(prompt)
    while path.exists(abs_path) != True:
        print "\nThe specified path does not exist.\n"
        abs_path = raw_input(prompt)
    return abs_path    
    
print "\n"

folder = check_path("Provide absolute path for the folder: ")

# Change the directory.

chdir(folder)

# Count the number of docx and doc files in the specified folder.

num_docx = count_files(".docx")
num_doc = count_files(".doc")

# Check if the number of docx or doc files is equal to 0 (= there are no files
# to convert) and if so stop executing the script. 

if num_docx + num_doc == 0:
    print "\nThe specified folder does not contain docx or docs files.\n"
    print strftime("%H:%M:%S"), "There are no files to convert. BYE, BYE!."
    exit()
else:
    print "\nNumber of doc and docx files: ", num_docx + num_doc, "\n"
    print strftime("%H:%M:%S"), "Starting to convert files ...\n"

# Try to open win32com instance. If unsuccessful return an error message.

try:
    word = client.DispatchEx("Word.Application")
    for files in listdir(getcwd()):
        if files.endswith(".docx"):
            new_name = files.replace(".docx", r".pdf")
            in_file = path.abspath(folder + "\\" + files)
            new_file = path.abspath(folder + "\\" + new_name)
            doc = word.Documents.Open(in_file)
            print strftime("%H:%M:%S"), " docx -> pdf ", path.relpath(new_file)
            doc.SaveAs(new_file, FileFormat = 17)
            doc.Close()
        if files.endswith(".doc"):
            new_name = files.replace(".doc", r".pdf")
            in_file = path.abspath(folder + "\\" + files)
            new_file = path.abspath(folder + "\\" + new_name)
            doc = word.Documents.Open(in_file)
            print strftime("%H:%M:%S"), " doc  -> pdf ", path.relpath(new_file)
            doc.SaveAs(new_file, FileFormat = 17)
            doc.Close()
except Exception, e:
    print e
finally:
    word.Quit()

print "\n", strftime("%H:%M:%S"), "Finished converting files."    

# Count the number of pdf files.

num_pdf = count_files(".pdf")   

print "\nNumber of pdf files: ", num_pdf

# Check if the number of docx and doc file is equal to the number of files.

if num_docx + num_doc == num_pdf:
    print "\nNumber of doc and docx files is equal to number of pdf files."
else:
    print "\nNumber of doc and docx files is not equal to number of pdf files."

12 comments

Ofek Lev 7 years, 7 months ago  # | flag

Nice code, though there is much more to converting file formats than just changing the extensions.

Fabian Mayer (author) 7 years, 7 months ago  # | flag

Hi Ofek!

The script does more than just changing the extensions. It opens the .docx or .doc file (without actually displaying it) and saves it as a pdf file. See line 66:

doc.SaveAs(new_file, FileFormat = 17)

17 is the value for a .pdf file. See "WdSaveFormat Enumeration" from the MS Developer Network website (http://msdn.microsoft.com/en-us/library/bb238158.aspx).

Ofek Lev 7 years, 7 months ago  # | flag

Ahh, missed that line completely. Very nice script! I would recommend this though to remove some boilerplate:

`
try:
    word = client.DispatchEx("Word.Application")
    for files in listdir(getcwd()):
        match = 0
        if files.endswith(".doc"): s, match = "doc", 1
        elif files.endswith(".docx"): s, match = "docx", 1
        if match:
            new_name = files.replace("."+s, r".pdf")
            in_file = path.abspath(folder + "\\" + files)
            new_file = path.abspath(folder + "\\" + new_name)
            doc = word.Documents.Open(in_file)
            print strftime("%H:%M:%S"), " " +s+ " -> pdf ", path.relpath(new_file)
            doc.SaveAs(new_file, FileFormat = 17)
            doc.Close()
except Exception, e:
    print e
finally:
    word.Quit()`
soliu 7 years, 6 months ago  # | flag

Nice code with great comment. But does it mean that I can use this code to convert HTML/htm file to PDF just by replacing doc/docx with HTML/htm ?

Rod Villanueva 7 years, 6 months ago  # | flag

Hi Fabian, I am using Python 2.7.6(32 bit version) and I tried the program but gave me the following error:

13:31:38 doc -> pdf HomesteadDeclaration_1-1.pdf (-2147352567, 'Exception occurred.', (0, u'Microsoft Word', u'Command failed', u'wdmain11.chm', 36966, -2146824090), None)

13:31:38 Finished converting files.

Number of pdf files: 3

Number of doc and docx files is not equal to number of pdf files.

Fabian Mayer (author) 7 years, 6 months ago  # | flag

@ soliu:

Sorry, for my late reply.

No. That will not work.

In line 66 I specify the file type:

doc.SaveAs(new_file, FileFormat = 17)

17 is the value for a .pdf file. See "WdSaveFormat Enumeration" from the MS Developer Network website (http://msdn.microsoft.com/en-us/library/bb238158.aspx).

Perhaps xhtml2pdf is what you are looking for (https://pypi.python.org/pypi/xhtml2pdf/)

CFS Formations 7 years, 5 months ago  # | flag

Hi Fabian, I am running Windows Server 2012, using Python 3.4.1 64-bit to run this script. After making the required changes from 2.x to 3.x, this script runs perfectly fine inside the Interactive Shell.

However, after creating a .bat file to run this and then calling the file inside php, it returns an error.

This line is returning the following error:

doc.SaveAs(new_file, FileFormat = 17)

Error: "'NoneType' object has no attribute 'SaveAs'"

Could you please advise where I am going wrong with this?

Thank you for any help.

achim 6 years, 4 months ago  # | flag

Here's an update for Python3 using input instead of raw_input and using print() vs. print

achim 6 years, 4 months ago  # | flag

Here's an update for Python3 - input vs. raw_input, using print() vs. print: Recipe 579080

Recipe 579080

dheeraj vadlani 6 years, 4 months ago  # | flag

How to run the above program on ubuntu....Plzz help

dheeraj vadlani 6 years, 4 months ago  # | flag

<script>alert("help me");</script>

Anudeep 6 years ago  # | flag

Hi, I tried your code and it works really well. Thank You. I was trying to modify it to make it run for multiple .doc/.docx files in subfolders, but it throws an error -

(-2147352571, 'Type mismatch.', None, 1)

Unable to figure out how this error can be rectified. Can you once look at my code below and suggest?

import re
import os
from os import chdir, getcwd, listdir, path
from os import walk
from time import strftime
from win32com import client
path = 'C:/Users/anudeep/Desktop/Sample_test'
word_file_names = []
def count_files(filetype):
    ''' (str) -> int
    Returns the number of filenames given a specified file type.
    >>> count_files(".docx")
    11
    '''
    count_files = 0
    for dirpath, dirnames, filenames in walk(path):
        print dirpath
        for f in filenames:
            if f.lower().endswith(filetype):
                count_files += 1
    return count_files
def check_path(prompt):
    ''' (str) -> str
    Verifies if the provided absolute path does exist.
    '''
    abs_path = raw_input(prompt)
    while path.exists(abs_path) != True:
        print "\nThe specified path does not exist.\n"
        abs_path = raw_input(prompt)
    return abs_path
try:
    word = client.DispatchEx("Word.Application")
    for dirpath, dirnames, filenames in os.walk(path):
        print dirpath
        for f in filenames:
            if f.lower().endswith(".docx") and re.search('Addendum', f):
                new_name = f.replace(".docx", r".pdf")
                in_file = word_file_names.append(dirpath + "\\" + f)
                new_file = word_file_names.append(dirpath + "\\" + new_name)
                doc = word.Documents.Open(in_file)
                print strftime("%H:%M:%S"), " docx -> pdf ", path.relpath(new_file)
                doc.SaveAs(new_file, FileFormat = 17)
                doc.Close()
            if f.lower().endswith(".doc") and re.search('Addendum', f):
                new_name = f.replace(".doc", r".pdf")
                in_file = word_file_names.append(dirpath + "\\" + f)
                new_file = word_file_names.append(dirpath + "\\" + new_name)
                doc = word.Documents.Open(in_file)
                print strftime("%H:%M:%S"), " doc  -> pdf ", path.relpath(new_file)
                doc.SaveAs(new_file, FileFormat = 17)
                doc.Close()
except Exception, e:
    print e
finally:
    word.Quit()
print "\n", strftime("%H:%M:%S"), "Finished converting files."