Welcome, guest | Sign In | My Account | Store | Cart

READ BEFORE YOU USE THE CODE

Requirements

  1. Windows platform
  2. Python 2.7
  3. pywin32, http://sourceforge.net/projects/pywin32/
  4. Microsoft Word installed on the machine that runs the script
Python, 47 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# A script that converts Word files (.doc/.docx) to plain-text (.txt) files.
# It drives the Word application through COM, so Word must be installed
# on the Windows machine that runs it.
# Requirements:
#    1. Windows platform
#    2. Python 2.7
#    3. pywin32, download from http://sourceforge.net/projects/pywin32/
#    4. Microsoft Word installed on the machine that runs the script
from win32com.client import constants, Dispatch
import pythoncom
import glob
import os
from zipfile import ZipFile

def convert_to_text(wordapp, wordfile):
    """Convert one Word document to a plain-text file.

    The output file has the same base name as ``wordfile`` with a ``.txt``
    extension; Word itself writes it through ``SaveAs``. The output name is
    printed before the document is opened.

    @arg wordapp: The Word IDispatch application object.
    @arg wordfile: The Word file name (``.doc`` or ``.docx``).
    @returns: The txt file name, or ``None`` when ``wordfile`` does not
        carry a ``.doc``/``.docx`` extension (nothing is opened then).
    """
    name, ext = os.path.splitext(wordfile)
    # Compare case-insensitively so that names such as 'REPORT.DOC', which
    # are ordinary Word files on Windows, are not silently skipped.
    if ext.lower() not in ('.doc', '.docx'):
        return None
    txtfile = name + '.txt'
    print(txtfile)
    # Numeric value of Word's WdSaveFormat.wdFormatTextLineBreaks constant.
    wdFormatTextLineBreaks = 3
    # Absolute paths are passed to Word -- presumably because Word does not
    # share this process's working directory.
    # Keep the document returned by Open instead of relying on
    # ActiveDocument, which may refer to a different document.
    document = wordapp.Documents.Open(os.path.abspath(wordfile))
    try:
        document.SaveAs(os.path.abspath(txtfile),
                        FileFormat=wdFormatTextLineBreaks)
    finally:
        # Always close, so a failed save does not leave the file open in Word.
        document.Close()
    return txtfile

def next_doc():
    """Generate the names of the Word files in the current work dir.

    Every ``*.doc`` match is yielded first, then every ``*.docx`` match,
    each in the order ``glob.glob`` reports them.
    """
    for pattern in ('*.doc', '*.docx'):
        for path in glob.glob(pattern):
            yield path

def convert_and_zip(zipfilename):
    """Convert all doc/docx files and zip the resulting txt files.

    Starts a Word application through COM, converts every file produced by
    ``next_doc()`` with ``convert_to_text()``, and adds each resulting txt
    file to a new zip archive. A progress line is printed per document.

    @arg zipfilename: Name of the zip archive to create (opened in 'w' mode).
    """
    word = Dispatch("Word.Application")
    try:
        with ZipFile(zipfilename, 'w') as fzip:
            for doc in next_doc():
                # Same text the original comma-separated print statement
                # emitted (note the two spaces after 'converting').
                print('converting  %s ...' % doc)
                txtfile = convert_to_text(word, doc)
                if txtfile:
                    fzip.write(txtfile)
    finally:
        # Quit Word even when a conversion or the zip step fails, so no
        # Word process is left running in the background.
        word.Quit()

3 comments

Vivek 11 years, 8 months ago  # | flag

I am getting an error on the print command in this script when compiling. Any ideas?

Shao-chuan Wang (author) 11 years, 8 months ago  # | flag

Vivek, did you use python 2.7?

zarfishan zahid 11 years, 5 months ago  # | flag

you can use online free converter like http://saaspose.com/api/words to convert your doc file to text files easily and accurately.