READ BEFORE YOU USE THE CODE
Requirements
- Windows platform
- Python 2.7
- pywin32, http://sourceforge.net/projects/pywin32/
- Word application installed on running machine
# A script that converts Word files to text files.
# requires word application on Windows machine
# requirement:
# 1. Windows platform
# 2. python 2.7
# 3. pywin32, download from http://sourceforge.net/projects/pywin32/
# 4. word application installed on running machine
from win32com.client import constants, Dispatch
import pythoncom
import glob
import os
from zipfile import ZipFile
def convert_to_text(wordapp, wordfile):
    """Convert one Word document to a plain-text file next to it.

    Args:
        wordapp: The Word ``IDispatch`` application object used to open
            and save the document.
        wordfile: Name of the Word file (``.doc`` or ``.docx``; the
            extension is matched case-insensitively).

    Returns:
        The name of the text file that was written (same base name with a
        ``.txt`` extension), or ``None`` if *wordfile* is not a Word
        document and nothing was converted.
    """
    name, ext = os.path.splitext(wordfile)
    # Compare case-insensitively so that e.g. "REPORT.DOC" is not skipped.
    if ext.lower() not in ('.doc', '.docx'):
        return None
    txtfile = name + '.txt'
    # Single-argument call form prints the same under Python 2 and 3.
    print(txtfile)
    # Value of Word's WdSaveFormat constant of the same name.
    wdFormatTextLineBreaks = 3
    # Work on the document returned by Open() rather than ActiveDocument,
    # so the save/close always target the file that was just opened.
    document = wordapp.Documents.Open(os.path.abspath(wordfile))
    try:
        document.SaveAs(os.path.abspath(txtfile),
                        FileFormat=wdFormatTextLineBreaks)
    finally:
        # Close even when SaveAs fails so the document is not left open.
        document.Close()
    return txtfile
def next_doc():
    """Yield the name of each Word document in the current working directory.

    All ``*.doc`` matches are yielded first, followed by all ``*.docx``
    matches. Files whose name starts with ``~$`` are skipped.
    """
    for pattern in ('*.doc', '*.docx'):
        for path in glob.glob(pattern):
            # Word keeps an owner/lock file named "~$<name>" beside any
            # document that is currently open; it matches the pattern but
            # is not a real document, so do not hand it to the converter.
            if path.startswith('~$'):
                continue
            yield path
def convert_and_zip(zipfilename):
    """Convert all doc/docx files to text and zip the resulting txt files.

    Starts a Word application through COM, converts every document yielded
    by ``next_doc()`` with ``convert_to_text()``, and adds each text file
    that was produced to a new zip archive. The text files themselves are
    left on disk.

    Args:
        zipfilename: Path of the zip archive to create (opened in ``'w'``
            mode, so an existing file is overwritten).
    """
    word = Dispatch("Word.Application")
    try:
        with ZipFile(zipfilename, 'w') as fzip:
            for doc in next_doc():
                # One pre-formatted argument keeps the output identical to
                # the original Python 2 print statement (hence the double
                # space) while also being valid under Python 3.
                print('converting  %s ...' % doc)
                txtfile = convert_to_text(word, doc)
                if txtfile:
                    fzip.write(txtfile)
    finally:
        # Always shut Word down; otherwise a failed conversion would leave
        # the Word process started above running.
        word.Quit()
I am getting an error on the print command in this script while compiling. Any ideas?
Vivek, did you use python 2.7?
You can also use a free online converter such as http://saaspose.com/api/words to convert your doc files to text files easily and accurately.