This is a very raw PDF-to-text converter: it has no knowledge of the page layout or of text positioning.
To install the required module, run `easy_install pypdf`
in a console.
import sys
import pyPdf
def getPDFContent(path):
    """Return the text of the PDF at *path* as one whitespace-normalised string.

    Text is pulled out page by page with pyPdf's ``extractText()``; nothing
    is known about page layout or reading order. Non-breaking spaces
    (U+00A0) are turned into ordinary spaces and every run of whitespace,
    line breaks included, is collapsed to a single space.

    path -- filesystem path of the PDF file to read.

    Returns a unicode string with no leading or trailing whitespace; it is
    empty when no text was extracted.
    """
    # Open the PDF in a with-block so the handle is always closed, even if
    # extraction raises. All pages are extracted while the stream is open.
    with open(path, "rb") as pdf_file:
        pdf = pyPdf.PdfFileReader(pdf_file)
        page_texts = [pdf.getPage(i).extractText()
                      for i in range(pdf.getNumPages())]

    # Separate pages with a space so words on adjacent pages do not fuse;
    # the exact separator is irrelevant because whitespace is collapsed next.
    content = u" ".join(page_texts)

    # split() with no argument drops leading/trailing whitespace and splits
    # on any whitespace run, so re-joining with one space normalises it.
    return u" ".join(content.replace(u"\xa0", u" ").split())
# Command-line entry point: convert the PDF named by the first argument and
# save its text next to it as "<pdf path>.txt".
if len(sys.argv) < 2:
    sys.exit("usage: %s FILE.pdf" % sys.argv[0])

# Extract before opening the output file, so a failed extraction does not
# leave an empty .txt behind.
text = getPDFContent(sys.argv[1])

# getPDFContent returns unicode. Encode it explicitly and write bytes:
# writing unicode to a text-mode file under Python 2 uses the ASCII codec
# and fails on the first non-ASCII character.
with open(sys.argv[1] + '.txt', 'wb') as f:
    f.write(text.encode("utf-8"))
#print getPDFContent(sys.argv[1]).encode("ascii", "xmlcharrefreplace")
|