"""
HTMLTextToPDF.py
A demo program to show how to convert the text extracted from HTML
content, to PDF. It uses the Beautiful Soup library, v4, to
parse the HTML, and the xtopdf library to generate the PDF output.
Beautiful Soup is at: http://www.crummy.com/software/BeautifulSoup/
xtopdf is at: https://bitbucket.org/vasudevram/xtopdf
Guide to using and installing xtopdf:
http://jugad2.blogspot.in/2012/07/guide-to-installing-and-using-xtopdf.html
Author: Vasudev Ram - http://www.dancingbison.com
Copyright 2015 Vasudev Ram
"""

import sys

from bs4 import BeautifulSoup
from PDFWriter import PDFWriter


def usage():
    """Write a three-line usage message for this script to standard error.

    The message names the script via ``sys.argv[0]``.  Note that ``main()``
    in this file does not call this function and does not read any
    command-line arguments; it always converts the built-in sample text.
    """
    sys.stderr.write("Usage: python " + sys.argv[0] + " html_file pdf_file\n")
    sys.stderr.write("which will extract only the text from html_file and\n")
    sys.stderr.write("write it to pdf_file\n")


def main():
    """Convert the text of a built-in sample document to ``HTMLTextTo.pdf``.

    The sample is parsed with Beautiful Soup, its text is extracted with
    ``get_text()``, split on newlines, and each resulting line is written to
    the PDF through a ``PDFWriter`` configured with a font, header and footer.
    """
    # Create some HTML for testing conversion of its text to PDF.
    # NOTE(review): as it appears here the sample contains no HTML tags
    # (they may have been lost at some point) -- confirm against the
    # original demo.  The string value is kept exactly as found.
    html_doc = (
        " Test file for HTMLTextToPDF This is text within the body element"
        " but outside any paragraph.\n"
        "\n"
        "This is a paragraph of text. Hey there, how do you do? The quick"
        " red fox jumped over the slow blue cow.\n"
        "\n"
        "This is another paragraph of text. Don't mind what it contains."
        " What is mind? Not matter. What is matter? Never mind.\n"
        "\n"
        "This is also text within the body element but not within any"
        " paragraph. "
    )

    pw = PDFWriter("HTMLTextTo.pdf")
    pw.setFont("Courier", 10)
    pw.setHeader("Conversion of HTML text to PDF")
    pw.setFooter("Generated by xtopdf: http://slid.es/vasudevram/xtopdf")

    # Use method chaining this time.
    # The parser is named explicitly: without it Beautiful Soup picks
    # whichever parser happens to be installed and emits a warning, so the
    # parse could differ from one machine to another.  "html.parser" ships
    # with the Python standard library.
    for line in BeautifulSoup(html_doc, "html.parser").get_text().split("\n"):
        pw.writeLine(line)
    pw.savePage()
    pw.close()


if __name__ == '__main__':
    main()