#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@created: 2016-04-23 13:40:00

@author: Jorj X. McKie

Find all fonts used in a PDF or other document.

Each page's text is extracted as XML and scanned for <span> elements
(block -> line -> span).  Every distinct (page, font, size) combination is
recorded.  The result is printed twice: first as a flat list, then grouped
by font / size together with the pages on which that combination occurs.
"""
from __future__ import print_function

import fitz                      # PyMuPDF
from lxml import etree           # or any other XML parser

doc = fitz.Document("<document>.<ext>pdf")  # all document types supported by MuPDF

fonts = []      # [page number, font name, font size], in order of first occurrence
seen = set()    # the same entries as tuples, so duplicate checks are not list scans

for i in range(doc.pageCount):
    xml = doc.getPageText(i, output="xml")  # get XML version of page's text
    try:
        # Empty c="" attributes are replaced by a blank before parsing
        # (original author's note: needed to get rid of control codes).
        root = etree.fromstring(xml.replace('c=""', 'c=" "'))
    except (etree.XMLSyntaxError, ValueError):
        # Only parse failures are skipped; KeyboardInterrupt, SystemExit and
        # genuine programming errors are no longer swallowed.
        print("invalid xml syntax on page", i)
        continue

    # Spans are visited in document order: block -> line -> span.
    for s in root.findall("block/line/span"):
        font = s.get("font")
        if font is None:
            continue            # a span without a font name is ignored
        size = float(s.get("size", 0.0))   # a missing size counts as 0.0
        entry = (i, font, size)
        if entry not in seen:
            seen.add(entry)
            fonts.append([i, font, size])

# Now, all font occurrences are stored and can be output ...
# ... just as a simple list ...
print(" Font usage by page ".center(80, "="))
for f in fonts:
    print(f)

# ... or e.g. by font / size naming the pages where it occurs
print("\n")
pglist = {}     # "font (size)" -> list of page numbers
for page, font, size in fonts:
    fkey = "%s (%s)" % (font, str(size))    # font + size
    pages = pglist.setdefault(fkey, [])
    if page not in pages:
        pages.append(page)

for fkey, pages in pglist.items():
    print(fkey, pages)