#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@created: 2016-04-23 13:40:00
@author: Jorj X. McKie
Find all fonts used in a PDF or other document.
"""
from __future__ import print_function
import fitz # PyMuPDF
from lxml import etree # or any other XML parser
# Open the document and list every font / size combination it uses, first
# page by page and then grouped by font, naming the pages where each occurs.
doc = fitz.Document("<document>.<ext>pdf") # all document types supported by MuPDF

# `fonts` holds one [page, font, size] entry per distinct combination, in
# first-seen order (this is what gets printed). `seen` mirrors it as a set of
# tuples so the duplicate check does not rescan the whole list for every span.
fonts = []
seen = set()
for i in range(doc.pageCount):
    xml = doc.getPageText(i, output="xml")  # get XML version of page's text
    try:
        # c="" is rewritten to c=" " before parsing; per the original
        # author's note this is needed to get rid of control codes.
        root = etree.fromstring(xml.replace('c=""', 'c=" "'))
    except Exception:
        # Best effort: report the page and carry on with the next one.
        # Unlike a bare `except:`, this still lets KeyboardInterrupt and
        # SystemExit propagate, so the script can be interrupted.
        print("invalid xml syntax on page", i)
        continue

    # Visit every <span> reachable as block -> line -> span below the root.
    for span in root.findall("block/line/span"):
        font = span.get("font")
        if font is None:
            continue  # spans without a font attribute are ignored
        size = float(span.get("size", 0.0))  # a missing size counts as 0.0
        key = (i, font, size)
        if key not in seen:
            seen.add(key)
            fonts.append([i, font, size])

# Now, all font occurrences are stored and can be output ...
# ... just as a simple list ...
print(" Font usage by page ".center(80, "="))
for f in fonts:
    print(f)

# ... or e.g. by font / size naming the pages where it occurs
print("\n")
pglist = {}
for page, font, size in fonts:
    fkey = "%s (%s)" % (font, str(size))  # font + size
    pages = pglist.setdefault(fkey, [])
    # Distinct entries can still map to the same key text, so keep the
    # page list free of duplicates.
    if page not in pages:
        pages.append(page)
for fkey, pages in pglist.items():
    print(fkey, pages)
# Diff to Previous Revision
# --- revision 1 2016-04-25 10:42:47
# +++ revision 2 2016-04-25 10:44:32
# @@ -24,7 +24,6 @@
#  for l in b.findall("line"):
#  for s in l.findall("span"):
#  ds = dict(s.items())
# - size = 0
#  if "font" in ds:
#  font=ds["font"]
#  else: