Welcome, guest | Sign In | My Account | Store | Cart
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@created: 2016-04-23 13:40:00

@author: Jorj X. McKie

Find all fonts used in a PDF or other document.
"""
from __future__ import print_function
import fitz                       # PyMuPDF
from lxml import etree            # or any other XML parser

# Collect every distinct (page, font, size) combination used in the document.
# The file name below is a placeholder; replace it with a real path.
# NOTE(review): pageCount / getPageText look like the legacy camelCase PyMuPDF
# names -- confirm they exist in the installed PyMuPDF version.
doc = fitz.Document("<document>.<ext>pdf")         # all document types supported by MuPDF
fonts = []                  # [page number, font name, font size], in first-seen order
seen = set()                # the same entries as tuples, for fast duplicate checks
for i in range(doc.pageCount):
    xml = doc.getPageText(i, output="xml")         # get XML version of page's text
    try:
        # need to get rid of ctrl codes: replace the &#x8; character reference by a blank
        root = etree.fromstring(xml.replace('c="&#x8;"', 'c=" "'))
    except (etree.XMLSyntaxError, ValueError):
        # Only parse failures are skipped; Ctrl-C and SystemExit still propagate.
        print("invalid xml syntax on page", i)
        continue
    # Same traversal as nested findall("block") / findall("line") / findall("span").
    for span in root.iterfind("block/line/span"):
        font = span.get("font")
        if font is None:                           # spans without a font are ignored
            continue
        size = float(span.get("size", 0.0))        # a missing size counts as 0.0
        key = (i, font, size)
        if key not in seen:
            seen.add(key)
            fonts.append([i, font, size])

# Now, all font occurrences are stored and can be output ...
# ... just as a simple list ...
# Flat report: one line per collected [page, font, size] entry.
print(" Font usage by page ".center(80, "="))
for entry in fonts:
    print(entry)

# ... or e.g. by font / size naming the pages where it occurs
print("\n")
pglist = {}                 # "font (size)" -> list of page numbers, without repeats
for page, name, size in fonts:
    label = "%s (%s)" % (name, str(size))            # font + size
    pages = pglist.setdefault(label, [])
    if page not in pages:
        pages.append(page)

for label, pages in pglist.items():
    print(label, pages)

History