Welcome, guest | Sign In | My Account | Store | Cart
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@created: 2016-04-23 13:40:00
@updated: 2016-08-25 20:00:00
@author: Jorj X. McKie

Find all fonts used in a PDF.
"""
from __future__ import print_function
import fitz                       # PyMuPDF

# Open the document to inspect ("file.pdf" is a placeholder path).
doc = fitz.open("file.pdf")

# Walk the pages by number. len(doc) is an int, so it must be wrapped in
# range() to be iterable; iterating over the bare int raises TypeError.
for i in range(len(doc)):
    fontlist = doc.getPageFontList(i)
    if not fontlist:
        # Nothing to report for this page (no header, no entries).
        continue
    print("fonts used on page", i)
    for font in fontlist:
        # Only the first five items of each font entry are reported.
        print("xref=%s, gen=%s, type=%s, basefont=%s, name=%s" % (font[0], font[1], font[2], font[3], font[4]))

Diff to Previous Revision

--- revision 2 2016-04-25 10:44:32
+++ revision 3 2016-08-26 00:02:48
@@ -2,57 +2,19 @@
 # -*- coding: utf-8 -*-
 """
 @created: 2016-04-23 13:40:00
-
+@updated: 1016-08-25 20:00:00
 @author: Jorj X. McKie
 
-Find all fonts used in a PDF or other document.
+Find all fonts used in a PDF.
 """
 from __future__ import print_function
 import fitz                       # PyMuPDF
-from lxml import etree            # or any other XML parser
 
-doc = fitz.Document("<document>.<ext>pdf")         # all document types supported by MuPDF
-fonts=[]
-for i in range(doc.pageCount):
-    xml = doc.getPageText(i, output="xml")         # get XML version of page's text
-    try:
-        root = etree.fromstring(xml.replace('c="&#x8;"', 'c=" "'))  # need to get rid of ctrl codes
-    except:
-        print ("invalid xml syntax on page", i)
-        continue
-    for b in root.findall("block"):
-        for l in b.findall("line"):
-            for s in l.findall("span"):
-                ds = dict(s.items())
-                if "font" in ds:
-                    font=ds["font"]
-                else:
-                    continue
-                if "size" in ds:
-                    size = float(ds["size"])
-                else:
-                    size = 0.0
-                if [i, font, size] not in fonts:
-                    fonts.append([i, font, size])
+doc = fitz.open("file.pdf")
 
-# Now, all font occurrences are stored and can by output ...
-# ... just as a simple list ...
-print(" Font usage by page ".center(80, "="))
-for f in fonts:
-    print(f)
-
-# ... or e.g. by font / size naming the pages where it occurs
-print("\n")
-pglist = {}
-for f in fonts:
-    fkey = "%s (%s)" % (f[1], str(f[2]))             # font + size
-    if fkey in pglist.keys():
-        if f[0] in pglist[fkey]:
-            continue
-        else:
-            pglist[fkey].append(f[0])
-    else:
-        pglist[fkey] = [f[0]]
-
-for f in pglist.keys():
-    print(f, pglist[f])
+for i in len(doc):
+    fontlist = doc.getPageFontList(i)
+    if fontlist:
+        print("fonts used on page", i)
+    for font in fontlist:
+        print("xref=%s, gen=%s, type=%s, basefont=%s, name=%s" % (font[0], font[1], font[2], font[3], font[4]))

History