#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@created: 2016-04-23 13:40:00
@updated: 2016-08-25 20:00:00
@author: Jorj X. McKie
Find all fonts used in a PDF.
"""
from __future__ import print_function
import fitz # PyMuPDF
# Open the document to inspect; the path is hard-coded in this example.
doc = fitz.open("file.pdf")

# Visit every page by index. len(doc) is an int and therefore not iterable
# by itself, so it is wrapped in range().
for i in range(len(doc)):
    fontlist = doc.getPageFontList(i)
    if not fontlist:
        continue  # nothing to report for this page
    print("fonts used on page", i)
    for font in fontlist:
        # The first five items of each entry are printed under these labels.
        print("xref=%s, gen=%s, type=%s, basefont=%s, name=%s"
              % (font[0], font[1], font[2], font[3], font[4]))
Diff to Previous Revision
--- revision 2 2016-04-25 10:44:32
+++ revision 3 2016-08-26 00:02:48
@@ -2,57 +2,19 @@
# -*- coding: utf-8 -*-
"""
@created: 2016-04-23 13:40:00
-
+@updated: 1016-08-25 20:00:00
@author: Jorj X. McKie
-Find all fonts used in a PDF or other document.
+Find all fonts used in a PDF.
"""
from __future__ import print_function
import fitz # PyMuPDF
-from lxml import etree # or any other XML parser
-doc = fitz.Document("<document>.<ext>pdf") # all document types supported by MuPDF
-fonts=[]
-for i in range(doc.pageCount):
- xml = doc.getPageText(i, output="xml") # get XML version of page's text
- try:
- root = etree.fromstring(xml.replace('c=""', 'c=" "')) # need to get rid of ctrl codes
- except:
- print ("invalid xml syntax on page", i)
- continue
- for b in root.findall("block"):
- for l in b.findall("line"):
- for s in l.findall("span"):
- ds = dict(s.items())
- if "font" in ds:
- font=ds["font"]
- else:
- continue
- if "size" in ds:
- size = float(ds["size"])
- else:
- size = 0.0
- if [i, font, size] not in fonts:
- fonts.append([i, font, size])
+doc = fitz.open("file.pdf")
-# Now, all font occurrences are stored and can by output ...
-# ... just as a simple list ...
-print(" Font usage by page ".center(80, "="))
-for f in fonts:
- print(f)
-
-# ... or e.g. by font / size naming the pages where it occurs
-print("\n")
-pglist = {}
-for f in fonts:
- fkey = "%s (%s)" % (f[1], str(f[2])) # font + size
- if fkey in pglist.keys():
- if f[0] in pglist[fkey]:
- continue
- else:
- pglist[fkey].append(f[0])
- else:
- pglist[fkey] = [f[0]]
-
-for f in pglist.keys():
- print(f, pglist[f])
+for i in len(doc):
+ fontlist = doc.getPageFontList(i)
+ if fontlist:
+ print("fonts used on page", i)
+ for font in fontlist:
+ print("xref=%s, gen=%s, type=%s, basefont=%s, name=%s" % (font[0], font[1], font[2], font[3], font[4]))