Script 1: Extract ALL images ---------------------------- #! python ''' This demo extracts all images of a PDF as PNG files, whether they are referenced by pages or not. It scans through all objects and selects /Type/XObject with /Subtype/Image. So runtime is determined by number of objects and image volume. Usage: extract_img2.py input.pdf ''' from __future__ import print_function import fitz import sys, time, re checkXO = r"/Type(?= */XObject)" # finds "/Type/XObject" checkIM = r"/Subtype(?= */Image)" # finds "/Subtype/Image" if len(sys.argv) != 2: print('Usage: %s <input file>' % sys.argv[0]) exit(0) t0 = time.clock() doc = fitz.open(sys.argv[1]) imgcount = 0 lenXREF = doc._getXrefLength() # number of objects - do not use entry 0! # display some file info print("file: %s, pages: %s, objects: %s" % (sys.argv[1], len(doc), lenXREF-1)) for i in range(1, lenXREF): # scan through all objects text = doc._getObjectString(i) # string defining the object isXObject = re.search(checkXO, text) # tests for XObject isImage = re.search(checkIM, text) # tests for Image if not isXObject or not isImage: # not an image object if not both True continue imgcount += 1 pix = fitz.Pixmap(doc, i) # make pixmap from image if pix.n < 5: # can be saved as PNG pix.writePNG("img-%s.png" % (i,)) else: # must convert the CMYK first pix0 = fitz.Pixmap(fitz.csRGB, pix) pix0.writePNG("img-%s.png" % (i,)) pix0 = None # free Pixmap resources pix = None # free Pixmap resources t1 = time.clock() print("run time", round(t1-t0, 2)) print("extracted images", imgcount) -------------------------------------------------------------------------------------------------- Script 2: Only extract page-referenced images --------------------------------------------- #! python ''' This demo extracts all images of a PDF as PNG files that are referenced by pages. Runtime is determined by number of pages and volume of stored images. Usage: extract_img1.py input.pdf ''' from __future__ import print_function import fitz import sys, time if len(sys.argv) != 2: print('Usage: %s <input file>' % sys.argv[0]) exit(0) t0 = time.clock() doc = fitz.open(sys.argv[1]) imgcount = 0 lenXREF = doc._getXrefLength() # display some file info print("file: %s, pages: %s, objects: %s" % (sys.argv[1], len(doc), lenXREF-1)) for i in range(len(doc)): imglist = doc.getPageImageList(i) for img in imglist: xref = img[0] # xref number pix = fitz.Pixmap(doc, xref) # make pixmap from image imgcount += 1 if pix.n < 5: # can be saved as PNG pix.writePNG("p%s-%s.png" % (i, xref)) else: # must convert CMYK first pix0 = fitz.Pixmap(fitz.csRGB, pix) pix0.writePNG("p%s-%s.png" % (i, xref)) pix0 = None # free Pixmap resources pix = None # free Pixmap resources t1 = time.clock() print("run time", round(t1-t0, 2)) print("extracted images", imgcount) --------------------------------------------------------------------------------------------------