Recipe 580703 revision 1 « ActiveState Code

Script 1: Extract ALL images
----------------------------
#! python
'''
This demo extracts all images of a PDF as PNG files, whether they are
referenced by pages or not.
It scans through all objects and selects /Type/XObject with /Subtype/Image.
So runtime is determined by number of objects and image volume.
Usage:
extract_img2.py input.pdf
'''
from __future__ import print_function
import fitz
import sys, time, re

# Script body: walk every object of the PDF named on the command line and
# write each image XObject to "img-<xref>.png" in the current directory.

# Patterns are compiled once here instead of being looked up per object.
checkXO = re.compile(r"/Type(?= */XObject)")    # finds "/Type/XObject"
checkIM = re.compile(r"/Subtype(?= */Image)")   # finds "/Subtype/Image"

if len(sys.argv) != 2:
    print('Usage: %s <input file>' % sys.argv[0])
    # sys.exit is always available; the bare exit() builtin is only injected
    # by the site module and is missing under "python -S" or frozen builds.
    sys.exit(0)

# time.clock() was removed in Python 3.8. Prefer perf_counter and fall back
# to clock only on interpreters that predate perf_counter (Python 2).
timer = getattr(time, "perf_counter", None) or time.clock

t0 = timer()
doc = fitz.open(sys.argv[1])
imgcount = 0
lenXREF = doc._getXrefLength()         # number of objects - do not use entry 0!

# display some file info
print("file: %s, pages: %s, objects: %s" % (sys.argv[1], len(doc), lenXREF-1))

for i in range(1, lenXREF):            # scan through all objects
    text = doc._getObjectString(i)     # string defining the object
    # An image object must carry both /Type/XObject and /Subtype/Image.
    if not (checkXO.search(text) and checkIM.search(text)):
        continue
    imgcount += 1
    pix = fitz.Pixmap(doc, i)          # make pixmap from image
    if pix.n >= 5:                     # CMYK: must convert to RGB before PNG
        pix = fitz.Pixmap(fitz.csRGB, pix)
    pix.writePNG("img-%s.png" % (i,))
    pix = None                         # free Pixmap resources

t1 = timer()
print("run time", round(t1-t0, 2))
print("extracted images", imgcount)
--------------------------------------------------------------------------------------------------

Script 2: Only extract page-referenced images
---------------------------------------------
#! python
'''
This demo extracts, as PNG files, all images of a PDF that are referenced
by pages.
Runtime is determined by number of pages and volume of stored images.
Usage:
extract_img1.py input.pdf
'''
from __future__ import print_function
import fitz
import sys, time

# Script body: for every page of the PDF named on the command line, write
# each image listed for that page to "p<page>-<xref>.png" in the current
# directory.

if len(sys.argv) != 2:
    print('Usage: %s <input file>' % sys.argv[0])
    # sys.exit is always available; the bare exit() builtin is only injected
    # by the site module and is missing under "python -S" or frozen builds.
    sys.exit(0)

# time.clock() was removed in Python 3.8. Prefer perf_counter and fall back
# to clock only on interpreters that predate perf_counter (Python 2).
timer = getattr(time, "perf_counter", None) or time.clock

t0 = timer()
doc = fitz.open(sys.argv[1])
imgcount = 0
lenXREF = doc._getXrefLength()

# display some file info
print("file: %s, pages: %s, objects: %s" % (sys.argv[1], len(doc), lenXREF-1))

for i in range(len(doc)):              # i is the page number
    for img in doc.getPageImageList(i):
        xref = img[0]                  # xref number
        pix = fitz.Pixmap(doc, xref)   # make pixmap from image
        imgcount += 1
        if pix.n >= 5:                 # CMYK: must convert to RGB before PNG
            pix = fitz.Pixmap(fitz.csRGB, pix)
        # The file name carries the page number, so an image used on several
        # pages is written once per page.
        pix.writePNG("p%s-%s.png" % (i, xref))
        pix = None                     # free Pixmap resources

t1 = timer()
print("run time", round(t1-t0, 2))
print("extracted images", imgcount)
--------------------------------------------------------------------------------------------------
Recipe 580703 revision 1

History

Accounts

Code Recipes

Feedback & Information

ActiveState