Site Section:
Keywords:
The pyPdf package provides really nice facilities for PDF document manipulation. Here is a simple application script to extract a specified subset of pages from a PDF file.
#! /usr/bin/env python

###############################################################################
##
##  Copyright 2012 Jeet Sukumaran.
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 3 of the License, or
##  (at your option) any later version.
##
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
##  GNU General Public License for more details.
##
##  You should have received a copy of the GNU General Public License along
##  with this program. If not, see <http://www.gnu.org/licenses/>.
##
###############################################################################

"""
Extract specified pages from source PDF.
"""

import sys
import os
import argparse

import pyPdf

__prog__ = os.path.basename(__file__)
__version__ = "1.0.0"
__description__ = __doc__
__author__ = 'Jeet Sukumaran'
__copyright__ = 'Copyright (C) 2012 Jeet Sukumaran.'


def _resolve_page_range(first_page, last_page, num_pages):
    """
    Translate the command-line page arguments into zero-based page indices.

    Parameters:
        first_page: 1-based number of the first page to extract (int).
        last_page: string giving either the 1-based number of the last page,
            or '+N' meaning "N further pages after the first page".
        num_pages: total number of pages in the source document.

    Returns:
        A tuple ``(first_index, last_index)`` of zero-based, inclusive
        indices, guaranteed to satisfy
        ``0 <= first_index <= last_index < num_pages``.

    Raises:
        ValueError: if the specification is malformed or falls outside the
            document; the message is suitable for showing to the user.
    """
    if first_page < 1:
        # Without this check a value of 0 or less would become a negative
        # index, which may silently select pages counted from the end.
        raise ValueError(
            "First page must be 1 or greater (got %d)" % first_page)
    first_index = first_page - 1

    is_relative = last_page.startswith("+")
    spec = last_page[1:].replace(" ", "") if is_relative else last_page
    if is_relative and not spec:
        raise ValueError("Need to specify number of pages")
    try:
        value = int(spec)
    except ValueError:
        raise ValueError("Invalid last page specification: %r" % last_page)

    # '+N' counts pages *following* the first one, so the last index is
    # simply N further along; a plain number is a 1-based page number.
    last_index = first_index + value if is_relative else value - 1

    if last_index < first_index:
        raise ValueError("Last page must not precede first page")
    if last_index >= num_pages:
        raise ValueError(
            "Requested pages %d-%d, but source document has only %d page(s)"
            % (first_index + 1, last_index + 1, num_pages))
    return first_index, last_index


def main():
    """
    Main CLI handler.

    Parses the command line, copies the requested page range from the source
    PDF into a new PDF, and writes the result to the output path or, if none
    is given, to standard output. Exits with an error message if the page
    range is malformed or does not fit the source document.
    """
    parser = argparse.ArgumentParser(description=__description__)
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s " + __version__)
    parser.add_argument(
        "src_pdf",
        metavar="SOURCE-PDF",
        type=argparse.FileType('rb'),
        help="path to input pdf file")
    parser.add_argument(
        "first_page",
        metavar="FIRST-PAGE",
        type=int,
        help="number of first page (1-based index: first page is '1')")
    parser.add_argument(
        "last_page",
        metavar="LAST-PAGE",
        type=str,
        help="number of last page; if preceded by '+' (e.g., '+30'), "
             "specifies number of pages following first page to extract")
    parser.add_argument(
        "-o", "--output-filepath",
        type=str,
        default=None,
        help="path to output file (if not given, will write to standard output)")
    args = parser.parse_args()

    try:
        pdf_in = pyPdf.PdfFileReader(args.src_pdf)
        try:
            first_index, last_index = _resolve_page_range(
                args.first_page, args.last_page, pdf_in.getNumPages())
        except ValueError as err:
            sys.exit(str(err))

        pdf_out = pyPdf.PdfFileWriter()
        for pg_num in range(first_index, last_index + 1):
            pdf_out.addPage(pdf_in.getPage(pg_num))

        # The output is only opened once the request is known to be valid,
        # so a bad page range never truncates an existing file.
        if args.output_filepath:
            out_path = os.path.expandvars(
                os.path.expanduser(args.output_filepath))
            with open(out_path, "wb") as out_stream:
                pdf_out.write(out_stream)
        else:
            # PDF data is binary: use the underlying byte stream where
            # sys.stdout is a text wrapper (Python 3). Flush rather than
            # close, since stdout is not ours to close.
            out_stream = getattr(sys.stdout, "buffer", sys.stdout)
            pdf_out.write(out_stream)
            out_stream.flush()
    finally:
        # Closed only after writing: the page objects added to the writer
        # presumably still read from the source stream at write time.
        args.src_pdf.close()


if __name__ == '__main__':
    main()