Quantcast
Channel: Python
Viewing all articles
Browse latest Browse all 15

PDF Page Extraction/Selection in Python Using PyPDF

$
0
0

Site Section: 

Keywords: 

The pyPdf package provides convenient facilities for PDF document manipulation. Here is a simple command-line script that extracts a specified contiguous range of pages from a PDF file.

#! /usr/bin/env python
 
###############################################################################
##
##  Copyright 2012 Jeet Sukumaran.
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 3 of the License, or
##  (at your option) any later version.
##
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##  GNU General Public License for more details.
##
##  You should have received a copy of the GNU General Public License along
##  with this program. If not, see <http://www.gnu.org/licenses/>.
##
###############################################################################
 
"""
Extract specified pages from source PDF.
"""
 
import sys
import os
import argparse
import pyPdf
 
# Script metadata. The description reuses the module docstring so that the
# text passed to argparse and the docstring cannot drift apart.
__author__ = "Jeet Sukumaran"
__copyright__ = "Copyright (C) 2012 Jeet Sukumaran."
__version__ = "1.0.0"
__description__ = __doc__
# File name of this script, without its directory part.
__prog__ = os.path.basename(__file__)
 
def main():
    """
    Main CLI handler.

    Parses the command line, converts the requested 1-based page range into
    the 0-based page indices used by pyPdf, validates that range against the
    source document, and writes the selected pages to the output file (or to
    standard output when no output path is given).

    Exits through ``parser.error`` (usage message, status 2) when the page
    range is not a valid integer range inside the document, and through
    ``sys.exit`` with a message when LAST-PAGE is a bare '+'.
    """

    parser = argparse.ArgumentParser(description=__description__)
    parser.add_argument("--version", action="version", version="%(prog)s " + __version__)
    parser.add_argument("src_pdf",
            metavar="SOURCE-PDF",
            type=argparse.FileType('rb'),
            help="path to input pdf file")
    parser.add_argument("first_page",
            metavar="FIRST-PAGE",
            type=int,
            help="number of first page (1-based index: first page is '1')")
    parser.add_argument("last_page",
            metavar="LAST-PAGE",
            type=str,
            help="number of last page; if preceded by '+' (e.g., '+30'), specifies number of pages following first page to extract")
    parser.add_argument("-o", "--output-filepath",
            type=str,
            default=None,
            help="path to output file (if not given, will write to standard output)")

    args = parser.parse_args()

    # argparse has already opened the source file; make sure it is closed on
    # every exit path, including the error exits below.
    try:
        # Command-line page numbers are 1-based; pyPdf page indices are 0-based.
        first_page = args.first_page - 1
        if first_page < 0:
            # A negative index would silently select pages counted from the
            # end of the document instead of failing.
            parser.error("FIRST-PAGE must be 1 or greater (got {0})".format(args.first_page))

        try:
            if args.last_page.startswith("+"):
                # '+N' form: N pages following the first page.
                extra_pages = args.last_page[1:].replace(" ", "")
                if not extra_pages:
                    sys.exit("Need to specify number of pages")
                last_page = first_page + int(extra_pages)
            else:
                last_page = int(args.last_page) - 1
        except ValueError:
            parser.error("LAST-PAGE must be an integer, optionally preceded by '+' (got '{0}')".format(args.last_page))

        if last_page < first_page:
            # Without this check an empty PDF would be written silently.
            parser.error("LAST-PAGE must not precede FIRST-PAGE")

        pdf_in = pyPdf.PdfFileReader(args.src_pdf)
        num_pages = pdf_in.getNumPages()
        if last_page >= num_pages:
            parser.error("requested pages up to {0}, but the document has only {1} page(s)".format(last_page + 1, num_pages))

        pdf_out = pyPdf.PdfFileWriter()
        for pg_num in range(first_page, last_page + 1):
            pdf_out.addPage(pdf_in.getPage(pg_num))

        # The source stream is deliberately still open while writing.
        if args.output_filepath:
            out_path = os.path.expandvars(os.path.expanduser(args.output_filepath))
            with open(out_path, "wb") as out_stream:
                pdf_out.write(out_stream)
        else:
            # PDF data is binary: use the underlying byte stream where stdout
            # is a text stream (Python 3), and leave stdout open for the
            # caller instead of closing it.
            out_stream = getattr(sys.stdout, "buffer", sys.stdout)
            pdf_out.write(out_stream)
            out_stream.flush()
    finally:
        args.src_pdf.close()
 
# Run the command-line handler only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Viewing all articles
Browse latest Browse all 15

Trending Articles