#!/usr/bin/perl
########################################################################
#  This program uses ghostscript and its pbm driver to extract absolute 
#  bounding box information for any postscript file.
#
#  The output of the program is the bounding box for every page in the
#  document.
#
#  This program could certainly be made faster if written in a compiled
#  language, but the time spent within it is probably neglible compared 
#  to the time spent by ghostscript.
#
#  Bugs: 
#    The program may be confused by files that directly set the page
#    size.
#
#  Copyright (C) 2003 Dov Grobgeld <dov@imagic.weizmann.ac.il>
#  
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
# 
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
# 
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
# 
#  For more details see the file COPYING.
########################################################################

my $VERSION = "0.2";

use strict;
no utf8;

my $do_through_pbm = undef;  # Turn this on for old gs compatibility
my $GSPATH = "gs"; # Your path to gs
my($baseres, $pagesize, $maxpages, $evenodd, $insert, $pad);

while( $_ = $ARGV[0], /^-/) {
    shift;
    /-res/     && do { $baseres = shift; next }; 
    /-letter/  && do { $pagesize = 'letter'; next };
    /-a4/      && do { $pagesize = 'a4'; next; };
    /-pages/   && do { $maxpages = shift; next; };
    /-evenodd/ && do { $evenodd++; next; };
    /-insert/  && do { $insert++; next; };
    /-bypbm/   && do { $do_through_pbm++; next; };
    /-pad/     && do { $pad = shift; next; };
    /-help/    && do { print <<EOHELP; exit; };
$0 - Calculate the bounding boxes of all pages in a PostScript file

Syntax:
    $0 [-res r] [-letter|-a4] filename
    
Description:
    The program outputs the page number and the Bounding Box parameters
    for every page in the file filename, by postprocessing the output
    from the gs bbox device.

Options:
    -res res  Choose a resolution for ghostscript other than the default
              72 lpi.
    -letter   Choose paper size. Default is a4.
    -evenodd  Report separate document bounding boxes for odd and even pages
    -pages n  Set max number of pages to analyze to n.
    -insert   Edit the postscript file and change/insert a EPSF directive
              and a BoundingBox statement.
    -bypbm    Calculate bounding box by pbm device (obsolete).
    -pad p    Padd bounding box on all sides with p postscript points.

Requirements:
    Ghostscript in path with bbox or pbmraw support compiled in.

Author:
    Dov Grobgeld, Rehovot, Israel, 2003
EOHELP
    die "Unknown option $_!\n";
}

# defaults
$pagesize = 'a4' unless $pagesize;
$baseres = 72 unless $baseres;
$maxpages = 1_000_000 unless $maxpages;

# table of known paper sizes.
# perl5
# %papersizes = ( 'a4'=> [595,842], 'letter', [612,792]);
# ($pw,$ph) = @{$papersize{$pagesize}};

my %paperwidth =  ( 'a4', 595, 'letter', 612 );
my %paperheight = ( 'a4', 842, 'letter', 792 );

my $pw = $paperwidth{$pagesize};
my $ph = $paperheight{$pagesize};

# Calculate a resolution of about $baseres that gives the pixelwidth 

# document bounding box initialization
my $docllx=$pw;
my $doclly=$ph;
my $docurx=0;
my $docury=0;
my $odddocllx=$pw;
my $odddoclly=$ph;
my $odddocurx=0;
my $odddocury=0;
my $evendocllx=$pw;
my $evendoclly=$ph;
my $evendocurx=0;
my $evendocury=0;

# get filename
my $fn = shift || die "Expected postscript file name!\n";
-f $fn || die "No such file $fn!\n";

# Print header
print "    Page:   llx   lly   urx   ury\n";

$|++; # auto flush output
# Use the new ghostscript bbox device
if (!$do_through_pbm) {
    my $gsopt = "-q -dNOPAUSE -sPAPERSIZE=$pagesize -sDEVICE=bbox -sOutputFile=-";
    my $gscmd = "$GSPATH -r72 $gsopt -- $fn";
    open(GS, "$gscmd 2>&1|");
    my $page = 0;

    while(<GS>) {
	# Check if we are using an old ghostscript version
	if (/^Unknown device:/) {
	    $do_through_pbm++;
	}
	next unless /^\%\%BoundingBox:/;
	my($llx,$lly,$urx,$ury) = split(' ', $');

	$page++;
        printf "    %4d  %5.f %5.f %5.f %5.f\n", $page, $llx, $lly, $urx, $ury;
	next if $llx == 0 && $urx == 0 && $lly == 0 && $ury == 0;
	
        if ($pad) {
            $llx-=$pad;
            $lly-=$pad;
            $urx+=$pad;
            $ury+=$pad;
        }
    
        if ($evenodd) {
	    if ($page % 2) {  # odd
		$odddocllx=$llx if $llx<$odddocllx;
		$odddoclly=$lly if $lly<$odddoclly;
		$odddocurx=$urx if $urx>$odddocurx;
		$odddocury=$ury if $ury>$odddocury;
	    }
	    else {
		$evendocllx=$llx if $llx<$evendocllx;
		$evendoclly=$lly if $lly<$evendoclly;
		$evendocurx=$urx if $urx>$evendocurx;
		$evendocury=$ury if $ury>$evendocury;
	    }
        }
    	    
        $docllx=$llx if $llx<$docllx;
        $doclly=$lly if $lly<$doclly;
        $docurx=$urx if $urx>$docurx;
        $docury=$ury if $ury>$docury;
    
        if ($page >= $maxpages) {
            print "skipping rest of file...\n";
            last;
        }
	
    }
}

######################################################################
#  This is the old pbm routine that renders pages in pbm and then
#  parses the binary output.
######################################################################
if ($do_through_pbm) {
    # divisible by 8.
    my $w8 = int($pw / 72 * $baseres /8 + 0.5) * 8;
    my $res = 72.0*$w8/$pw;
    
    # create an empty row for optimization
    my $emptyrow = "\0" x ($w8/8);

    my $gsopt = "-q -dNOPAUSE -sPAPERSIZE=$pagesize -sDEVICE=pbmraw -sOutputFile=-";
    open(GS, "$GSPATH -r$res $gsopt -- $fn|");
    
    my $page=0;
    my $bot;
    my ($lmcand, $rmcand);
    while(!eof(GS)) {
        chop($_=<GS>);
        die "Expected P4 but got '$_'. \nProbably a PostScript error...\n" unless /P4/;
        while(<GS>) { last unless /^\#/; }
        my ($w,$h)=split(" ", $_);
        unless ($w8 == $w) {
	    warn "Warning! Expected bitmap width $w8 but got $w. Adjusting resolution...\n";
	    $res *= $w8 / $w;
	    close(GS);
            open(GS, "$GSPATH -r$res $gsopt -- $fn|");
	    next;
        }
        $page++;
        printf "    %4d  ", $page;
        my $topmarg = 0;
        my $botmarg = 0;
        my $leftmarg = $w;
        my $rightmarg = $w;
        my $top=1;
        for my $i (1..$h) {
    	read(GS, $_, $w/8 );
    
            # Check for an empty row
    	#if ($_ eq $emptyrow)  
    	if ($_=~ /^\00*$/) {
    	    if ($top) {   # Still scanning top margin?
    	        $topmarg++;
    	    }
    	    else {
    		$botmarg = 0 unless $bot; # reset the bottom margin
    	        $bot=1;
    	        $botmarg++;
    	    }
    	}
    	# Otherwise get the left and right margins of the row
    	else {
    	    $bot=0; $top=0; # Not counting top and bottom margins anymore
    
    	    # Get left margin of line
    	    /^(\00*)([^\00])/;
    	    $lmcand=length($1)*8;
    	    if ($lmcand < $leftmarg) {
    		($b=unpack('B*',$2))=~ /^0*/;
    		$lmcand+= length($&);
    		$leftmarg = $lmcand if $lmcand < $leftmarg;
    	    }
    
                # Get right margin of line
    	    /([^\00])(\00*)$/;
    	    $rmcand=length($2)*8;
    	    if ($rmcand < $rightmarg) {
    		($b = unpack('B*', $1))=~ /0*$/;
    		$rmcand+= length($&);
    		$rightmarg = $rmcand if ($rmcand < $rightmarg);
    	    }
    	}
        }
    
        # scale and translate to postscript points
        my $scale = 72/$res;
        my $ury= ($h-$topmarg)*$scale;
        my $lly= $botmarg*$scale;
        my $llx= $leftmarg*$scale;
        my $urx= ($w-$rightmarg)*$scale;
        $lly= $h*$scale if $lly==$ury && $lly==0;
        printf "%5.f %5.f %5.f %5.f\n", $llx, $lly, $urx, $ury;
    
        if ($pad) {
            $llx-=$pad;
            $lly-=$pad;
            $urx+=$pad;
            $ury+=$pad;
        }
    
        if ($evenodd) {
	    if ($page % 2) {  # odd
		$odddocllx=$llx if $llx<$odddocllx;
		$odddoclly=$lly if $lly<$odddoclly;
		$odddocurx=$urx if $urx>$odddocurx;
		$odddocury=$ury if $ury>$odddocury;
	    }
	    else {
		$evendocllx=$llx if $llx<$evendocllx;
		$evendoclly=$lly if $lly<$evendoclly;
		$evendocurx=$urx if $urx>$evendocurx;
		$evendocury=$ury if $ury>$evendocury;
	    }
        }
    	    
        $docllx=$llx if $llx<$docllx;
        $doclly=$lly if $lly<$doclly;
        $docurx=$urx if $urx>$docurx;
        $docury=$ury if $ury>$docury;
    
        if ($page >= $maxpages) {
            print "skipping rest of file...\n";
            last;
        }
    }
}

print "Document: ";
printf "%5.f %5.f %5.f %5.f\n", $docllx, $doclly, $docurx, $docury;
if ($evenodd) {
    print "Odd:      ";
    printf "%5.f %5.f %5.f %5.f\n", $odddocllx, $odddoclly, $odddocurx, $odddocury;
    print "Even:     ";
    printf "%5.f %5.f %5.f %5.f\n", $evendocllx, $evendoclly, $evendocurx, $evendocury;
}

if ($insert) {
    rename($fn, "$fn~");
    open(PSIN, "$fn~"); open(PSOUT, ">$fn");
    print PSOUT "%!PS-Adobe-1.0 EPSF-1.0\n";
    print PSOUT "%%BoundingBox: ";
    printf PSOUT "%5.f %5.f %5.f %5.f\n", $docllx, $doclly, $docurx, $docury;
    print PSOUT "%%Comment: Bounding box extracted by bboxx\n";
    print PSOUT "%%+:       A program by Dov Grobgeld 2003\n";
    
    while(<PSIN>) {
	next if $.==1 && /^%!/;
	next if /^%%BoundingBox: /;
	print PSOUT;
    }
}
