#!/usr/bin/perl --
#
# parse pdf files for htdig
#
# - generate anchor tags
# - do site-specific rewriting of the URL into a title
#   for missing or bad titles
# - I suppose it is faster than parse_doc.pl
#
# based on:
#	- htdig documentation
#	- parse_doc.pl
#   - parsepdf.pl (Stefan Nehlsen/sn@parlanet.de)
#
# 2005 Juergen Plate  plate@ee.fhm.edu

use strict;
use bytes;

#
# set this to your PDF to text converter or adjust path
#
# Candidate converters in order of preference; the first executable one wins.
my @parser_candidates = (
	"/home/www/bin/pdftotext",  # From "xpdf" (http://www.foolabs.com/xpdf/)
	"/home/www/bin/pstotext",   # From "pstotext"
	"/home/www/bin/ps2ascii",   # From ghostscript
);

# Start with the last candidate so that, when none is executable, the
# executable check further down reports that path.
my $parser = $parser_candidates[-1];
foreach my $candidate (@parser_candidates)
  {
  next unless -x $candidate;
  $parser = $candidate;
  last;
  }
# NOTE(review): the converter is later invoked as "<parser> -raw -q <file> -";
# confirm that the pstotext/ps2ascii fallbacks accept that command line.

# Working state shared by the processing steps below.
my ($text, $title, $header, $word) = ('', '', '', '');
my @words = ();   # word list built from the converted text
my $n     = 0;    # number of words printed so far
my $page  = 2;    # number used for the next page anchor
my $calc  = 0;    # relative position (0-1000) of the current word

# Set LANG in the environment before the converter is started.
$ENV{'LANG'} = "en_US.ISO8859-1";

# Command line arguments, in order: input file, content type, URL.
my $infile       = $ARGV[0];
my $content_type = $ARGV[1];
my $url          = $ARGV[2];

# paranoid: the converter must be executable and the input must be a regular
# file whose first line carries a PDF header, before any output is produced
die "parser \"$parser\" not executable!\n" unless -x $parser;
die "\"$infile\" not readable\n" unless -f $infile;

# Three-argument open with a lexical handle: the file name is never
# interpreted as an open mode or as a command.
open(my $pdf_fh, '<', $infile) or die "opening $infile failed\n";
$text = <$pdf_fh>; # read first line
close $pdf_fh;
die "\"$infile\" is not a PDF-File!\n"
	unless defined($text) and $text =~ /^%PDF-\d\.\d/;
# everything seems to be ok

# read text from pdftotext
# The converter is started with fork/exec and an argument list, so no shell
# ever parses the file name.  stderr is discarded in the child, which is what
# the shell redirection "2>/dev/null" used to do.
my $parser_pid = open(my $parser_fh, '-|');
die "error opening pdf \"$infile\"\n" unless defined $parser_pid;
if ($parser_pid == 0)
  {
  # child: stdout is already connected to the pipe read by the parent
  open(STDERR, '>', '/dev/null');
  exec { $parser } $parser, '-raw', '-q', $infile, '-';
  exit 127; # only reached when exec itself failed
  }
# read whole output; $/ is changed only inside the do block
$text = do { local $/; <$parser_fh> };
close $parser_fh;
$text = '' unless defined $text; # converter produced nothing

# the point of no return: output starts here
# Title record: the URL itself, or "PDF Dokument <file name>" when the URL
# ends in a *.pdf file name; &, < and > are written as HTML entities.
$title = $url;
$title =~ s|^.*/(.*?\.pdf$)|PDF Dokument $1|i;
for ($title)
  {
  s/&/\&amp\;/g;
  s/</\&lt\;/g;
  s/>/\&gt\;/g;
  }
print "t\t$title\n";

# header = first non-blank line of the converted text, at most 512 characters
# Leading whitespace (including form feeds) is skipped first; otherwise text
# that starts with an empty line would produce no header at all.
($header) = defined($text) ? $text =~ /\A\s*(\S[^\r\n\f]{0,511})/ : ();
if (defined($header) and length($header))
  {
  # escape the characters that are special in HTML
  $header =~ s/&/\&amp\;/g; $header =~ s/</\&lt\;/g; $header =~ s/>/\&gt\;/g;
  print "h\t", $header, "\n";
  }

# Normalise whitespace but keep form feeds.  The loop below treats "\f" as a
# page break, and \s matches form feed as well, so collapsing \s would turn
# every page break into a blank and no anchor could ever be printed.
# [^\S\f] means "whitespace except form feed".
$text =~ s/\s+\z//;       # trailing blanks and page breaks: nothing follows them
$text =~ s/\A[^\S\f]+//;  # delete leading blanks
$text =~ s/[^\S\f]+/ /g;  # replace line breaks, delete subsequent blanks
# dehyphen (a word hyphenated across a page break is left as two parts so
# that the page marker between them survives)
$text =~ s/([A-Za-z\x80-\xfe])-[^\S\f]*([A-Za-z\x80-\xfe])/$1$2/g;

# split into page markers and words (only 3 characters or more); matching the
# tokens directly keeps each \f separate from the words next to it
@words = grep { $_ eq "\f" or length($_) >= 3 }
	$text =~ /\f|[A-Za-z\x80-\xff]+/g;

# relative positions are measured against the number of real words only,
# page markers do not count
my $word_total = grep { $_ ne "\f" } @words;

# print wordlist
foreach $word (@words)
  {
  if ($word eq "\f")
    { printf "a\tpage=%d\n", $page++; }
  else
    {
    # $word_total is at least 1 here because this branch handles a real word
    $calc = int(1000*$n/$word_total); # calculate rel. position (0-1000)
    printf ("w\t%s\t%d\t0\n", $word, $calc);
    $n++;
    }
  }

