source: trunk/gsdl/bin/script/pdftohtml.pl@ 1960

Last change on this file since 1960 was 1960, checked in by dg5, 23 years ago

Modified pdftohtml.pl to reflect the change in location of pdftohtml.bin file

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.0 KB
Line 
1#!/usr/bin/perl -w
2
3
4###########################################################################
5#
6# pdftohtml.pl -- convert documents to HTML or TEXT format
7#
8# A component of the Greenstone digital library software
9# from the New Zealand Digital Library Project at the
10# University of Waikato, New Zealand.
11#
12# Copyright (C) 1999 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30# pdftohtml.pl is a wrapper for running the pdftohtml utility, which converts
31# PDF documents to HTML, and for converting the extracted images to PNG format
32# for display in the generated HTML pages
33
BEGIN {
    # The Greenstone perl modules are loaded from $GSDLHOME/perllib, so the
    # script cannot run at all without GSDLHOME in the environment.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }

    # Put the Greenstone library directory at the front of the search path.
    my $gsdl_perllib = "$ENV{'GSDLHOME'}/perllib";
    unshift(@INC, $gsdl_perllib);
}
38
39use parsargv;
40use util;
41use Cwd;
42use File::Basename;
43
# print_usage() -- write the pdftohtml usage summary to STDERR.
# Takes no arguments; the return value is that of print.
sub print_usage {
    # One entry per output line; the trailing newlines are added below.
    my @usage_lines = (
        "pdftohtml version 0.22",
        "Usage: pdftohtml [options] <PDF-file> [<html-file>]",
        " -f <int> : first page to convert",
        " -l <int> : last page to convert",
        " -d <dir> : target directory (default: basename of pdf-file)",
        " -o <file> : name of output file; - means stdout (default index.html)",
        " -q : don't print any messages or errors",
        " -h : print this usage information",
        " -p : exchange .pdf links by .html",
        " -c : generate complex HTML document",
        " -F : don't use frames in HTML document",
        " -i : ignore images",
        " -e <string> : set extension for images (in the Html-file) (default png)",
    );

    print STDERR (join("\n", @usage_lines) . "\n");
}
61
# main(@args) -- run the pdftohtml binary on a PDF file, then convert the
# images it extracted into PNG format.
#
# Arguments: the command line, i.e. optional pdftohtml-style flags followed
# by exactly two positional arguments:
#   <input PDF file> <output file stem>
# The HTML is written to "<stem>.html"; the binary's stdout and stderr are
# captured in "<stem>.out" and "<stem>.err".
#
# Returns 1 on success and 0 when the binary is missing, the conversion
# command fails, or an image cannot be converted.  Exits with status 1 on a
# usage error or when the input file is unreadable, and with status 0 after
# printing the usage text for -h.
sub main {
    my @args = @_;
    my ($first, $last, $target_dir, $out_file, $img_ext,
        $optq, $opth, $optp, $optc, $optF, $opti);

    # The pdftohtml options are accepted so that callers may pass them, but
    # apart from -h none of them is forwarded: the command built below is
    # fixed.  Edit the command in this script to make use of them.
    # NOTE(review): the code relies on parsargv::parse() removing the
    # options it recognises from @args, leaving only the positional
    # arguments -- confirm against parsargv.pm.
    if (!parsargv::parse(\@args,
                         'f/\d+/1', \$first,
                         'l/\d+/1', \$last,
                         'd/[\S]*/', \$target_dir,
                         'o/[\S]*/', \$out_file,
                         'e/[\S]*/', \$img_ext,
                         'q', \$optq,
                         'h', \$opth,
                         'p', \$optp,
                         'c', \$optc,
                         'F', \$optF,
                         'i', \$opti
        ))
    {
        print_usage();
        exit(1);
    }

    # -h is advertised in the usage text, so honour it.
    if ($opth) {
        print_usage();
        exit(0);
    }

    # Exactly two positional arguments are required; carrying on without
    # them would only run the checks below on undefined filenames.
    if (scalar(@args) != 2) {
        print_usage();
        exit(1);
    }

    my ($input_filename, $output_filestem) = @args;

    # Make sure the input file exists and can be opened for reading
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # GSDLOS may be unset; treat that the same as "not windows".
    my $is_windows = (defined $ENV{'GSDLOS'}
                      && $ENV{'GSDLOS'} =~ /^windows/i) ? 1 : 0;

    # Locate the pdftohtml binary for this platform.
    my ($p_home, $pdftohtml);
    if ($is_windows) {
        $p_home = util::filename_cat($ENV{'GSDLHOME'}, "bin", "windows");
        $pdftohtml = util::filename_cat($p_home, "pdftohtml.bin");
    }
    else {
        $p_home = util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "pdftohtml");
        $pdftohtml = util::filename_cat($p_home, "bin", "pdftohtml.bin");
    }
    if (!-e $pdftohtml) {
        print STDERR "Error: pdftohtml binary not found at $pdftohtml\n";
        return 0;
    }

    # Formulate the command.
    # NOTE(review): the filenames are interpolated into a shell command
    # line; a name containing a double quote, backtick or '$' is not safe.
    my $cmd = "$pdftohtml -noframes";
    $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
    $cmd .= " >\"$output_filestem.out\" 2>\"$output_filestem.err\"";

    # system() returns -1 when the command could not be started at all, so
    # test for any non-zero status rather than "> 0".
    my $status = system($cmd);
    if ($status != 0) {
        my $reason;
        if ($status == -1) {
            $reason = $!;    # only meaningful when the command never ran
        }
        elsif ($status & 127) {
            $reason = "killed by signal " . ($status & 127);
        }
        else {
            $reason = "exit status " . ($status >> 8);
        }
        print STDERR "Error executing $cmd: $reason\n";
        # NOTE(review): nothing in this script creates "<stem>.text"; the
        # removal is kept as it was -- confirm which files the caller
        # expects to be cleaned up on failure.
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    # Need to convert images from PPM format to PNG format.  The image
    # names are read from "images.log" in the current working directory;
    # if that file cannot be opened there is nothing to convert.
    my @images;
    if (open(my $images_log, '<', "images.log")) {
        while (my $line = <$images_log>) {
            chomp($line);
            next unless $line =~ /\S/;    # ignore blank lines
            push(@images, $line);
        }
        close($images_log);
    }

    for my $image (@images) {
        if ($is_windows) {
            my $convert_cmd = util::filename_cat($ENV{'GSDLHOME'}, "bin", "windows", "pnmtopng.exe");
            $convert_cmd .= " \"$image\"";
            if (system($convert_cmd) != 0) {
                print STDERR "Error executing $convert_cmd\n";
                return 0; # not sure about whether to leave this one in or take it out
            }
        }
        else {
            # Strip only the final extension, so that dots earlier in the
            # name or path (e.g. "./x.ppm", "my.doc-1.ppm") are preserved.
            (my $image_base = $image) =~ s/\.[^.\/]*$//;

            # Try pnmtopng first and fall back to ImageMagick's convert.
            if (system("pnmtopng \"$image\" > \"$image_base.png\"") != 0) {
                if (system("convert \"$image\" \"$image_base.png\"") != 0) {
                    print STDERR "Cannot convert $image into PNG format...\n";
                    return 0; # not sure about whether to leave this one in or take it out
                }
            }
        }
        # The original image is no longer needed once it has been converted.
        util::rm($image);
    }

    return 1;
}
165
# Entry point: hand the command-line arguments to main().  The value that
# main() returns is discarded here.
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.