source: trunk/gsdl/bin/script/pdftohtml.pl@ 1997

Last change on this file since 1997 was 1997, checked in by dg5, 23 years ago

Modified gsConvert.pl and pdftohtml.pl to reflect moving of pdftohtml and wv
directories from packages/unix to packages

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.0 KB
Line 
1#!/usr/bin/perl -w
2
3
4###########################################################################
5#
6# pdftohtml.pl -- convert documents to HTML or TEXT format
7#
8# A component of the Greenstone digital library software
9# from the New Zealand Digital Library Project at the
10# University of Waikato, New Zealand.
11#
12# Copyright (C) 1999 New Zealand Digital Library Project
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30# pdftohtml.pl is a wrapper for running pdftohtml utility which converts
31# PDF documents to HTML, and converts images to PNG format for display in
32# the HTML pages generated
33
BEGIN {
    # The Greenstone perl modules are loaded from $GSDLHOME/perllib, so the
    # search path has to be extended at compile time, before any `use' of
    # them is processed.  Without GSDLHOME there is nothing sensible to do.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
38
39use parsargv;
40use util;
41use Cwd;
42use File::Basename;
43
# Write the usage summary to STDERR and terminate the script with exit
# status 1.  This sub never returns to its caller.
sub print_usage {
    my @banner = (
        "pdftohtml version 0.22\n",
        "Usage: pdftohtml [options] <PDF-file> [<html-file>]\n",
    );

    my @option_help = (
        " -f <int> : first page to convert\n",
        " -l <int> : last page to convert\n",
        " -d <dir> : target directory (default: basename of pdf-file)\n",
        " -o <file> : name of output file; - means stdout (default index.html)\n",
        " -q : don't print any messages or errors\n",
        " -h : print this usage information\n",
        " -p : exchange .pdf links by .html\n",
        " -c : generate complex HTML document\n",
        " -F : don't use frames in HTML document\n",
        " -i : ignore images\n",
        " -e <string> : set extension for images (in the Html-file) (default png)\n",
    );

    # A single print call with the full list, exactly as one message.
    print STDERR @banner, @option_help;
    exit(1);
}
62
# Read the image file names listed in $log_file, one per line.
# Returns the (possibly empty) list of names with line endings removed.
sub _read_image_list {
    my ($log_file) = @_;

    my @images;

    # A missing or unreadable log simply means there are no images to convert.
    open(my $log_fh, '<', $log_file) or return @images;
    while (my $line = <$log_fh>) {
        chomp($line);
        # Skip blank lines: an empty name would yield a bogus shell command.
        push(@images, $line) if length($line);
    }
    close($log_fh);

    return @images;
}

# Convert a single PPM image to PNG.  Returns 1 on success, 0 on failure.
sub _convert_image_to_png {
    my ($image, $is_windows) = @_;

    if ($is_windows) {
        my $cmd = util::filename_cat($ENV{'GSDLHOME'}, "bin", "windows", "pnmtopng.exe");
        $cmd .= " \"$image\"";
        # "!= 0" rather than "> 0": system() returns -1 when the program
        # could not be started at all.
        if (system($cmd) != 0) {
            print STDERR "Error executing $cmd\n";
            return 0;
        }
        return 1;
    }

    # Strip only the final extension of the file name itself, so that dots
    # in directory names (or earlier in the file name) are left untouched.
    (my $image_base = $image) =~ s/\.[^.\/\\]*$//;

    # Try pnmtopng first and fall back to ImageMagick's convert.
    my $cmd = "pnmtopng \"$image\" > \"$image_base.png\"";
    if (system($cmd) != 0) {
        $cmd = "convert \"$image\" \"$image_base.png\"";
        if (system($cmd) != 0) {
            print STDERR "Cannot convert $image into PNG format...\n";
            return 0;
        }
    }
    return 1;
}

# Entry point: run the pdftohtml binary on a PDF file and convert the images
# it extracted to PNG.
#
# Arguments: the command-line argument list.  After the options have been
# parsed off, exactly two arguments must remain: the input PDF file and the
# output file stem.  The HTML is written to "<stem>.html", the binary's
# stdout to "<stem>.out" and its stderr to "<stem>.err".
#
# Returns 1 on success and 0 on failure.  Usage errors and an unreadable
# input file terminate the script with exit status 1 instead of returning.
sub main {
    my (@args) = @_;
    my ($first,$last,$target_dir,$out_file,$img_ext,
        $optq,$opth,$optp,$optc,$optF,$opti);

    # Parse the options so that they are accepted and removed from the
    # argument list; none of them is forwarded to the pdftohtml binary,
    # whose command line is fixed below.
    if (!parsargv::parse(\@args,
                         'f/\d+/1', \$first,
                         'l/\d+/1', \$last,
                         'd/[\S]*/', \$target_dir,
                         'o/[\S]*/', \$out_file,
                         'e/[\S]*/', \$img_ext,
                         'q', \$optq,
                         'h', \$opth,
                         'p', \$optp,
                         'c', \$optc,
                         'F', \$optF,
                         'i', \$opti
                         ))
    {
        print_usage();
    }

    # Exactly an input file and an output file stem must remain.
    if (scalar(@args) != 2) {
        print_usage();
    }

    my ($input_filename, $output_filestem) = @args;

    # Make sure the input file exists and can be opened for reading.
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # GSDLOS may be unset; treat that as "not windows" without a warning.
    my $is_windows =
        (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows/i) ? 1 : 0;

    # Locate the pdftohtml binary.
    my ($p_home, $pdftohtml);
    if ($is_windows) {
        $p_home = util::filename_cat($ENV{'GSDLHOME'}, "bin", "windows");
        $pdftohtml = util::filename_cat($p_home, "pdftohtml.bin");
    }
    else {
        $p_home = util::filename_cat($ENV{'GSDLHOME'}, "packages", "pdftohtml");
        $pdftohtml = util::filename_cat($p_home, "bin", "pdftohtml.bin");
    }
    if (!-e $pdftohtml) {
        print STDERR "Error: unable to find $pdftohtml\n";
        return 0;
    }

    # Formulate the command.
    my $cmd = "$pdftohtml -noframes";
    $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
    $cmd .= " >\"$output_filestem.out\" 2>\"$output_filestem.err\"";

    # "!= 0" also catches the -1 that system() returns when the command
    # could not be launched.
    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        # Remove the (possibly partial) HTML output and the error log of
        # the failed run.
        for my $leftover ("$output_filestem.html", "$output_filestem.err") {
            util::rm($leftover) if (-e $leftover);
        }
        return 0;
    }

    # Need to convert images from PPM format to PNG format.  The images to
    # convert are listed in images.log in the current working directory.
    for my $image (_read_image_list("images.log")) {
        # A failed image conversion aborts the whole run.
        return 0 unless _convert_image_to_png($image, $is_windows);
        # The PPM original is no longer needed once the PNG exists.
        util::rm($image);
    }

    return 1;
}
166
# Run the conversion on the command-line arguments.  The value returned by
# main is not used here.
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.