source: trunk/gsdl/bin/script/gsConvert.pl@ 1452

Last change on this file since 1452 was 1445, checked in by paynter, 24 years ago

Replaced gs2html and gs2text with gsConvert.pl

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. These are usually found in the
31# $GSDLHOME/packages directory.
32#
33# Currently, we can convert Microsoft Word and Adobe PDF using specialised
34# conversion utilities. We can convert any file to text with a perl
35# implementation of the UNIX strings command.
36
37
BEGIN {
    # The Greenstone perl modules live under $GSDLHOME/perllib, so that
    # directory must be on the module search path before the "use"
    # statements below are compiled.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
42
43use parsargv;
44use util;
45use Cwd;
46use File::Basename;
47
48
sub print_usage
{
    # Write the command-line synopsis to STDERR and terminate with a
    # non-zero exit status.  This function never returns.
    my $usage = "Usage: $0 [-type doc|pdf] [-output html|text] filename\n";
    print STDERR $usage;
    exit(1);
}
54
55
sub main
{
    # Entry point: parse the command line, work out the input type, run
    # the matching converter and print its result ("html", "text" or
    # "fail") on STDOUT followed by a newline.
    #
    # Arguments: the command-line argument list.
    # Exits with status 1 on usage errors, unreadable input, or an
    # unknown/undeterminable input type.

    # Work on a private copy of the arguments rather than shadowing the
    # global @ARGV with a lexical of the same name.
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    # NOTE(review): the -type pattern only admits doc|pdf although "ps"
    # is handled below, so PostScript is reachable only through the
    # filename extension -- confirm whether that is intended.
    if (!parsargv::parse(\@args,
                         'type/(doc|pdf)/', \$input_type,
                         'output/(html|text)/', \$output_type,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Exactly one filename must remain after option parsing
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir into the file's directory below, so a relative path would
    # no longer point at the file afterwards.  Make it absolute first.
    require File::Spec;
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, '\..+');
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit type: fall back to the last filename extension,
    # lower-cased, so that "report.v2.PDF" is treated as "pdf".  A file
    # without any extension leaves the type undefined.
    if (!defined $input_type || $input_type eq "")
    {
        $input_type = ($suffix =~ /\.([^.]+)$/) ? lc($1) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc") {
        print convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print convertPS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}
120
# Run the converter on the command-line arguments.
main(@ARGV);
122
123
124
125# Document-type conversion functions
126#
127# The following functions attempt to convert documents from their
128# input type to the specified output type. If no output type was
129# given, then they first attempt HTML, and then TEXT.
130#
131# Each returns the output type ("html" or "text") or "fail" if no
132# conversion is possible.
133
# Convert a Microsoft Word document.
#
# Arguments: input filename, output file stem (output path without
#            extension), requested output type (may be empty/undefined).
# Returns:   "html" or "text" naming the format that was produced, or
#            "fail" if no conversion succeeded.

sub convertDOC {
    # Lexical copies: the arguments must not leak into package globals.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # HTML is attempted when no type was requested or HTML was asked for:
    # first the specialised converter, then the crude strings-based one.
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if doc_to_html($input_filename, $output_filestem);
        return "html" if any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
168
169
# Convert an Adobe PDF document.
#
# Arguments: directory of the input file, input filename, output file
#            stem, requested output type (may be empty/undefined).
# Returns:   "html" or "text" naming the format that was produced, or
#            "fail" if no conversion succeeded.

sub convertPDF {
    # Lexical copies: the arguments must not leak into package globals.
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if pdf_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
196
197
# Convert an Adobe PostScript document.
#
# Arguments: input filename, output file stem, requested output type
#            (may be empty/undefined).
# Returns:   "text" if the text conversion succeeded, otherwise "fail".
#            There is no HTML converter for PostScript, so an explicit
#            request for HTML always yields "fail".

sub convertPS {
    # Lexical copies: the arguments must not leak into package globals.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
216
217
218
219# Specific type-to-type conversions
220#
221# Each of the following functions attempts to convert a document from
222# a specific format to another. If they succeed they return 1 and leave
223# the output document(s) in the appropriate place; if they fail they
224# return 0 and delete any working files.
225
226
# Attempt to convert a word document to html with the wv program
#
# Arguments: input filename, output file stem.
# Returns:   1 if "<stem>.html" was produced and its first line contains
#            "DOCTYPE HTML"; 0 otherwise (including when the wvHtml
#            program is not installed).  The working files are removed
#            on failure, and the ".err" file is removed in every case.

sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $wv_home = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "wv-0.5.44-gs");
    my $wvHtml  = &util::filename_cat($wv_home, "bin", "wvHtml");
    return 0 unless (-e "$wvHtml");

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $cmd = "$wvHtml \"$input_filename\" >\"$html_file\" 2>\"$err_file\"";

    # execute the command; system() yields -1 when the command could not
    # be started at all, so test for any non-zero result.  A failure here
    # is not fatal: the output file is inspected below either way.
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!. Continuing...\n";
    }

    # Was the conversion successful?  The first line of a good result
    # carries the DOCTYPE declaration; an empty or unreadable file
    # counts as a failure.
    my $converted = 0;
    if (-e $html_file) {
        my $first_line;
        if (open(my $tmp, '<', $html_file)) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if (defined $first_line && $first_line =~ /DOCTYPE HTML/) {
            $converted = 1;
        }
    }

    # Tidy up: drop the broken output on failure, and never leave the
    # error log behind.
    &util::rm($html_file) if (!$converted && -e $html_file);
    &util::rm($err_file)  if (-e $err_file);

    return $converted;
}
262
263
# Convert a pdf file to html with the pdftohtml command
#
# Arguments: directory of the input file, input filename, output file
#            stem.
# Returns:   1 if the pdftohtml command ran successfully, 0 otherwise
#            (the ".html" and ".out" working files are then removed).

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # NOTE(review): $dirname is passed unquoted, as before; a directory
    # name containing spaces will break this command line.
    my $cmd = "pdftohtml -F -d $dirname -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " > \"$output_filestem.out\"";

    # system() yields -1 when the command could not be started at all,
    # so test for any non-zero result.
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        return 0;
    }

    # The captured standard output is of no further use
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
283
284
# Convert a PDF file to text with the pdftotext command
#
# Arguments: input filename, output file stem.  A leading directory
#            argument (dirname, input filename, output file stem) is
#            also accepted and ignored, so both calling conventions work.
# Returns:   1 if the pdftotext command ran successfully, 0 otherwise
#            (the ".text" and ".err" working files are then removed).

sub pdf_to_text {
    # Always take the LAST two arguments: the caller in this file passes
    # (input, stem), which the previous three-way unpacking shifted by
    # one and left the output stem undefined.
    my ($input_filename, $output_filestem) = @_[-2, -1];
    return 0 unless (defined $input_filename && defined $output_filestem);

    # pdftotext writes to the file named by its second argument; without
    # one it creates "<input>.txt" and prints nothing on stdout, so the
    # output file must be passed explicitly rather than redirected.
    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
    $cmd .= " 2> \"$output_filestem.err\"";

    # system() yields -1 when the command could not be started at all,
    # so test for any non-zero result.
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
304
305
# Convert a PostScript document to text with ps2ascii
#
# Arguments: input filename, output file stem.
# Returns:   1 if the ps2ascii command ran successfully, 0 otherwise
#            (the ".text" and ".err" working files are then removed).

sub ps_to_text {
    # Lexical copies: the arguments must not leak into package globals.
    my ($input_filename, $output_filestem) = @_;

    my $cmd = "ps2ascii \"$input_filename\" > \"$output_filestem.text\"";
    $cmd .= " 2> \"$output_filestem.err\"";

    # system() yields -1 when the command could not be started at all,
    # so test for any non-zero result.
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
325
326
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments: input filename, output file stem.
# Returns:   1 if "<stem>.html" was written, 0 otherwise.  The
#            intermediate "<stem>.text" file is removed in either case.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text, $html);
    if (!open($text, '<', $text_file)) {
        print STDERR "Error: unable to read $text_file: $!\n";
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }
    if (!open($html, '>', $html_file)) {
        print STDERR "Error: unable to write $html_file: $!\n";
        close($text);
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    # Double-quoted so that the trailing "\n\n" becomes real newlines
    # instead of a literal backslash-n in the page.
    print $html "<html><head>\n",
                "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n",
                "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any-to-html\">\n",
                "</head><body>\n\n";

    while (my $line = <$text>) {
        # The extracted strings are arbitrary text: escape the characters
        # that would otherwise be read as markup.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        print $html "<p> ", $line;
    }
    print $html "\n</body></html>\n";

    close($text);
    my $written = close($html) ? 1 : 0;
    if (!$written) {
        print STDERR "Error: unable to finish writing $html_file: $!\n";
        &util::rm($html_file) if (-e $html_file);
    }

    &util::rm($text_file) if (-e $text_file);
    return $written;
}
353
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
#
# Arguments: input filename, output file stem.
# Returns:   1 once "<stem>.text" has been completely written and
#            closed; 0 if the input cannot be read or the output cannot
#            be written.

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    my ($in, $out);
    if (!open($in, '<', $input_filename)) {
        print STDERR "Error: unable to read $input_filename: $!\n";
        return 0;
    }
    # The input may be arbitrary binary data
    binmode($in);

    if (!open($out, '>', "$output_filestem.text")) {
        print STDERR "Error: unable to write $output_filestem.text: $!\n";
        close($in);
        return 0;
    }

    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        while ($line =~ /^[^\n]{1,9}$/m) {
            $line =~ s/^[^\n]{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n]/) {
            print $out $line;
        }
    }

    close($in);

    # Close explicitly so the data is flushed to disk before a caller
    # reopens the file for reading; a failed close means a failed write.
    if (!close($out)) {
        print STDERR "Error: unable to finish writing $output_filestem.text: $!\n";
        return 0;
    }
    return 1;
}
387
388
389
Note: See TracBrowser for help on using the repository browser.