source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 24371

Last change on this file since 24371 was 24371, checked in by ak19, 13 years ago

Ticket 779: the new wvware.pl script sets the environment for what wvware needs, by setting the LD_LIB_PATH to gnome-lib-minimal in the extension folder, if this exists. wvware.pl is called by gsConvert to run wvware (also checked with the replace src doc with html menu option on rightclick) and the perl script can be launched from the command prompt to do the conversion as well.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 34.9 KB
RevLine 
[1445]1#!/usr/bin/perl -w
2
3###########################################################################
4#
[2032]5# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
[3013]11# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]33#
[3013]34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]38#
[3013]39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
[2032]42# We try to convert Postscript files to text using "gs" which is often on
[2755]43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
[1445]45
# Abort unless GSDLHOME is set, then put $GSDLHOME/perllib at the front of
# the module search path before any of the "use" lines below are compiled.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
[22429]51use strict;
52
[1445]53use parsargv;
54use util;
55use Cwd;
56
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded the eval yields undef, which is
# normalised to 0 below.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to parsargv::parse.
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
[3350]68
# Print the command-line help to STDERR and terminate with exit status 1.
# Never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # the closing parenthesis was missing from this message
        "\t\t-pdf_complex is set)\n",
    );

    print STDERR @usage_lines;
    exit(1);
}
90
# File-scoped settings filled in by main() from -errlog and -timeout.
# The conversion subs only append to the log when $faillogfile is not ""
# and only prepend "ulimit -t" to commands when $timeout is non-zero.
my ($faillogfile, $timeout) = ("", 0);
[1445]93
# Entry point: parse the command line, work out the input type (from -type
# or, failing that, from the file extension), change into the input file's
# directory, run the matching convertXXX sub and print its result string
# followed by a newline on STDOUT.
#
# Arguments: the command-line argument list.
# Exits with status 1 on bad usage, unreadable input or unsupported type.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    my $default_type_re  = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;
    foreach my $arg (@args) {
        # The option arrives with its leading dash(es); the old pattern
        # (^windows_scripting$) could therefore never match.
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one non-option argument (the input file) must remain.
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # "auto" is advertised in the usage text but none of the converters
    # recognise it; treat it like an unspecified output type so that they
    # try HTML first and then TEXT.
    if (defined $output_type && $output_type =~ m/^auto$/i) {
        $output_type = "";
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = util::filename_cat($dirname, "$tailname");

    # No -type given: fall back to the lower-cased extension without its dot.
    if (!defined $input_type || $input_type eq "") {
        $input_type = (defined $suffix && length($suffix) > 1)
            ? lc(substr($suffix, 1))
            : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type || $input_type eq "") {
        # Previously unreachable: $input_type was "" rather than undef here.
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome string returned by the converter.
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
197
# Run the converter on the script's command-line arguments.
main(@ARGV);
199
200
201
[2241]202# Document-type conversion functions
[1445]203#
204# The following functions attempt to convert documents from their
205# input type to the specified output type. If no output type was
206# given, then they first attempt HTML, and then TEXT.
207#
208# Each returns the output type ("html" or "text") or "fail" if no
209# conversion is possible.
210
211# Convert a Microsoft word document
212
# Dispatch a file with a Word-style extension to the right converter.
# Many .doc files are not in fact Word documents, so the real type is
# determined by find_docfile_type() first.
# Returns whatever the chosen converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected_type = find_docfile_type($input_filename);

    # Genuine Word 6/7/8 documents and docx files
    my %is_word_type = map { $_ => 1 } qw(word6 word7 word8 docx);
    if ($is_word_type{$detected_type}) {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }

    # RTF content hiding behind a .doc extension
    if ($detected_type eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }

    # Anything else gets the generic treatment
    return convertAnything($input_filename, $output_filestem, $output_type);
}
228
229# Convert a Microsoft word 6/7/8 document
230
# Convert a Word 6/7/8 (or docx) document.
# When HTML is acceptable (no output type given, or one containing "html"),
# try native_doc_to_html if -windows_scripting is set, doc_to_html
# otherwise.  Returns "html" on success; in every other case returns the
# result of convertAnything().
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
248
249
250# Convert a Rich Text Format (RTF) file
251
# Convert a Rich Text Format file.
# Only a specialised HTML conversion is attempted (native_doc_to_html when
# -windows_scripting is set, rtf_to_html otherwise).
# Returns "html" on success and "fail" otherwise.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # RTF is so ugly that it's not worth running strings over it, so there
    # is deliberately no fallback to convertAnything() here.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    return "fail";
}
276
277
278# Convert an unidentified file
279
# Convert an unidentified file with the generic converters.
# Tries any_to_html when HTML is acceptable, then any_to_text when text is
# acceptable; an unspecified output type accepts both.
# Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $type_unspecified = !$output_type;

    # Attempt simple conversion to HTML
    if ($type_unspecified || ($output_type =~ m/html/i)) {
        return "html" if any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if ($type_unspecified || ($output_type =~ m/text/i)) {
        return "text" if any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
302
303
[1654]304
[1445]305# Convert an Adobe PDF document
306
# Convert an Adobe PDF document.
# Depending on the requested output type this tries, in order, image
# conversion (pdfps_to_img), HTML (pdf_to_html) and text (pdf_to_text).
# Returns "item" for images, "html", "text", or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # If the requested type contains a "-", keep only what follows the last one.
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    my $type_unspecified = !$output_type;

    # Attempt conversion to HTML
    if ($type_unspecified || ($output_type =~ m/html/i)) {
        return "html"
            if pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT
    if ($type_unspecified || ($output_type =~ m/text/i)) {
        return "text"
            if pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
339
340
341# Convert an Adobe PostScript document
342
# Convert an Adobe PostScript document.
# Tries image conversion (pdfps_to_img) when an image type is requested,
# then text (ps_to_text) when text is acceptable.
# Returns "item" for images, "text", or "fail".
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # If the requested type contains a "-", keep only what follows the last one.
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
365
366
# Convert a PowerPoint document.
#  - With -windows_scripting and an output type that is neither html nor
#    text, run the pptextract tool (with -g/-j/-p chosen from the output
#    type) and return "item".  An already existing output directory is
#    treated as a finished conversion.
#  - Otherwise, when HTML is acceptable, run ppttohtml.pl and return "html".
#  - If neither produced a result, fall back to any_to_text ("text"),
#    and finally "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $wants_images = ($windows_scripting
                        && ($output_type !~ m/html/i)
                        && ($output_type !~ m/text/i));

    if ($wants_images) {
        # Image format switch handed to pptextract ("" when none matches)
        my $ppt_convert_type = ($output_type =~ m/gif/i)  ? "-g"
                             : ($output_type =~ m/jp?g/i) ? "-j"
                             : ($output_type =~ m/png/i)  ? "-p"
                             :                              "";

        my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                          $ENV{'GSDLOS'}, "pptextract");
        # On windows the bare name is used instead of the full path
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "pptextract";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
430
431
# Convert an Excel spreadsheet.
# When HTML is acceptable, run xlstohtml.pl and return "html" on success.
# Otherwise (or if that fails) fall back to any_to_text and return "text",
# or "fail" if that does not work either.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
465
466
467
[1654]468# Find the real type of a .doc file
469#
[2012]470# We seem to have a lot of files with a .doc extension that are .rtf
[1654]471# files or Word 5 files. This function attempts to tell the difference.
# Work out what a file with a Word-style extension really contains.
#
# Returns one of:
#   "docx"    - -windows_scripting is set and the name ends in ".docx"
#   "rtf"     - the first line starts with "{\rtf"
#   "wordN"   - some line contains "Word.Document.N", N being 6, 7 or 8
#   "unknown" - none of the above, or the file could not be opened
#
# The file is read in binary mode and the handle is always closed before
# returning.
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, "<", $input_filename)) {
        print STDERR "Warning: unable to open $input_filename to check its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $doc_type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $doc_type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $doc_type = "word$1";
            last;
        }
    }

    close($chk);
    return $doc_type;
}
508
509
[1734]510# Specific type-to-type conversions
[1445]511#
512# Each of the following functions attempts to convert a document from
[2755]513# a specific format to another. If they succeed they return 1 and leave
[1445]514# the output document(s) in the appropriate place; if they fail they
515# return 0 and delete any working files.
516
517
518# Attempt to convert a word document to html with the wv program
# Attempt to convert a word document to html by launching wvware.pl with
# the input file, output stem, fail-log path and timeout as arguments.
#
# Returns the exit code of wvware.pl (callers treat a true value as
# success), or 0 if the command could not be launched or was killed by a
# signal.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # The path to perl and every file name are quoted in case they contain
    # spaces.  Quoting the fail-log path also keeps $timeout in the fourth
    # position when no fail log was requested ($faillogfile is then "").
    my $perl_exec  = util::get_perl_exec();
    my $launch_cmd = "\"$perl_exec\" -S wvware.pl"
        . " \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $timeout";

#    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    # -1 means the command could not be started; a non-zero low 7 bits
    # means the child died from a signal.  Neither is a conversion.
    if ($raw_status == -1 || ($raw_status & 127)) {
        return 0;
    }

    # The exit code lives in the high byte of the wait status.
    return $raw_status >> 8;
}
532
[10282]533# Attempt to convert a word document to html with the word2html scripting program
# Attempt to convert a word document to html with a scripting tool:
# docx2html.vbs (run through CScript) for docx input on windows, the
# word2html program otherwise.
#
# Returns 1 if "$output_filestem.html" already exists or was produced and
# its first line mentions "html"; returns 0 otherwise, after copying any
# converter error output to the fail log (when one is configured).
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the
        # local Word install (if any) to do the conversion, since docX can't
        # be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = util::filename_cat($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # the //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions: use the usual VB executable word2html.
            # Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"" . util::filename_cat($vbScript, "word2html") . "\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";

        # Echo the converter's error output, and copy it to the fail log.
        if (-s "$output_filestem.err" && open(my $errfile, "<", "$output_filestem.err")) {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($write_to_fail_log);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }

            close($faillog) if ($write_to_fail_log);
            # this handle used to be left open
            close($errfile);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html, "<", "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/html/i) {
            util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        util::rm("$output_filestem.err");
    }
    return 0;
}
633
[1654]634# Attempt to convert an RTF document to html with rtftohtml
# Attempt to convert an RTF document to html with rtftohtml.
#
# The conversion counts as successful when "$output_filestem.html" has at
# least one line containing a word character after the <body> line.  On
# success a "${output_filestem}_ToC.html" file, if present, is merged into
# the HTML after the header.  Returns 1 on success; on failure removes
# the .html and .err files (appending the latter to the fail log when one
# is configured) and returns 0.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header; an unreadable
        # file simply counts as "no content"
        if (open(my $html_in, "<", "$output_filestem.html")) {
            my $past_header = 0;
            while (my $line = <$html_in>) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close($html_in);
        }
    }

    if ($was_successful) {
        util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            util::mv("$output_filestem.html", "$output_filestem.src");

            # Each open is checked on its own.  The old form
            # (open H, "file" || ++$open_failed) bound "||" to the file
            # name, so failures were never noticed.
            my $src_ok = open(my $htmlsrc, "<", "$output_filestem.src");
            my $toc_ok = open(my $toc, "<", "${output_filestem}_ToC.html");
            my $out_ok = open(my $html_out, ">", "$output_filestem.html");

            if (!($src_ok && $toc_ok && $out_ok)) {
                # give up on the ToC and put the converted HTML back
                close($htmlsrc) if $src_ok;
                close($toc) if $toc_ok;
                close($html_out) if $out_ok;
                util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to (but
            # not including) the first line without a word character
            while (defined(my $line = <$htmlsrc>)) {
                last unless ($line =~ m/\w/);
                print {$html_out} $line;
            }

            # print out table of contents, making links relative
            scalar(<$toc>); scalar(<$toc>); # ignore first 2 lines
            my $list_start = <$toc>;        # line 3 = "<ol>\n"
            print {$html_out} $list_start if defined $list_start;
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close($htmlsrc);
            close($html_out);

            util::rm("${output_filestem}_ToC.html");
            util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        util::rm("$output_filestem.err");
    }

    util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
750
751
[1445]752# Convert a pdf file to html with the pdftohtml command
753
# Convert a pdf file to html by running pdftohtml.pl.
# The -pdf_* command-line switches are translated into pdftohtml.pl flags.
# Returns 1 when the command exits with 0 and a non-empty
# "$output_filestem.html" exists; otherwise reports the converter's error
# output, cleans up the .out/.html/.err files and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    my $full_perl_path = util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    if ($pdf_complex)           { $cmd .= " -c"; }
    if ($pdf_ignore_images)     { $cmd .= " -i"; }
    if ($pdf_allow_images_only) { $cmd .= " -a"; }
    if (!$pdf_nohidden)         { $cmd .= " -hidden"; }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # STDOUT and STDERR go to separate files where that is supported
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # The converter made something: tidy up and report success
    if ($retval == 0 && -s $html_file) {
        util::rm($err_file) if (-e $err_file);
        util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # Failure from here on
    util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (ERRLOG, "$output_filestem.err") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $err_line = <ERRLOG>) {
            print STDERR "$err_line";
        }
        close ERRLOG;
    }
    print STDERR "***********output filestem $output_filestem.html\n";
    util::rm($html_file) if (-e $html_file);

    # copy the error output to the fail log, then discard it
    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open (ERRLOG, "$output_filestem.err");
            while (my $err_line = <ERRLOG>) {
                print FAILLOG $err_line;
            }
            close ERRLOG;
            close FAILLOG;
        }
        util::rm($err_file);
    }
    return 0;
}
815
816# Convert a pdf file to various types of image with the convert command
817
# Convert a PDF or PostScript file to images by running pdfpstoimg.pl.
#
# $output_type is reduced to what follows its last "_" (if any) and passed
# as -convert_to.  Returns 1 when the command exits with 0; otherwise
# reports the converter's error output, copies it to the fail log (when one
# is configured), removes the .out/.err files and returns 0.  Also returns
# 0 straight away when ImageMagick's "identify" cannot be run.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path
    # (except for Windows 95/98).  Uses the same platform test as the rest
    # of this file instead of calling Win32::IsWinNT() directly.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        my $identify_output = `identify 2>&1`;
        # "program not found" shows up as -1 (could not be started), exit
        # code 1 on Windows (256) or the shell's exit code 127.
        if ($? == -1 || $? == 256 || ($? >> 8) == 127) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    }
    else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        # name the script that was actually run (message used to say pdftoimg.pl)
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        util::rm("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, "<", "$output_filestem.err") || die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }

        # copy the error output to the fail log, then discard it
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
                if (open(my $errlog, "<", "$output_filestem.err")) {
                    while (my $line = <$errlog>) {
                        print {$faillog} $line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            util::rm("$output_filestem.err");
        }
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
885
886# Convert a PDF file to text with the pdftotext command
887
# Convert a PDF file to text with the pdftotext command.
# Returns 1 when a non-empty "$output_filestem.text" containing at least
# one word character was produced; otherwise reports the converter's error
# output, cleans up the .out/.text/.err files and returns 0.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $cmd .= " > \"$output_filestem.err\"";
    }
    else {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        open (EXTR_TEXT, "$output_filestem.text") || warn "open: $!";
        binmode(EXTR_TEXT); # just in case...
        my $seen_text = 0;
        while (!$seen_text) {
            my $line = <EXTR_TEXT>;
            last unless $line;
            $seen_text = 1 if ($line =~ m/\w/);
        }
        close EXTR_TEXT;
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            util::rm($text_file);
        }
    }

    # the converter made something: tidy up and report success
    if (-s $text_file) {
        util::rm($err_file) if (-e $err_file);
        return 1;
    }

    # print out the converters std err, if any
    if (-s $err_file) {
        open (ERRLOG, "$output_filestem.err") || die "$!";
        print STDERR "pdftotext error log:\n";
        while (my $err_line = <ERRLOG>) {
            print STDERR "$err_line";
        }
        close ERRLOG;
    }

    # does this converter create a .out file?
    util::rm($out_file) if (-e $out_file);
    util::rm($text_file) if (-e $text_file);

    # copy the error output to the fail log, then discard it
    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open (ERRLOG, "$output_filestem.err");
            while (my $err_line = <ERRLOG>) {
                print FAILLOG $err_line;
            }
            close ERRLOG;
            close FAILLOG;
        }
        util::rm($err_file);
    }
    return 0;
}
951
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file to convert
#   $output_filestem - output path without extension; the text is written
#                      to "$output_filestem.text"
#
# On non-Windows systems gs is tried first.  If gs is unavailable, fails, or
# reports an interpreter error (and always on Windows), the text is stripped
# from the PostScript source with a series of regular expressions instead.
# The resulting text is then re-wrapped at the first whitespace after 72
# characters.
#
# Returns 1 on success, 0 if the fallback could not read the input, could
# not write the output, or the input does not start with "%!".  Dies if the
# work files for the final wrapping pass cannot be opened.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # Quote the redirect target like every other path in the command so
        # a file stem containing spaces does not break the shell line.
        $cmd .= " 2> \"$output_filestem.err\"";
        $!=0;

        system($cmd);
        # Test the whole wait status, not just ($? >> 8): the low bits carry
        # the signal number, so a gs killed by the ulimit would otherwise
        # look like exit code 0.  -1 means the program could not be started;
        # look at $! for the message.
        my $wait_status = $?;

        if ($wait_status != 0) {
            if ($!) {$error=$!;} else {$error="couldn't run.\n";}
        }
        elsif (! -e "$output_filestem.text") {
            $error="did not create output file.\n";
        }
        else
        {   # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', "$output_filestem.text")) {
                my $first_line = <$psout>;
                # An empty output file yields undef here; don't match on it.
                if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                    $error="interpreter error - \"$1\"";
                }
                close $psout;
            }
        }
    }

    if ($error ne "")
    {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print $faillog "gs - $error\n";
            if (-e "$output_filestem.err") {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (defined(my $err_line = <$errlog>)) {
                        print $faillog $err_line;
                    }
                    close $errlog;
                }
            }
            close $faillog;
        }
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        # Read and validate the input BEFORE creating the output file, so a
        # failure here leaves neither an open handle nor an empty .text
        # file behind.
        my $in;
        unless (open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        my $text = join('', <$in>); # this is for whole .ps file...
                                    # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        my $out;
        unless (open($out, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        # NOTE(review): the replacement below is the control character
        # octal 023, which does not look like an e-acute - confirm intent.
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print $out "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length=72;
    &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
    # "or", not "||": with "||" the die was attached to the filename string
    # and could never fire, so a failed open went unnoticed.
    open(my $infile, '<', "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(my $outfile, '>', "$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$infile>)) {
        while (length($line)>0) {
            # Only emit $1 when the substitution really matched; otherwise
            # flush the remainder so the loop is guaranteed to terminate.
            if (length($line)>$wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $outfile "$1\n";
            } else {
                print $outfile "$line";
                $line="";
            }
        }
    }
    close $infile;
    close $outfile;
    &util::rm("$output_filestem.text.tmp");

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
1121
1122
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path without extension; the page is written
#                      to "$output_filestem.html"
#
# The text is first extracted with any_to_text, then wrapped in a minimal
# HTML page: blank lines become "<p>", every other line is prefixed with
# "<br> ".  The intermediate .text file is removed afterwards.
#
# Returns 1 on success, 0 if any_to_text failed or the work files could not
# be opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    unless (open($text_fh, '<', "$output_filestem.text")
            && open($html_fh, '>', "$output_filestem.html")) {
        # Previously an open failure was ignored and success was reported
        # even though no HTML had been written.
        print STDERR "any_to_html: couldn't open work files: $!\n";
        close $text_fh if (defined($text_fh) && defined(fileno($text_fh)));
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (defined(my $line = <$text_fh>)) {
        # Escape '&' first so the entities introduced for '<' and '>' are
        # not themselves re-escaped, and so literal text such as "&lt;" in
        # the source is shown verbatim rather than interpreted as markup.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
1159
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to scan
#   $output_filestem - output path without extension; the text is written
#                      to "$output_filestem.text"
#
# Runs of printable ASCII (octal 040-176) at least 10 characters long are
# kept, one per line; everything else is discarded.
#
# Returns 1 if at least one chunk of text was written.  Returns 0 if
# $use_strings is not set, if either file cannot be opened, or if nothing
# printable was found (in which case the empty output file is removed).

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n**** In any to text****\n\n";
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    unless (open($out, '>', "$output_filestem.text")) {
        # Don't leave the input handle open when the output can't be made.
        close $in;
        return 0;
    }

    my $output_line_count = 0;
    while (defined(my $line = <$in>)) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm("$output_filestem.text");
    return 0;

}
Note: See TracBrowser for help on using the repository browser.