source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 32205

Last change on this file since 32205 was 32205, checked in by ak19, 6 years ago

1. First set of commits to do with implementing the new 'paged_html' output option of PDFPlugin, which uses xpdftools' new pdftohtml. So far tested only on Linux (64 bit), but things work there so I'm optimistically committing the changes. 2. Committing the pre-built Linux binaries of XPDFtools for both 32 and 64 bit, built by the XPDF group. 3. To use the correct bitness variant of xpdftools, setup.bash now exports the BITNESS env var, consulted by gsConvert.pl. 4. All the perl code changes to do with using xpdf tools' pdftohtml to generate paged_html and feed it in the desired form into GS(3): gsConvert.pl, PDFPlugin.pm and its parent ConvertBinaryPFile.pm have been modified to make it all work. xpdftools' pdftohtml generates a folder containing an html file and a screenshot for each page in a PDF (as well as an index.html linking to each page's html). However, we want a single html file that contains each individual 'page' html's content in a div, and need to do some further HTML style, attribute and structure modifications to massage the xpdftool output to what we want for GS. In order to parse and manipulate the HTML 'DOM' to do this, we're using the Mojo::DOM package that Dr Bainbridge found and which he's compiled up. Mojo::DOM is therefore also committed in this revision. Some further changes and some display fixes are required, but need to check with the others about that.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 41.6 KB
RevLine 
[1445]1#!/usr/bin/perl -w
2
3###########################################################################
4#
[2032]5# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
[3013]11# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]33#
[3013]34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]38#
[3013]39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
[2032]42# We try to convert Postscript files to text using "gs" which is often on
[2755]43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
[1445]45
# Compile-time setup: refuse to run without GSDLHOME, then put the
# Greenstone perllib directory at the front of the module search path.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
[22429]51use strict;
52
[1445]53use parsargv;
54use util;
[27509]55use FileUtils;
[1445]56use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded, in which
# case the flag falls back to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Values of the command-line switches; filled in by parsargv::parse in main().
my ($use_strings, $windows_scripting);
my ($pdf_complex, $pdf_nohidden, $pdf_zoom);
my ($pdf_ignore_images, $pdf_allow_images_only);
[3350]69
# Print the command-line usage summary to STDERR and exit with status 1.
# Never returns.
sub print_usage
{
    print STDERR "\n";
    print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
    print STDERR " or text using third-party programs.\n\n";
    print STDERR " usage: $0 [options] filename\n";
    print STDERR " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
    # main() widens the accepted -type values when -windows_scripting is given
    print STDERR "\t\t(docx and pptx are also accepted with -windows_scripting)\n";
    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
    # paged_html is handled by convertPDF
    print STDERR "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n";
    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    print STDERR "\t-verbose\t<number>\t(verbosity level)\n";
    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
    print STDERR "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n";
    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n";
    print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n";
    print STDERR "\t-pdf_ignore_images\tdon't attempt to extract images when\n";
    print STDERR "\t\tconverting PDF to HTML\n";
    print STDERR "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n";
    print STDERR "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n";
    print STDERR "\t\t-pdf_complex is set)\n";
    exit(1);
}
91
# Settings shared with the conversion routines below. main() fills in
# $faillogfile and $timeout through parsargv::parse and copies the value
# of -verbose into $verbosity.
my $faillogfile="";
my $timeout=0;
my $verbosity=0;

# main(@args)
#
# Parses the command line, works out the input type (from -type or, failing
# that, from the filename extension), changes into the directory of the
# input file and dispatches to the matching convertXXX routine. The value
# returned by that routine is printed on STDOUT followed by a newline.
# Exits with status 1 (directly or through print_usage) on bad arguments,
# an unreadable input file or an unsupported type.
sub main
{
    my (@ARGV) = @_;
    my ($input_type,$output_type,$verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    #my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xlsx?)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # Nothing has been parsed yet, so the switch may still carry its
    # leading dash(es); accept it with or without them.
    foreach my $arg (@ARGV) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@ARGV,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity=$verbose if defined $verbose;

    # Exactly one argument (the input file) must be left over
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    if (!defined $input_type || $input_type eq "")
    {
        # Fall back on the lower-cased filename extension without its dot.
        # Left undefined when there is no extension, so that the
        # "No filename extension" error below is reported.
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^pptx?$/) {
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}

&main(@ARGV);
203
204
205
[2241]206# Document-type conversion functions
[1445]207#
208# The following functions attempt to convert documents from their
209# input type to the specified output type. If no output type was
210# given, then they first attempt HTML, and then TEXT.
211#
212# Each returns the output type ("html" or "text") or "fail" if no
213# conversion is possible.
214
# Convert a Microsoft word document
#
# The file contents are sniffed with find_docfile_type, and the file is
# then handed to the Word, RTF or generic converter accordingly. Returns
# whatever that converter returns.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $detected = &find_docfile_type($input_filename);

    return &convertRTF($input_filename, $output_filestem, $output_type)
        if ($detected eq "rtf");

    return &convertWord678($input_filename, $output_filestem, $output_type)
        if ($detected eq "docx" || $detected eq "word6"
            || $detected eq "word7" || $detected eq "word8");

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
232
# Convert a Microsoft word 6/7/8 document
#
# When HTML is wanted (or no output type was given), tries the windows
# scripting converter if -windows_scripting is set and wvware otherwise.
# Returns "html" on success; in every other case the result of
# convertAnything is returned.

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ m/html/i)) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
252
253
# Convert a Rich Text Format (RTF) file
#
# Only HTML output is attempted: through windows scripting when
# -windows_scripting is set, through rtftohtml otherwise. Returns "html"
# on success and "fail" in every other case.

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    # rtf is so ugly that's it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
280
281
# Convert an unidentified file
#
# Tries any_to_html and then any_to_text, each only when the requested
# output type allows it (an empty output type allows both). Returns
# "html", "text" or "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html" if (&any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if (&any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
306
307
[1654]308
# Convert an Adobe PDF document
#
# Tries, in this order and depending on $output_type: page images, HTML
# (old pdftohtml), paged HTML (xpdftools' pdftohtml) and plain text.
# Returns "item", "html", "paged_html" or "text" for the first conversion
# that succeeds, or "fail" when none does.

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # If the output type contains a '-', keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to HTML
    # Uses the old pdftohtml that doesn't work for newer PDF versions
    #if ($output_type =~ m/^html/i) {
    if (!$output_type || ($output_type =~ m/^html/i)) {
        return "html"
            if (&pdf_to_html($dirname, $input_filename, $output_filestem));
    }

    # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools. This
    # will be the new default for PDFs when output_type for PDF docs is not specified
    # (once our use of xpdftools' pdftohtml has been implemented on win and mac).
    #if (!$output_type || ($output_type =~ m/paged_html/i)) {
    if ($output_type =~ m/paged_html/i) {
        return "paged_html"
            if (&xpdf_to_html($dirname, $input_filename, $output_filestem));
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if (&pdf_to_text($dirname, $input_filename, $output_filestem));
    }

    return "fail";

}
356
357
# Convert an Adobe PostScript document
#
# Tries page images first when an image output type is requested, then
# plain text. Returns "item", "text" or "fail".

sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # If the output type contains a '-', keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if (&ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
382
383
# Convert a Microsoft PowerPoint presentation.
#
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract script is run and "item" is returned on success (or
# straight away if the output directory already exists). Otherwise, when
# HTML is acceptable, ppttohtml.pl is run and "html" is returned on
# success. Whenever no value has been returned by then, any_to_text is
# tried; the result is "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    #if (!$output_type || $windows_scripting || ($output_type !~ m/html/i) || ($output_type !~ m/text/i)){
    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)){
        # pptextract flag selecting the image format
        my $ppt_convert_type =
              ($output_type =~ m/gif/i)  ? "-g"
            : ($output_type =~ m/jp?g/i) ? "-j"
            : ($output_type =~ m/png/i)  ? "-p"
            :                              "";

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            # now we use the .vbs VBScript
            $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"";
        }
        # $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i); # back when the pptextract.exe VB executable was used

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $!=0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
449
450
# Convert a Microsoft Excel spreadsheet.
#
# When HTML is acceptable, xlstohtml.pl is run and "html" is returned on
# success. Otherwise, or if that fails, any_to_text is tried; the result
# is "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $!=0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
484
485
486
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns one of:
#   "docx"     - -windows_scripting is set and the name ends in .docx
#                (any letter case)
#   "rtf"      - the first line starts with "{\rtf"
#   "word6", "word7", "word8"
#              - some line contains "Word.Document.<6|7|8>"
#   "unknown"  - anything else, including a file that cannot be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, "<", $input_filename)) {
        # nothing can be sniffed from an unreadable file
        return "unknown";
    }
    binmode($chk);

    my $realtype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this is a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);
    return $realtype;
}
527
528
[1734]529# Specific type-to-type conversions
[1445]530#
531# Each of the following functions attempts to convert a document from
[2755]532# a specific format to another. If they succeed they return 1 and leave
[1445]533# the output document(s) in the appropriate place; if they fail they
534# return 0 and delete any working files.
535
536
# Attempt to convert a word document to html with the wv program
#
# Runs wvware.pl through the perl interpreter reported by
# util::get_perl_exec, passing it the input file, output stem, fail log,
# verbosity and timeout. Returns the exit code of wvware.pl, which the
# callers treat as a success flag (non-zero meaning converted). Returns 0
# when the command could not be launched or was killed by a signal.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

#    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    # system() returns -1 when the command could not be started, and the
    # low 7 bits of the status hold the signal number if the child was
    # killed. Dividing the raw status by 256 would turn both cases into a
    # non-zero (true) value, so handle them before extracting the exit code.
    return 0 if ($raw_status == -1 || ($raw_status & 127));

    return $raw_status >> 8;
}
551
# Attempt to convert a word document to html with the word2html scripting program
#
# On windows, docx input is converted with docx2html.vbs (launched through
# CScript) and everything else with word2html; on other systems word2html
# from $GSDLHOME/bin/$GSDLOS is used. Returns 1 if "$output_filestem.html"
# already exists or was created and its first line mentions "html", and 0
# otherwise. Converter error output is copied to the fail log when one is
# configured.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting

        if($input_filename =~ m/docx$/i) { # need to use full path to docx2html script,
                                           # else script launch fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            $vbScript = "CScript //Nologo \"$vbScript\""; # launch with CScript for error output in STDERR
                                                           # //Nologo flag avoids Microsoft's opening/logo msgs
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else { # old doc versions. use the usual VB executable word2html for the
               # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html"; #$vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    #print STDERR "@@@@@@@@@ cmd=$cmd\n";

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";

        my $errfile;
        if ((-s "$output_filestem.err") && open($errfile, "<", "$output_filestem.err")) {

            # the fail log is optional; leave $faillog undefined when it is
            # not configured or cannot be opened
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, ">>", $faillogfile) or undef $faillog;
            }

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if (defined $faillog);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if (defined $faillog);
            }

            close($faillog) if (defined $faillog);
            close($errfile);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html, "<", "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
652
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml on $input_filename, writing "$output_filestem.html". The
# conversion counts as successful when the output has some word characters
# after its <body> line once tags are stripped. If rtftohtml also wrote
# "${output_filestem}_ToC.html", that table of contents is merged into the
# html right after its header. Returns 1 on success; on failure the
# converter's error output is appended to the fail log (if configured),
# the working files are removed and 0 is returned.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header; an output file
        # that cannot be opened counts as a failed conversion
        if (open(my $check, "<", "$output_filestem.html")) {
            my $past_header=0;
            while (my $line = <$check>) {

                if ($past_header == 0) {
                    if ($line =~ m/<body>/) {$past_header=1;}
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful=1;
                    last;
                }
            }
            close($check);
        }
    }

    if ($was_successful) {
        &FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &FileUtils::moveFiles("$output_filestem.html","$output_filestem.src");

            # Low-precedence "or" is required here: with "||" the test
            # would bind to the filename and a failed open would go unnoticed.
            my ($htmlsrc, $toc, $html);
            my $open_failed=0;
            open($htmlsrc, "<", "$output_filestem.src") or ++$open_failed;
            open($toc, "<", "${output_filestem}_ToC.html") or ++$open_failed;
            open($html, ">", "$output_filestem.html") or ++$open_failed;

            if ($open_failed) {
                # give up on the merge and put the plain html back
                foreach my $fh ($htmlsrc, $toc, $html) {
                    close($fh) if (defined $fh && defined fileno($fh));
                }
                &FileUtils::moveFiles("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html (up to, and consuming,
            # the first line without any word characters).
            while (defined(my $line = <$htmlsrc>)) {
                last if ($line !~ m/\w/);
                print {$html} "$line";
            }

            # print out table of contents, making links relative
            my $skipped = <$toc>; # ignore first 2 lines
            $skipped = <$toc>;
            print {$html} scalar(<$toc>); # line 3 = "<ol>\n"
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html} $line;
            }
            close($htmlsrc);
            close($html);

            &FileUtils::removeFiles("${output_filestem}_ToC.html");
            &FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
        {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
769
770
# Convert a pdf file to html with the old pdftohtml command
# which only works for older PDF versions
#
# Runs pdftohtml.pl with the zoom factor and the flags selected by the
# -pdf_* options. Returns 1 when the command exits with 0 and leaves a
# non-empty "$output_filestem.html". Otherwise the converter's error output
# is echoed to STDERR, appended to the fail log (if configured), the
# working files are removed and 0 is returned.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # optional pdftohtml.pl switches, in the order they are appended
    my @switches = ();
    push(@switches, "-c") if ($pdf_complex);
    push(@switches, "-i") if ($pdf_ignore_images);
    push(@switches, "-a") if ($pdf_allow_images_only);
    push(@switches, "-hidden") unless ($pdf_nohidden);

    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    foreach my $switch (@switches) {
        $cmd .= " $switch";
    }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # stdout and stderr go to separate files where the shell supports it
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $!=0;

    my $retval=system($cmd);
    if ($retval!=0)
    {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # success: the converter exited cleanly and made something
    if ($retval==0 && -s "$output_filestem.html")
    {
        &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
        return 1;
    }

    # failure: report and clean up
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

    # print out the converter's std err, if any
    if (-s "$output_filestem.err") {
        open (ERRLOG, "$output_filestem.err") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (<ERRLOG>) {
            print STDERR "$_";
        }
        close ERRLOG;
    }

    #print STDERR "***********output filestem $output_filestem.html\n";
    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
        {
            open (ERRLOG, "$output_filestem.err");
            while (<ERRLOG>) {print FAILLOG $_;}
            close ERRLOG;
            close FAILLOG;
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    return 0;
}
834
[32205]835
# Convert a PDF file to "paged HTML" with the newer Xpdf tools' pdftohtml.
# The generated HTML has extracted, selectable text positioned over a
# screenshot of each page.
#
# Xpdf's pdftohtml fails if its output directory already exists, so (and for
# easier naming) its products are written to a "pages" subdirectory of the
# tmp directory that contains $output_filestem.  The file named after
# $output_filestem itself is only created later, during post-conversion
# processing.
#
# Arguments: ($dirname, $input_filename, $output_filestem); $dirname is
# accepted for signature compatibility but not used here.
# Also reads $pdf_zoom, $is_winnt_2000 and $faillogfile, which are declared
# outside this sub.
# Returns 1 if pdftohtml exited with status 0 and left a non-empty
# pages/index.html; otherwise prints the tool's stderr to STDERR, appends it
# to $faillogfile (when one is set) and returns 0.
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # TODO
    } elsif ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
        # TODO
    } else { # unix, use the appropriate bin folder for the bitness of the system

        # Don't use $ENV{'GSDLARCH'}, use $ENV{'BITNESS'} instead:
        # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or
        # 64-bit specific subdirectories exist in a greenstone installation,
        # and none of those need exist when xpdf-tools is installed with GS.
        # Forcing GSDLARCH to be exported would have side-effects.
        # When BITNESS is not set we fall back on the 32-bit binaries.
        my $bin_subdir = $ENV{'BITNESS'} ? "bin".$ENV{'BITNESS'} : "bin32";
        $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, $bin_subdir);
    }

    # Xpdf tools will only create its conversion products in a dir that does
    # not yet exist, so we use a subdir of output_filestem's parent directory.
    # The parent dir is the already generated tmp area for conversion. So:
    # - tmpdir gs2build/tmp/<random-num> already exists at this stage
    # - gs2build/tmp/<rand>/output_filestem.html is created later, during post-processing
    # - for now, pdftohtml creates gs2build/tmp/<rand>/pages and puts its products in there.
    # Only the directory part of fileparse's result is needed.
    my (undef, $tmp_dirname)
        = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");

    $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml");

    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
    my $cmd = "\"$xpdf_pdftohtml\"";
    $cmd .= " -z $pdf_zoom" if ($pdf_zoom);
#    $cmd .= " -c" if ($pdf_complex);
#    $cmd .= " -i" if ($pdf_ignore_images);
#    $cmd .= " -a" if ($pdf_allow_images_only);
#    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$tmp_dirname\"";

    # Capture stdout and stderr separately where the shell supports "2>";
    # otherwise only stdout can be captured, and it goes to the .err file.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    #print STDERR "@@@@ Running command: $cmd\n";

    $! = 0; # so that a stale errno is not reported below
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing xpdf's pdftohtml tool";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    my $index_file = &FileUtils::filenameConcatenate($tmp_dirname, "index.html");
    if ($retval != 0 || ! -s $index_file) {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close $errlog;
        }

        # NOTE(review): $tmp_dirname is a directory; confirm that
        # FileUtils::removeFiles really removes directories (a recursive
        # removal routine may be what is needed here).
        &FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");

        if (-e "$output_filestem.err") {
            # Append the converter's stderr to the fail log, if one is in use.
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    # Success: the redirect files are no longer needed.
    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
939
940
941
# Convert a PDF or PostScript file to various types of image by running
# pdfpstoimg.pl (which relies on ImageMagick's convert command).
#
# Arguments: ($dirname, $input_filename, $output_filestem, $output_type);
# $dirname is accepted for signature compatibility but not used here.
# Everything up to and including the last underscore of $output_type is
# stripped, and the remainder is passed to pdfpstoimg.pl as -convert_to.
# Also reads $verbosity, $timeout, $is_winnt_2000 and $faillogfile, which are
# declared outside this sub.
# Returns 1 if pdfpstoimg.pl exited with status 0.  Returns 0 if ImageMagick
# appears not to be installed or if the conversion command failed; in the
# latter case the command's stderr is printed to STDERR and appended to
# $faillogfile when one is set.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd .= " --verbosity=$verbosity" if defined $verbosity;
        # The output is captured only to keep it off the terminal; the exit
        # status in $? is what we are interested in.
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";

    # Capture stdout and stderr separately where the shell supports "2>";
    # otherwise only stdout can be captured, and it goes to the .err file.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0; # so that a stale errno is not reported below
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # Only the exit status is checked: there is no single, known output file
    # whose existence could be tested here.
    if ($retval != 0) {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close $errlog;
        }

        if (-e "$output_filestem.err") {
            # Append the converter's stderr to the fail log, if one is in use.
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    # Success: the redirect files are no longer needed.
    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1025
# Convert a PDF file to text with the pdftotext command.
#
# Arguments: ($dirname, $input_filename, $output_filestem); $dirname is
# accepted for signature compatibility but not used here.
# Also reads $faillogfile, which is declared outside this sub.
# Returns 1 if "$output_filestem.text" exists, is non-empty and contains at
# least one word character.  Otherwise the text file is removed, pdftotext's
# stderr is printed to STDERR and appended to $faillogfile (when one is set),
# and 0 is returned.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    # Capture stdout and stderr separately where the shell supports "2>";
    # otherwise only stdout can be captured, and it goes to the .err file.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text = 0;
        if (open(my $extracted, '<', "$output_filestem.text")) {
            binmode($extracted); # just in case...
            # Stop at the first line that has a word character in it.
            while (defined(my $line = <$extracted>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extracted;
        } else {
            # An unreadable file is handled like one without any text.
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text") {
        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close $errlog;
        }

        # The shell redirect above creates a .out file on non-Windows systems.
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");

        if (-e "$output_filestem.err") {
            # Append the converter's stderr to the fail log, if one is in use.
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    # Success: remove both redirect files.  The .out file used to be left
    # behind here, although the redirect above always creates it on
    # non-Windows systems.
    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1091
# Convert a PostScript document to text.
# Note - just using "ps2ascii" isn't good enough, as it returns 0 for a
# postscript interpreter error. ps2ascii is just a wrapper to "gs" anyway,
# so we use that cmd here.
#
# If gs cannot be used (we are on Windows, it fails to run, creates no output
# file, or its output starts with "Error: "), the problem is logged and the
# text is extracted from the PostScript source with a series of regular
# expressions instead.  In both cases the resulting text is finally
# re-wrapped: each line is broken at the first whitespace at or after
# column 72.
#
# Arguments: ($input_filename, $output_filestem).
# Also reads $timeout and $faillogfile, which are declared outside this sub.
# Returns 1 once "$output_filestem.text" has been written and wrapped;
# returns 0 if the fallback cannot read the input, cannot write the output,
# or the input does not start with "%!".  Dies if the temporary copy used for
# wrapping, or the final output file, cannot be opened.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # The error log path is quoted like every other path in the command,
        # so that a tmp location containing spaces does not break the redirect.
        $cmd .= " 2> \"$output_filestem.err\"";
        $! = 0;

        system($cmd);
        my $retcode = $? >> 8; # see man perlfunc - system for this...
        # if system returns -1 | 127 (couldn't start program), look at $! for message

        if ($retcode != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e "$output_filestem.text") {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', "$output_filestem.text")) {
                my $first_line = <$psout>;
                if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close $psout;
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");

        # Record the gs failure, plus whatever gs wrote to stderr, in the fail log.
        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e "$output_filestem.err" && open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $in_fh;
        if (!open($in_fh, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$in_fh>); # see man perlport, under "System Resources"
        close $in_fh;

        # Make sure this is a ps file...  The output file is only opened
        # further down, so a bad header no longer leaves an empty .text file
        # (and an open handle) behind.
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#        $text =~ s/ ?? /\344/g; # a"

        my $out_fh;
        if (!open($out_fh, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$out_fh} "$text";
        close $out_fh;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    &FileUtils::moveFiles("$output_filestem.text", "$output_filestem.text.tmp");
    # "or" (not "||") so that die applies to open() rather than to the
    # filename string, which is always true.
    open(my $unwrapped, '<', "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(my $wrapped, '>', "$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$unwrapped>)) {
        while (length($line) > 0) {
            # Cut off the first $wrap_length characters plus the rest of the
            # word they end in, and drop the whitespace that follows.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$wrapped} "$1\n";
            } else {
                print {$wrapped} "$line";
                $line = "";
            }
        }
    }
    close $unwrapped;
    close $wrapped;
    &FileUtils::removeFiles("$output_filestem.text.tmp");

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
1262
1263
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# The text is first extracted with any_to_text() and then wrapped in a
# minimal HTML page: a blank line becomes "<p>", every other line is prefixed
# with "<br> ", and the characters &, < and > are escaped.
#
# Arguments: ($input_filename, $output_filestem).
# Returns 1 after writing "$output_filestem.html"; returns 0 if any_to_text()
# fails or if the intermediate text file or the HTML file cannot be opened.
# The intermediate "$output_filestem.text" is removed in either case.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    my $opened = 0;
    if (!open($text_fh, '<', "$output_filestem.text")) {
        print STDERR "any_to_html: couldn't read $output_filestem.text: $!\n";
    } elsif (!open($html_fh, '>', "$output_filestem.html")) {
        print STDERR "any_to_html: couldn't write $output_filestem.html: $!\n";
        close $text_fh;
    } else {
        $opened = 1;
    }
    if (!$opened) {
        # Do not report success when nothing could be written.
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (defined(my $line = <$text_fh>)) {
        # "&" has to be escaped first, otherwise the entities produced for
        # "<" and ">" would be escaped again.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_fh} "<p>";
        } else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
1300
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Each chunk read from the input has its runs of non-printable characters
# (anything outside octal 040-176) turned into newlines, and pieces of fewer
# than 10 characters are dropped.  What remains is written to
# "$output_filestem.text".
#
# Arguments: ($input_filename, $output_filestem).
# Also reads $use_strings, which is declared outside this sub; when it is
# false nothing is attempted.
# Returns 1 if at least one chunk of text was written.  Returns 0 if
# $use_strings is false, if either file cannot be opened, or if no text was
# found (the empty output file is then removed).
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    return 0 if (!$use_strings);

    print STDERR "\n**** In any to text****\n\n";

    # 3-arg open: the document's filename must never be parsed as part of
    # the open mode.
    open(my $in_fh, '<', $input_filename) or return 0;
    binmode($in_fh);
    my $out_fh;
    if (!open($out_fh, '>', "$output_filestem.text")) {
        close $in_fh;
        return 0;
    }

    my $output_line_count = 0;
    while (defined(my $line = <$in_fh>)) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print {$out_fh} $line;
            ++$output_line_count;
        }
    }

    close $out_fh;
    close $in_fh;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles("$output_filestem.text");
    return 0;
}
Note: See TracBrowser for help on using the repository browser.