source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 23473

Last change on this file since 23473 was 23473, checked in by ak19, 13 years ago

Provision for supporting .docx and .pptx files when Windows scripting is on.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 45.2 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Runs at compile time: refuse to start without GSDLHOME, and put the
# Greenstone perl library directory at the front of the module search path.
BEGIN {
    my $gsdl_home = $ENV{'GSDLHOME'};
    if (!defined $gsdl_home) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, $gsdl_home . "/perllib");
}
50
51use strict;
52
53use parsargv;
54use util;
55use Cwd;
56use File::Basename;
57
# Are we running on WinNT or Win2000 (or later)?
# Win32 may not be installed; in that case the eval yields undef and the
# flag stays 0.
my $is_winnt_2000 = 0;
{
    my $probe = eval { require Win32; Win32::IsWinNT(); };
    $is_winnt_2000 = $probe if (defined($probe));
}

# Command-line switches; main() hands references to these to parsargv::parse.
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
69
# Print the command-line help on STDERR and terminate the script with
# exit status 1. Never returns.
sub print_usage
{
    print STDERR "\n";
    print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
    print STDERR " or text using third-party programs.\n\n";
    print STDERR " usage: $0 [options] filename\n";
    print STDERR " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
    # main() widens the accepted -type values when -windows_scripting is given
    print STDERR "\t\t(docx and pptx are also accepted with -windows_scripting)\n";
    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
    print STDERR "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n";
    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
    print STDERR "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n";
    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n";
    print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n";
    print STDERR "\t-pdf_ignore_images\tdon't attempt to extract images when\n";
    print STDERR "\t\tconverting PDF to HTML\n";
    print STDERR "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n";
    print STDERR "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n";
    print STDERR "\t\t-pdf_complex is set)\n";
    exit(1);
}
91
# $faillogfile: file that converter error output is appended to (-errlog), "" for none.
# $timeout:     value used for "ulimit -t" in the conversion commands (-timeout), 0 for none.
my ($faillogfile, $timeout) = ("", 0);
94
# Entry point. Parses the command line, works out the input type (from
# -type, or failing that from the filename extension), changes into the
# input file's directory and hands the file to the matching convert*
# routine. The string that routine returns is printed on STDOUT followed
# by a newline. Bad usage or an unreadable/unsupported input ends the
# script with exit status 1.
sub main
{
    my (@ARGV) = @_;
    my ($input_type,$output_type,$verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    # Currently only have VBA for Word and PPT (but no XLS)
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The arguments have not been parsed yet, so the switch still carries
    # its leading dash(es); a pattern anchored directly on the option name
    # would never match and docx/pptx would never be enabled.
    foreach my $arg (@ARGV) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@ARGV,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                         ))
    {
        print_usage();
    }

    # Exactly one argument (the input file) must be left over
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # The convert* routines treat an empty output type as "try HTML, then
    # TEXT"; the literal "auto" matches neither of their html/text tests,
    # so map it (and an unset value) onto the empty string here.
    if (!defined($output_type) || $output_type =~ m/^auto$/i) {
        $output_type = "";
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No -type given: fall back to the lower-cased extension without its dot
    # ($suffix is "" when the filename has no extension)
    if (!defined($input_type) || $input_type eq "")
    {
        $input_type = (length($suffix) > 1) ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^pptx?$/) {
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}

&main(@ARGV);
200
201
202
203# Document-type conversion functions
204#
205# The following functions attempt to convert documents from their
206# input type to the specified output type. If no output type was
207# given, then they first attempt HTML, and then TEXT.
208#
209# Each returns the output type ("html" or "text") or "fail" if no
210# conversion is possible.
211
# Convert a file that claims to be a Microsoft Word document.
# The content is sniffed first and the file is routed to the Word, RTF or
# generic converter accordingly; that converter's result is returned.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $detected = &find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    my %word_like = map { $_ => 1 } ("word6", "word7", "word8", "docx");
    if ($word_like{$detected}) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
229
# Convert a Microsoft Word 6/7/8 document.
# When HTML is acceptable (no output type, or one containing "html") the
# Word-specific converter is tried first: the Windows scripting one if
# -windows_scripting was given, the wv-based one otherwise. If that is
# skipped or fails, the generic converter decides the result.

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
249
250
# Convert a Rich Text Format (RTF) file.
# Returns "html" on success and "fail" otherwise; there is no text fallback.

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    #    return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
277
278
# Convert an unidentified file.
# Tries the generic HTML conversion and then the generic text conversion,
# each only if the requested output type allows it (an empty output type
# allows both). Returns "html", "text" or "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        my $made_html = &any_to_html($input_filename, $output_filestem);
        return "html" if ($made_html);
    }

    # Convert to text
    if ($no_preference || ($output_type =~ m/text/i)) {
        my $made_text = &any_to_text($input_filename, $output_filestem);
        return "text" if ($made_text);
    }

    return "fail";
}
303
304
305
# Convert an Adobe PDF document.
# Depending on the requested output type this tries page images, then
# HTML, then text. Returns "item" (images), "html", "text" or "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # keep only what follows the last '-' in the requested output type
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $made_images = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if ($made_images);
    }

    my $no_preference = !$output_type;

    # Attempt conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        my $made_html = &pdf_to_html($dirname, $input_filename, $output_filestem);
        return "html" if ($made_html);
    }

    # Attempt conversion to TEXT
    if ($no_preference || ($output_type =~ m/text/i)) {
        my $made_text = &pdf_to_text($dirname, $input_filename, $output_filestem);
        return "text" if ($made_text);
    }

    return "fail";
}
340
341
# Convert an Adobe PostScript document.
# Tries page images when an image format is requested, then text.
# Returns "item" (images), "text" or "fail".

sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # keep only what follows the last '-' in the requested output type
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $made_images = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if ($made_images);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $made_text = &ps_to_text($input_filename, $output_filestem);
        return "text" if ($made_text);
    }

    return "fail";
}
366
367
# Convert a PowerPoint document.
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract script is run and "item" is returned on success (or when
# the output directory already exists). Otherwise, if HTML is acceptable,
# ppttohtml.pl is run and "html" returned on success. Whenever the chosen
# converter fails or none applies, generic text extraction is the last
# resort: "text" on success, "fail" otherwise.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && !(($output_type =~ m/html/i) || ($output_type =~ m/text/i))) {
        # flag telling pptextract which image format to produce (none if unspecified)
        my $ppt_convert_type = "";
        foreach my $choice (["gif", "-g"], ["jp?g", "-j"], ["png", "-p"]) {
            my ($format_re, $flag) = @$choice;
            if ($output_type =~ m/$format_re/i) {
                $ppt_convert_type = $flag;
                last;
            }
        }

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "pptextract";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = ($timeout) ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML
        # formulate the command
        my $cmd = "perl -S ppttohtml.pl " . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    my $made_text = &any_to_text($input_filename, $output_filestem);
    return "text" if ($made_text);

    return "fail";
}
430
431
# Convert an Excel document.
# If HTML is acceptable, xlstohtml.pl is run and "html" returned on
# success. Otherwise, or if that fails, generic text extraction is tried:
# "text" on success, "fail" otherwise.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $cmd = "perl -S xlstohtml.pl " . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    my $made_text = &any_to_text($input_filename, $output_filestem);
    return "text" if ($made_text);

    return "fail";
}
464
465
466
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns one of:
#   "docx"    - -windows_scripting is on and the filename ends in .docx
#   "rtf"     - the first line starts with "{\rtf"
#   "word6", "word7", "word8"
#             - some line contains "Word.Document.<6|7|8>"
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    my $chk;
    unless (open($chk, "<", $input_filename)) {
        print STDERR "gsConvert.pl: unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $realtype = "unknown";
    my $first = 1;

    # a lexical loop variable keeps the caller's $_ intact
    while (defined(my $line = <$chk>)) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # single exit point so the handle is closed on every path
    close($chk);

    return $realtype;
}
507
508
509# Specific type-to-type conversions
510#
511# Each of the following functions attempts to convert a document from
512# a specific format to another. If they succeed they return 1 and leave
513# the output document(s) in the appropriate place; if they fail they
514# return 0 and delete any working files.
515
516
# Attempt to convert a word document to html with the wv program.
#
# Runs wvWare on $input_filename, writing "$output_filestem.html" and
# putting any images into a "<docname>_files" folder beside the input.
# Returns 1 if an HTML file whose first line contains "DOCTYPE HTML" was
# produced, 0 otherwise. Converter error output is copied to the -errlog
# file when one was given.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    if ( -d "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv" && $ENV{'GSDLOS'} eq "linux" ) {
        my $wv_home = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv";
        $ENV{'PATH'} = "$wv_home/bin:$ENV{'PATH'}";
        # Only append the old value when there is one: a trailing ':' would
        # add an empty entry to the library search path.
        if (defined($ENV{'LD_LIBRARY_PATH'}) && $ENV{'LD_LIBRARY_PATH'} ne "") {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib:$ENV{'LD_LIBRARY_PATH'}";
        } else {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib";
        }
        $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into
    # folder docname-without-extension_files, at the same level as the html
    # file generated from the doc, so that replace_srcdoc_with_html.pl can
    # move the html and its folder into import without overwriting
    # similarly named items already there.
    #
    # To do this we use wvWare's --dir and --basename options, where dir is
    # the full path to the image folder and basename is that path plus the
    # name to be prepended to every image file: e.g. for images called
    # sample0.jpg to sampleN.jpg the basename is "/full/path/to/imgdir/sample".
    # HOWEVER: basename always takes a full path, not a relative url, so the
    # greenstone browser is unable to display the images (absolute paths
    # cause it to give an "external link" message); the links are therefore
    # made relative again after the conversion.
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # prefix handed to wvWare's --basename for the image files
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err") {
            my $faillog;
            my $write_to_fail_log=0;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
            {$write_to_fail_log=1;}

            # echo the converter's error output to STDERR (and the fail log)
            if (open(my $errfile, "<", "$output_filestem.err")) {
                while (defined(my $line = <$errfile>)) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print $faillog "$line" if ($write_to_fail_log);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
        my $line;
        if (open(my $tmp, "<", "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            # Remove the images directory if it is still empty after the html
            # was generated (in case there were no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                &util::rm_r($assoc_dir);
            } else { # there was an image folder (it was generated)
                # Therefore, the html file generated contains absolute links to the images
                # Replace them with relative links instead, so the folder can be moved elsewhere
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (defined(my $line = <$errlog>)) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
672
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Parameters:
#   toppath        - top-level folder in which the html file resides (not used here)
#   docname        - name (without extension) of the html file (not used here)
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - directory name of the folder with associated imgs: docname_files
# Returns 1 once the file has been rewritten, 0 if it could not be opened
# for reading or writing.
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, "<", $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    my $html_contents;
    {
        local $/ = undef;        # Read entire file at once
        $html_contents = <$fin>; # Now file is read in as one single 'line'
    }
    close($fin);
    $html_contents = "" unless defined($html_contents); # empty file

    # 2. Replace *all* occurrences of the assoc_dir_path at the start of
    # a href and img src values with assoc_dirname.
    #
    # wvWare puts %20 in place of spaces, so the prefix we look for must do
    # the same. quotemeta then escapes every character with a special
    # meaning in a regular expression (backslashes, dots, brackets,
    # parentheses, ...), so the path is matched literally.
    my $url_prefix = $assoc_dir_path;
    $url_prefix =~ s/ /%20/g;
    my $prefix_re = quotemeta($url_prefix);

    # Looks for <a or <img, then anything up to the first href= or src=
    # (optionally followed by " or ') whose value starts with the folder's
    # absolute path. Everything before the path and the rest up to and
    # including the next > is retained; only the path itself is swapped.
    $html_contents =~ s{(<(?:a|img).*?(?:href|src)=(?:"|')?)$prefix_re([^>]*>)}{$1$assoc_dirname$2}sg;

    # 3. Tidy up the links that now point into the associated folder: turn
    # %20 back into literal spaces, backslashes into url slashes, etc.
    # Only the url value itself (up to the closing quote or the end of the
    # tag) is handed to the helper, so the rest of the document is untouched.
    my $dirname_re = quotemeta($assoc_dirname);
    $html_contents =~ s{(<(?:a|img)\b[^>]*?\b(?:href|src)=["']?)((?:$dirname_re)[^"'>]*)}{&post_process_assocfile_urls($1, $2, "")}ge;

    # delete the original file and recreate it
    &util::rm($html_file);

    my $fout;
    unless (open($fout, ">", $html_file)) { # open it as a new file for writing
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    # write out the updated contents and close the file
    print $fout $html_contents;
    close($fout);
    return 1;
}
758
# Utility routine for tidying an img src/href link pathname so the HTML
# plugin gets what it expects. Applied to $text, in this order:
#   1. every %20 (introduced by wvWare) becomes a literal space -- not an
#      underscore, since underscores mess with incremental rebuild
#   2. every windows-style backslash becomes a url slash (/)
#   3. every remaining percent sign becomes %25
# Returns $pre, the tidied $text and $post joined together.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {
        s/%20/ /g;
        tr{\\}{/};
        s/%/%25/g;
    }

    return join("", $pre, $text, $post);
}
774
# Attempt to convert a word document to html with the word2html scripting program.
#
# Returns 1 if "$output_filestem.html" already exists (conversion is then
# skipped) or was produced with a first line containing "html"; returns 0
# otherwise. Converter error output is copied to the -errlog file when one
# was given.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    # on windows, use the bare script name (no path)
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err") {
            my $faillog;
            my $write_to_fail_log=0;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
            {$write_to_fail_log=1;}

            # echo the converter's error output to STDERR (and the fail log)
            if (open(my $errfile, "<", "$output_filestem.err")) {
                while (defined(my $line = <$errfile>)) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print $faillog "$line" if ($write_to_fail_log);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $line;
        if (open(my $tmp, "<", "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (defined(my $line = <$errlog>)) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
853
# Attempt to convert an RTF document to HTML with the external rtftohtml
# program.
#
# Arguments:
#   $input_filename  - path of the RTF file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html" and the converter's stderr
#                      to "$output_filestem.err"
#
# Returns 1 if an HTML file containing some text after its <body> tag was
# produced, 0 otherwise. On failure the partial output is removed and the
# converter's error output is appended to $faillogfile (if one is set).
#
# Reads the file-level settings $timeout, $is_winnt_2000 and $faillogfile.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header
        if (open(my $html_in, '<', "$output_filestem.html")) {
            my $past_header = 0;
            while (my $line = <$html_in>) {
                if (!$past_header) {
                    # the <body> line itself is never inspected for content
                    if ($line =~ m/<body>/) {$past_header = 1;}
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close $html_in;
        }
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html", "$output_filestem.src");

            # Each open is checked on its own result. (Writing
            # `open FH, "name" || ++$failed` binds || to the file name,
            # which is always true, so failures would go unnoticed.)
            my $src_ok  = open(my $htmlsrc, '<', "$output_filestem.src");
            my $toc_ok  = open(my $toc, '<', "${output_filestem}_ToC.html");
            my $html_ok = open(my $html, '>', "$output_filestem.html");

            unless ($src_ok && $toc_ok && $html_ok) {
                # Can't merge the ToC: put the converted file back as it
                # was and still report success.
                close $htmlsrc if $src_ok;
                close $toc     if $toc_ok;
                close $html    if $html_ok;
                &util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to the
            # first line without a word character (that line is dropped).
            while (defined(my $header_line = <$htmlsrc>)) {
                last unless $header_line =~ m/\w/;
                print {$html} $header_line;
            }

            # print out table of contents, making links relative
            my $skipped = <$toc>;   # ignore first 2 lines
            $skipped = <$toc>;
            my $list_start = <$toc>; # line 3 = "<ol>\n"
            print {$html} $list_start if defined $list_start;
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html} $line;
            }
            close $toc;

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html} $line;
            }
            close $htmlsrc;
            close $html;

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close $errlog;
            }
            close $faillog;
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
970
971
# Convert a PDF file to HTML by running the pdftohtml.pl wrapper script.
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF file
#   $output_filestem - output path without extension ("$output_filestem.html"
#                      is the expected result; ".out"/".err" hold the
#                      wrapper's stdout/stderr)
#
# Returns 1 when the wrapper exits with status 0 and leaves a non-empty
# HTML file, 0 otherwise. The temporary .out/.err files are removed either
# way; on failure the error output is echoed to STDERR and appended to
# $faillogfile when one is set.
#
# Reads the file-level settings $timeout, $pdf_zoom, $pdf_complex,
# $pdf_ignore_images, $pdf_allow_images_only, $pdf_nohidden,
# $is_winnt_2000 and $faillogfile.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # Assemble the command line as space-separated pieces.
    my @pieces = ("perl -S pdftohtml.pl -zoom $pdf_zoom");
    push(@pieces, "-c") if ($pdf_complex);
    push(@pieces, "-i") if ($pdf_ignore_images);
    push(@pieces, "-a") if ($pdf_allow_images_only);
    push(@pieces, "-hidden") unless ($pdf_nohidden);
    push(@pieces, "\"$input_filename\"", "\"$output_filestem\"");

    if ($ENV{'GSDLOS'} =~ m/^windows$/i && !$is_winnt_2000) {
        # only stdout is redirected here, and it goes to the .err file
        push(@pieces, ">", "\"$err_file\"");
    } else {
        push(@pieces, ">", "\"$out_file\"", "2>", "\"$err_file\"");
    }

    my $cmd = ($timeout ? "ulimit -t $timeout;" : "") . join(" ", @pieces);

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # Success: the converter made something and reported no error.
    if ($retval == 0 && -s $html_file) {
        foreach my $leftover ($err_file, $out_file) {
            &util::rm($leftover) if (-e $leftover);
        }
        return 1;
    }

    # Failure from here on.
    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (ERRLOG, "$output_filestem.err") || die "$!";
        print STDERR "pdftohtml error log:\n";
        print STDERR "$_" while (<ERRLOG>);
        close ERRLOG;
    }

    print STDERR "***********output filestem $output_filestem.html\n";
    &util::rm($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
            open (ERRLOG, "$output_filestem.err");
            print FAILLOG $_ while (<ERRLOG>);
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm($err_file);
    }

    return 0;
}
1034
# Convert a PDF or PostScript file to images by running the pdfpstoimg.pl
# wrapper script.
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF/PS file
#   $output_filestem - output path without extension (".out"/".err" hold
#                      the wrapper's stdout/stderr)
#   $output_type     - requested output type; everything up to and including
#                      the last underscore is stripped and the remainder is
#                      passed to the wrapper as -convert_to
#
# Returns 1 when the wrapper exits with status 0, 0 otherwise (including
# when the "identify" probe indicates the program could not be run).
# On failure the error output is echoed to STDERR and appended to
# $faillogfile when one is set.
#
# Reads the file-level settings $timeout, $is_winnt_2000 and $faillogfile.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        # output is captured only so that it is not echoed to the terminal
        my $result = `identify 2>&1`;
        if ($? == -1 || $? == 256) { # Linux and Windows return different values for "program not found"
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    $cmd .= "perl -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        # name the script that was actually run
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    #if ($retval !=0) || ! -s "$output_filestem")
    if ($retval != 0) {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") || die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $line = <$errlog>) {
                print STDERR $line;
            }
            close $errlog;
        }

        #&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close $errlog;
                }
                close $faillog;
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1103
# Convert a PDF file to plain text with the external pdftotext command.
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF file
#   $output_filestem - output path without extension; the text goes to
#                      "$output_filestem.text", with ".out"/".err" holding
#                      the command's stdout/stderr
#
# Returns 1 if a non-empty text file containing at least one word character
# was produced, 0 otherwise. The temporary .out/.err files are removed in
# both cases; on failure the error output is echoed to STDERR and appended
# to $faillogfile when one is set.
#
# Reads the file-level setting $faillogfile.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text = 0;
        if (open(my $extracted, '<', "$output_filestem.text")) {
            binmode($extracted); # just in case...
            while (my $line = <$extracted>) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extracted;
        } else {
            warn "open: $!";
        }
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text") {
        # print out the converters std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") || die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $line = <$errlog>) {
                print STDERR $line;
            }
            close $errlog;
        }
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close $errlog;
                }
                close $faillog;
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    # The stdout redirect above always creates a .out file on non-Windows
    # systems, so clean it up on success as well as on failure.
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1169
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file
#   $output_filestem - output path without extension; the text goes to
#                      "$output_filestem.text" and gs's stderr to
#                      "$output_filestem.err"
#
# If gs cannot be used (Windows, non-zero wait status, no output file, or
# output starting with "Error: "), falls back to stripping the text out of
# the PostScript source with a series of regular expressions. Either way
# the resulting text is then re-wrapped at the first whitespace after
# 72 characters.
#
# Returns 1 on success; 0 if the fallback could not read the input, could
# not write the output, or the input does not start with "%!". Dies if the
# intermediate files for the wrapping step cannot be opened.
#
# Reads the file-level settings $timeout and $faillogfile.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quote the path so that file stems containing spaces work
        $cmd .= " 2> \"$output_filestem.err\"";
        $! = 0;

        # Look at the whole wait status, not just the exit code in the
        # high byte, so that a process killed by a signal (e.g. when the
        # ulimit CPU limit is hit) is also treated as a failure.
        # If system returns -1 (couldn't start program), $! has the message.
        my $wait_status = system($cmd);

        if ($wait_status != 0) {
            if ($!) {$error = "$!";} else {$error = "couldn't run.\n";}
        }
        elsif (! -e "$output_filestem.text") {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', "$output_filestem.text")) {
                my $first_line = <$psout>;
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close $psout;
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e "$output_filestem.err") {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close $errlog;
                }
            }
            close $faillog;
        }
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $in;
        unless (open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        my $text = join('', <$in>); # whole .ps file; see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file...
        # (checked before the output file is created, so a rejected input
        # leaves no empty .text file or open handle behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        my $out;
        unless (open($out, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print {$out} "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
    # "or" (not "||") so that the die applies to the open itself rather
    # than to the always-true file name string
    open(my $infile, '<', "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(my $outfile, '>', "$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        # peel off wrapped pieces while the remainder is still too long
        while (length($line) > $wrap_length
               && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
            print {$outfile} "$1\n";
        }
        print {$outfile} "$line";
    }
    close $infile;
    close $outfile;
    &util::rm("$output_filestem.text.tmp");

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
1339
1340
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# The text is first extracted with any_to_text into
# "$output_filestem.text", then wrapped in a minimal HTML page: blank lines
# become <p>, other lines are prefixed with <br>, and the characters
# & < > are escaped. The intermediate .text file is removed afterwards.
#
# Returns 1 on success, 0 if any_to_text fails or if the text/HTML files
# cannot be opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    unless (open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }
    unless (open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # escape & first so the entities produced below are left alone
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_fh} "<p>";
        } else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1377
# Extract the readable strings from an arbitrary file, in the manner of
# the UNIX strings command, and write them to "$output_filestem.text".
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to scan (read in binary mode)
#   $output_filestem - output path without extension
#
# Returns 0 straight away when the file-level $use_strings setting is
# false, or when either file cannot be opened. Otherwise returns 1 if at
# least one chunk of text was written; if nothing was written the output
# file is removed and 0 is returned.
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    return 0 unless ($use_strings);

    print STDERR "\n**** In any to text****\n\n";
    open(IN, "<$input_filename") or return 0;
    binmode(IN);
    open(OUT, ">$output_filestem.text") or return 0;

    my $chunks_written = 0;
    while (<IN>) {
        my $chunk = $_;

        # every run of non-printable characters becomes a line break
        $chunk =~ s/[^\040-\176]+/\n/sg;

        # blank out every string shorter than 10 characters, repeating
        # until no such short string remains
        $chunk =~ s/^.{0,9}$/\n/mg;
        while ($chunk =~ m/^.{1,9}$/m) {
            $chunk =~ s/^.{0,9}$/\n/mg;
            $chunk =~ s/\n+/\n/sg;
        }

        # squeeze repeated line breaks and drop a leading one
        $chunk =~ s/\n+/\n/gs;
        $chunk =~ s/^\n//gs;

        # skip chunks that contain nothing but spaces and line breaks
        next unless ($chunk =~ m/[^\n ]/);

        print OUT $chunk;
        ++$chunks_written;
    }

    close OUT;
    close IN;

    # try to protect against binary only formats
    return 1 if ($chunks_written);

    &util::rm("$output_filestem.text");
    return 0;
}
Note: See TracBrowser for help on using the repository browser.