source: gsdl/trunk/bin/script/gsConvert.pl@ 18282

Last change on this file since 18282 was 18282, checked in by ak19, 15 years ago

Spaces in filenames are replaced with underscores just to be on the safe side. Tested that files with spaces in their names still work when using the Remote GS server and also work in the local case (such as mp3 and wmv files).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 43.9 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Compile-time setup: the project-local modules pulled in by the "use"
# lines that follow are looked up under $GSDLHOME/perllib, so that
# directory has to be at the front of @INC before they are compiled.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
50
51use parsargv;
52use util;
53use Cwd;
54use File::Basename;
55
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded, the require dies inside the eval,
# the eval yields undef, and we fall back to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() fills these in via parsargv::parse().
my $use_strings;            # -use_strings
my $pdf_complex;            # -pdf_complex
my $pdf_nohidden;           # -pdf_nohidden
my $pdf_zoom;               # -pdf_zoom (numeric, default 2)
my $pdf_ignore_images;      # -pdf_ignore_images
my $pdf_allow_images_only;  # -pdf_allow_images_only
my $windows_scripting;      # -windows_scripting
67
# Print the command-line synopsis to STDERR and terminate the script with
# exit status 1.  Never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimage_jpg|pagedimage_gif|pagedimage_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # the closing parenthesis was missing from this message
        "\t\t-pdf_complex is set)\n",
    );

    print STDERR @usage_lines;
    exit(1);
}
89
# File-wide settings, overwritten by main() when the matching options are
# given: the -errlog file that converter errors are appended to ("" means
# no log), and the -timeout CPU limit in seconds (0 means no "ulimit -t"
# prefix is added to the commands built below).
my ($faillogfile, $timeout) = ("", 0);
92
# Script driver.
#
# Parses the command line, checks that exactly one readable input file was
# given, works out the input type (from -type, else from the filename
# extension), changes into the file's directory, runs the matching
# convert* routine and prints the type string it returns followed by a
# newline on STDOUT.
#
# Arguments: the command-line argument list.
# Exits with status 1 (via print_usage or directly) on usage errors, an
# unreadable input file, or an unknown/missing input type.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimage).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'use_strings', \$use_strings,
                         'windows_scripting', \$windows_scripting,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                        ))
    {
        print_usage();
    }

    # Exactly one filename must remain after option parsing
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames: $suffix is ".ext", or "" when there is no extension
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the lower-cased extension without
    # its leading dot.  An extension-less file leaves the type empty so
    # that the dedicated error below is actually reachable.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc" || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ppt") {
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "xls") {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}
180
# Script entry point: hand the command-line arguments to main().
main(@ARGV);
182
183
184
185# Document-type conversion functions
186#
187# The following functions attempt to convert documents from their
188# input type to the specified output type. If no output type was
189# given, then they first attempt HTML, and then TEXT.
190#
# Each returns the output type ("html", "text", or "item" when paged
# images were produced) or "fail" if no conversion is possible.
193
194# Convert a Microsoft word document
195
# Convert a file that carries a Word (.doc/.dot) extension.
#
# Arguments: input filename, output file stem (output path without
# extension), requested output type (may be empty/undef).
# Returns whatever the converter it delegates to returns.
sub convertDOC {
    # lexical copies: these used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = &find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    } elsif ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    } else {
        return &convertAnything($input_filename, $output_filestem, $output_type);
    }
}
210
211# Convert a Microsoft word 6/7/8 document
212
# Convert a Microsoft Word 6/7/8 document.
#
# When no output type is given, or one matching /html/i, HTML conversion
# is attempted first: via native_doc_to_html when -windows_scripting is
# set, otherwise via doc_to_html.  If that is skipped or fails, the job is
# handed to convertAnything.
#
# Returns "html" on success of the HTML step, else convertAnything's result.
sub convertWord678 {
    # lexical copies: these used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;
    if (!$output_type || ($output_type =~ m/html/i)) {
        if ($windows_scripting) {
            $success = &native_doc_to_html($input_filename, $output_filestem);
        }
        else {
            $success = &doc_to_html($input_filename, $output_filestem);
        }
        if ($success) {
            return "html";
        }
    }
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
230
231
232# Convert a Rich Text Format (RTF) file
233
# Convert a Rich Text Format (RTF) file.
#
# Only HTML output is attempted (when no output type is given, or one
# matching /html/i): native_doc_to_html under -windows_scripting,
# rtf_to_html otherwise.
#
# Returns "html" on success, "fail" otherwise.
sub convertRTF {
    # lexical copies: these used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {

        if ($windows_scripting) {
            $success = &native_doc_to_html($input_filename, $output_filestem);
        }
        else {
            $success = &rtf_to_html($input_filename, $output_filestem);
        }
        if ($success) {
            return "html";
        }
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    #    return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
258
259
260# Convert an unidentified file
261
# Convert an unidentified file.
#
# Tries any_to_html (when no output type is given, or one matching
# /html/i), then any_to_text (no output type, or one matching /text/i).
#
# Returns "html", "text" or "fail".
sub convertAnything {
    # lexical copies: these used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        $success = &any_to_html($input_filename, $output_filestem);
        if ($success) {
            return "html";
        }
    }

    # Convert to text
    if (!$output_type || ($output_type =~ m/text/i)) {
        $success = &any_to_text($input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }
    return "fail";
}
284
285
286
287# Convert an Adobe PDF document
288
# Convert an Adobe PDF document.
#
# Arguments: directory of the input file, input filename, output file
# stem, requested output type.  Anything up to and including the last "-"
# in the output type is stripped first.
#
# Tries, in order: image output (type matching jpg/jg/gif/png), HTML, TEXT.
# Returns "item", "html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # An empty output type means "try HTML, then TEXT"
    my $no_preference = !$output_type;

    # Attempt conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html"
            if &pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text"
            if &pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
321
322
323# Convert an Adobe PostScript document
324
# Convert an Adobe PostScript document.
#
# Arguments: input filename, output file stem, requested output type.
# Anything up to and including the last "-" in the output type is
# stripped first.
#
# Tries image output (type matching jpg/jg/gif/png), then TEXT when no
# type is given or it matches /text/i.
# Returns "item", "text" or "fail".
sub convertPS {
    # lexical copies: these used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # pdfps_to_img takes the input file's directory as its first argument,
    # but this routine is not handed one; derive it from the filename with
    # fileparse, as main() does for the value it passes to convertPDF.
    # NOTE(review): pdfps_to_img is not visible here - confirm this is the
    # directory it expects.
    my (undef, $dirname) = File::Basename::fileparse($input_filename);

    my $success = 0;
    $output_type =~ s/.*\-(.*)/$1/i;
    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        if ($success) {
            return "item";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        $success = &ps_to_text($input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }
    return "fail";
}
347
348
# Convert a PowerPoint document.
#
# Arguments: input filename, output file stem, requested output type.
#
# With -windows_scripting and an output type that is neither html nor
# text, the pptextract script is run to produce slide images in the
# directory named by the output stem.  Otherwise, for html (or no type),
# ppttohtml.pl is run.  If the chosen step fails or none applies,
# any_to_text is tried.
#
# Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;
    # an absent output type behaves like an empty one, without the
    # "uninitialized value" warnings from the pattern matches below
    $output_type = "" unless defined $output_type;

    #if (!$output_type || $windows_scripting || ($output_type !~ m/html/i) || ($output_type !~ m/text/i)){
    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # pick the pptextract flag for the requested image format
        my $ppt_convert_type = "";
        if ($output_type =~ m/gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ m/jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ m/png/i) {
            $ppt_convert_type = "-p";
        }
        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

        # if the converting directory has already existed
        if (-d $output_filestem) {
            print STDERR "**The conversion directory has existed\n";
            return "item";
        }

        # $cmd is lexical: it used to leak into a package global
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout;";}
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
        if (system($cmd) != 0) {
            print STDERR "Powerpoint VB Scripting convert failed\n";
        } else {
            return "item";
        }
    } elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML
        # formulate the command
        my $cmd = "";
        $cmd .= "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # last resort: plain text extraction
    my $success = &any_to_text($input_filename, $output_filestem);
    if ($success) {
        return "text";
    }

    return "fail";
}
410
411
# Convert an Excel document.
#
# Arguments: input filename, output file stem, requested output type.
#
# For html (or no type) xlstohtml.pl is run; if that is skipped or fails,
# any_to_text is tried.
#
# Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command ($cmd is lexical: it used to leak into a
        # package global)
        my $cmd = "";
        $cmd .= "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # last resort: plain text extraction
    my $success = &any_to_text($input_filename, $output_filestem);
    if ($success) {
        return "text";
    }

    return "fail";
}
444
445
446
447# Find the real type of a .doc file
448#
449# We seem to have a lot of files with a .doc extension that are .rtf
450# files or Word 5 files. This function attempts to tell the difference.
# Find the real type of a .doc file.
#
# Argument: the filename to inspect.
# Returns:
#   "rtf"      if the first line starts with "{\rtf";
#   "wordN"    (N = 6, 7 or 8) if any line contains "Word.Document.N";
#   "unknown"  otherwise, including when the file cannot be opened.
sub find_docfile_type {
    my ($input_filename) = @_;

    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type  = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        if ($first) {
            $first = 0;
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
482
483
484# Specific type-to-type conversions
485#
486# Each of the following functions attempts to convert a document from
487# a specific format to another. If they succeed they return 1 and leave
488# the output document(s) in the appropriate place; if they fail they
489# return 0 and delete any working files.
490
491
492# Attempt to convert a word document to html with the wv program
# Attempt to convert a word document to html with the wv program.
#
# Arguments: input filename, output file stem.
# Writes "$output_filestem.html"; images found in the document go into a
# "<docname>_files" directory next to the input file.
# Returns 1 on success.  On failure returns 0, removes the generated html
# and .err files where the code below does so, and appends the converter's
# error output to the -errlog file if one was given.
sub doc_to_html {
    # lexical copies: these used to be assigned to package globals
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                     $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into
    # folder docname-without-extension_files. This folder should be at
    # the same level as the html file generated from the doc.
    # wvWare will take care of proper interlinking.

    # This step is necessary for replace_srcdoc_with_html.pl which will
    # move the html and associated files into the import folder. We
    # want to ensure that the associated files won't overwrite similarly
    # named items already in import. Hence we put them in a folder first
    # (to which the html links properly) and that will allow
    # replace_srcdoc_with_html.pl to move them safely to /import.

    # To do all this, we need to use wvWare's --dir and --basename options
    # where dir is the full path to the image folder directory and
    # basename is the full path to the image folder appended to the name
    # which is to be prepended to every image file:
    # eg. if the images were to have names like sample0.jpg to sampleN.jpg,
    # then the basename is "/full/path/to/imgdir/sample".
    # In this case, basename is the full path to and name of the document.
    # HOWEVER: basename always takes full path, not relative url, so
    # the greenstone browser is unable to display the images (absolute paths
    # cause it to give an "external link" message)
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
    # "added --dir option to wvHtml so that pictures can be placed in
    # a separate directory"
    # "running wvWare through IMP to view word documents as html. It gets
    # invoked like this:
    # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # value handed to --basename: the image folder path joined with the
    # document name
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err"
            && open(my $errfile, '<', "$output_filestem.err")) {

            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            # echo every non-blank error line; flag wv "startup error"s
            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($write_to_fail_log);
                }
                if ($line !~ m/startup error/) {next;}
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }
            close($faillog) if ($write_to_fail_log);
            close($errfile);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        # only the first line is inspected
        if ($line && $line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            # Remove the images directory if it was still empty after
            # the html was generated (in case there were no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                &util::rm_r($assoc_dir);
            } else { # there was an image folder (it was generated)
                # Therefore, the html file generated contains absolute links to the images
                # Replace them with relative links instead, so the folder can be moved elsewhere
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {print {$faillog} $err_line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
642
643# Method to work with doc_to_html - Word docs might contain images.
644# When such word docs are converted with wvWare, we make it generate a
645# <filename>_files folder with the associated images, while the html file
646# <filename> refers to the images using absolute paths to <filename>_files.
647# This method reads in that html file and replaces all the absolute paths to
648# the images in <filename>_files with the relative paths to the images from
649# that folder. (I.e. with <filename>_files/<imagename.ext>).
# Rewrite, in place, the html file produced for a Word document so that
# links into its associated-files folder are relative instead of absolute.
#
# Arguments:
#   toppath        - top-level folder in which the html file resides
#   docname        - name (without extension) of the html file
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - directory name of the folder with associated imgs:
#                    docname_files
# Returns 1 on success, 0 if the html file cannot be opened for reading
# or for writing (a message is printed to STDERR).
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    my $html_contents;
    {
        local $/ = undef;       # Read entire file at once
        $html_contents = <$fin>; # Now file is read in as one single 'line'
    }
    close($fin);

    # 2. Replace *all* occurrences of assoc_dir_path in a-href and img-src
    # values with assoc_dirname.

    # wvWare puts %20 in place of space, so change our prefix to match
    # first; then quotemeta escapes every character that is special in a
    # regular expression (not only \ . - [ ] but also ( ) + ? * | { } ^ $),
    # so the path is always matched literally and can never add capture
    # groups that would shift the $5 used below.
    (my $url_style_path = $assoc_dir_path) =~ s/ /%20/g;
    my $safe_reg_expression = quotemeta($url_style_path);

    # The substitution looks for <a or <img, followed by any other
    # attributes and values until it comes to the FIRST (non-greedy) href=
    # or src=, followed by " or ' or no quote at all, followed by the
    # associated folder's pathname, followed by characters (for the img
    # filename), then the optional closing quote and any other attributes
    # up to the first > that ends the tag.  Everything before the folder
    # pathname is kept, the pathname is replaced by the folder's directory
    # name, and the rest up to and including > is kept.
    # /s lets . match newlines so tags may span lines; /g replaces all
    # occurrences.
    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)$safe_reg_expression(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg;

    # now post-process the href/src values (%20, backslashes, percent
    # signs) through post_process_assocfile_urls.
    # NOTE(review): the middle group is a greedy (.*) under /s, so it can
    # run far beyond a single tag - confirm this is intended.
    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&post_process_assocfile_urls($1, $5, $6)/sge;

    # delete the original file and recreate it
    my $copy_of_filename = $html_file;
    &util::rm($copy_of_filename);

    # Recreate the original file for writing the updated contents
    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    # write out the updated contents and close the file
    print {$fout} $html_contents;
    close($fout);
    return 1;
}
728
729# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
730# url slashes (/) instead of windows-style backwards slashes, and to convert all %20
731# introduced in link pathnames by wvWare into space again. Converts all percent signs
732# introduced by URL encoding filenames generated into %25 in these url links referencing them
# Tidy up the link text of an href/src attribute and glue the pieces back
# together.
#
# Arguments: the text before the link, the link text, the text after it.
# In the link text only: every "%20" becomes "_", every backslash becomes
# "/", and then every remaining "%" becomes "%25" (in that order).
# Returns the three pieces concatenated.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {
        s/%20/_/g;      # spaces are represented by underscores
        tr{\\}{/};      # windows path separators -> url slashes
        s/%/%25/g;      # url-encode whatever percent signs are left
    }

    return join("", $pre, $text, $post);
}
743
744# Attempt to convert a word document to html with the word2html scripting program
# Attempt to convert a word document to html with the word2html scripting
# program.
#
# Arguments: input filename, output file stem.
# Returns 1 if "$output_filestem.html" already exists or the conversion
# produced a non-empty file whose first line contains "html"; returns 0
# otherwise.  On failure the converter's error output is appended to the
# -errlog file if one was given.
sub native_doc_to_html {
    # lexical copies: these used to be assigned to package globals
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
    if (-e "$output_filestem.html") {
        print STDERR "*** The conversion file has existed\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err"
            && open(my $errfile, '<', "$output_filestem.err")) {

            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            # echo every non-blank error line; flag "startup error"s
            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($write_to_fail_log);
                }
                if ($line !~ m/startup error/) {next;}
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }
            close($faillog) if ($write_to_fail_log);
            close($errfile);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        # only the first line is inspected
        if ($line && $line =~ m/html/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {print {$faillog} $err_line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
820
821# Attempt to convert an RTF document to html with rtftohtml
822
# Attempt to convert an RTF document to html with rtftohtml.
#
# Arguments: input filename, output file stem.
# The conversion counts as successful when "$output_filestem.html" has,
# after its <body> line, at least one line with word characters left once
# tags are stripped.  If rtftohtml also wrote "${output_filestem}_ToC.html",
# that table of contents is spliced into the html after the header lines.
# Returns 1 on success; on failure returns 0, logs to the -errlog file if
# one was given, and removes the .err and .html files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command ($cmd is lexical: it used to leak into a
    # package global)
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html"
        && open(my $html_in, '<', "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_in>) {

            if ($past_header == 0) {
                if ($line =~ m/<body>/) {$past_header = 1;}
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/ && $past_header) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html", "$output_filestem.src");

            # Each open is checked on its own.  The earlier form
            # "open FH, $name || ++$open_failed" bound || to the filename,
            # so a failed open was never noticed.
            my ($src_fh, $toc_fh, $out_fh);
            my $src_ok = open($src_fh, '<', "$output_filestem.src");
            my $toc_ok = open($toc_fh, '<', "${output_filestem}_ToC.html");
            my $out_ok = open($out_fh, '>', "$output_filestem.html");

            if (!($src_ok && $toc_ok && $out_ok)) {
                # give up on the ToC and put the converted html back
                close($src_fh) if $src_ok;
                close($toc_fh) if $toc_ok;
                close($out_fh) if $out_ok;
                &util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to (but
            # not including) the first line without word characters
            while (defined(my $header_line = <$src_fh>)) {
                last unless $header_line =~ m/\w/;
                print {$out_fh} $header_line;
            }

            # print out table of contents, making links relative
            my $ignored = <$toc_fh>;            # ignore first 2 lines
            $ignored = <$toc_fh>;
            my $list_open = <$toc_fh>;          # line 3 = "<ol>\n"
            print {$out_fh} $list_open if defined $list_open;
            while (my $line = <$toc_fh>) {
                $line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@;
                print {$out_fh} $line;
            }
            close($toc_fh);

            # rest of html src
            while (my $line = <$src_fh>) {
                print {$out_fh} $line;
            }
            close($src_fh);
            close($out_fh);

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {print {$faillog} $err_line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
938
939
# Convert a pdf file to html with the pdftohtml command
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the pdf file to convert
#   $output_filestem - output path without extension; success is judged by
#                      the presence of a non-empty "$output_filestem.html"
#
# Uses the following variables defined outside this sub: $timeout,
# $pdf_zoom, $pdf_complex, $pdf_ignore_images, $pdf_allow_images_only,
# $pdf_nohidden, $is_winnt_2000 and $faillogfile.
#
# Returns 1 on success, 0 on failure.  The temporary ".out" and ".err" files
# are removed in both cases; on failure the converter's error log is echoed
# to STDERR and appended to $faillogfile (when that is set).

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # Build the command line in a lexical so that no package variable
    # called $cmd is overwritten as a side effect.
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # when $is_winnt_2000 is false on windows only stdout is
        # redirected, and it goes into the .err file
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s $html_file) {
        &util::rm($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file)
                or die "Couldn't open $err_file: $!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR $line;
            }
            close $errlog;
        }

        &util::rm($html_file) if (-e $html_file);

        if (-e $err_file) {
            # copy the error log into the fail log, if one was requested
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {
                        print $faillog $line;
                    }
                    close $errlog;
                } else {
                    print STDERR "Couldn't open $err_file: $!\n";
                }
                close $faillog;
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1001
# Convert a pdf file to various types of image with the convert command
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the pdf/ps file to convert
#   $output_filestem - output path stem handed to pdfpstoimg.pl
#   $output_type     - requested output type; everything up to and including
#                      the last underscore is stripped before it is passed
#                      to pdfpstoimg.pl as -convert_to
#
# Uses the following variables defined outside this sub: $timeout,
# $is_winnt_2000 and $faillogfile.
#
# Returns 1 if pdfpstoimg.pl exited with status 0, otherwise 0.  Also
# returns 0 without running the converter when ImageMagick's "identify"
# cannot be run.

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $identify_output = `identify 2>&1`;
        # "program not found" shows up differently per platform: -1 when
        # nothing could be started, 256 on Windows, and exit status 127
        # from the shell on Unix-like systems.
        if ($? == -1 || $? == 256 || ($? >> 8) == 127) {
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    # Build the command line in a lexical so that no package variable
    # called $cmd is overwritten as a side effect.
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    $cmd .= "perl -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        # name the script that was actually run
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # Only the exit status is checked; the output files themselves are not.
    if ($retval != 0) {
        &util::rm($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file)
                or die "Couldn't open $err_file: $!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $line = <$errlog>) {
                print STDERR $line;
            }
            close $errlog;
        }

        if (-e $err_file) {
            # copy the error log into the fail log, if one was requested
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {
                        print $faillog $line;
                    }
                    close $errlog;
                } else {
                    print STDERR "Couldn't open $err_file: $!\n";
                }
                close $faillog;
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1070
# Convert a PDF file to text with the pdftotext command
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF file to convert
#   $output_filestem - output path without extension; the text is written
#                      to "$output_filestem.text"
#
# Uses $faillogfile, which is defined outside this sub.
#
# Returns 1 if a text file containing at least one word character was
# produced, 0 otherwise.  The temporary ".out" and ".err" files are removed
# in both cases; on failure the error log is echoed to STDERR and appended
# to $faillogfile (when that is set).

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extracted, '<', $text_file)) {
            binmode($extracted); # just in case...
            while (my $line = <$extracted>) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extracted;
        } else {
            # an unreadable file is treated the same as one with no text
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converters std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file)
                or die "Couldn't open $err_file: $!";
            print STDERR "pdftotext error log:\n";
            while (my $line = <$errlog>) {
                print STDERR $line;
            }
            close $errlog;
        }

        &util::rm($out_file) if (-e $out_file);
        &util::rm($text_file) if (-e $text_file);

        if (-e $err_file) {
            # copy the error log into the fail log, if one was requested
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {
                        print $faillog $line;
                    }
                    close $errlog;
                } else {
                    print STDERR "Couldn't open $err_file: $!\n";
                }
                close $faillog;
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    # the stdout redirect above creates a .out file on non-windows systems;
    # remove it on success too so it is not left behind
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1136
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file to convert
#   $output_filestem - output path without extension; the text is written
#                      to "$output_filestem.text"
#
# Uses $timeout and $faillogfile, which are defined outside this sub.
#
# gs is tried first (never on windows).  If that fails for any reason the
# text is stripped out of the PostScript source with a series of regular
# expressions.  Whichever method produced the text, it is finally re-wrapped
# so that each line breaks at the first whitespace after 72 characters.
#
# Returns 1 on success.  Returns 0 if the fallback cannot read the input,
# cannot write the output, or the input does not start with "%!".  Dies if
# the temporary files used for re-wrapping cannot be opened.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        # quoted like the other file names so that paths with spaces work
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        my $retcode = $? >> 8; # see man perlfunc - system for this...
        # if system returns -1 | 127 (couldn't start program), look at $! for message

        if ($retcode != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;
                # an empty output file has no first line to inspect
                if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close $psout;
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e $err_file) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {
                        print $faillog $line;
                    }
                    close $errlog;
                }
            }
            close $faillog;
        }
        &util::rm($err_file) if (-e $err_file);


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        # Read and validate the input before the output file is created, so
        # that a failure does not leave an empty .text file behind.
        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        my $text = join('', <$in>); # this is for whole .ps file...
        close $in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        my $out;
        if (!open($out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print $out "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);
    # "or" (not "||") so that the die applies to open() itself rather than
    # to the file name argument
    open(my $infile, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $outfile "$1\n";
            } else {
                # short enough already (or, defensively, could not be
                # split): emit what is left and move to the next line
                print $outfile "$line";
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
1306
1307
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# The text is first extracted with any_to_text() into
# "$output_filestem.text", which is then wrapped in a minimal HTML page:
# blank lines become "<p>" and every other line is prefixed with "<br> ".
# The intermediate text file is removed afterwards.
#
# Returns 1 on success, 0 if any_to_text() fails or a file cannot be opened.

sub any_to_html {
    # lexical copies, so that package variables of the same names are
    # not overwritten
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my $text_in;
    if (!open($text_in, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    my $html_out;
    if (!open($html_out, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_in;
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print $html_out "<html><head>\n";
    print $html_out "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_out "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_out "</head><body>\n\n";

    while (my $line = <$text_in>) {
        # escape the characters that are special in HTML; "&" has to go
        # first so the entities produced below are not escaped again
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_out "<p>";
        } else {
            print $html_out "<br> ", $line;
        }
    }
    print $html_out "\n</body></html>\n";

    close $html_out;
    close $text_in;

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1344
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to extract text from
#   $output_filestem - output path without extension; the text is written
#                      to "$output_filestem.text"
#
# Uses $use_strings, which is defined outside this sub; when it is false
# nothing is attempted.
#
# Each input line has its runs of non-printable characters turned into
# line breaks, and any resulting piece shorter than 10 characters dropped.
#
# Returns 1 if at least one chunk of text was written.  Returns 0 (and
# removes the output file) if nothing survived, and 0 if $use_strings is
# false or a file could not be opened.

sub any_to_text {
    # lexical copies, so that package variables of the same names are
    # not overwritten
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    my $text_file = "$output_filestem.text";

    print STDERR "\n**** In any to text****\n\n";
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', $text_file)) {
        # don't leave the input handle open on the way out
        close $in;
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm($text_file);
    return 0;

}
Note: See TracBrowser for help on using the repository browser.