source: main/trunk/greenstone2/bin/script/wvware.pl@ 30499

Last change on this file since 30499 was 27522, checked in by ak19, 11 years ago

Correcting some minor bugs during build.

  • Property svn:executable set to *
File size: 16.8 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2009 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28# wvware.pl: Script to set the environment for wvware and then run it
29# Setting the env vars necessary for wvware here locally, won't interfere
30# with the normal environment if they had been set in setup.bash/setup.bat
31
32
BEGIN {
    # Both variables are expected to come from the Greenstone setup script;
    # without them neither the perllib directory nor the platform-specific
    # bin directory used later in this script can be located.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set - run the (gs3-)setup script\n";
    }
    if (!defined $ENV{'GSDLOS'}) {
        die "GSDLOS not set - run (gs3-)setup script\n";
    }

    # Make the modules under GSDLHOME/perllib loadable by the 'use' lines below.
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
38
39
40use strict;
41use util;
42use FileUtils;
43
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded, in which
# case the flag is normalised to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined($is_winnt_2000);
47
# Helper for main: open the fail log for appending.
# Returns a filehandle, or undef when no fail log was requested or it could
# not be opened (logging to the fail log is best-effort only).
sub _open_fail_log
{
    my ($faillogfile) = @_;

    return if (!defined $faillogfile || $faillogfile eq "");
    open(my $fh, '>>', $faillogfile) or return;
    return $fh;
}

# Entry point: set up the environment needed by wvWare, run it on a Word
# document and report through the exit status whether usable HTML resulted.
#
# Arguments:
#   $argc - number of command line arguments
#   @argv - <input-filename> <output-filestem> [<fail-log-file>]
#           [<verbosity>] [<timeout>]
#
# This sub never returns; it always terminates the process:
#   exit(-1)  wrong number of arguments (usage is printed to STDERR)
#   exit(1)   "<output-filestem>.html" was written and its first line
#             contains "DOCTYPE HTML"
#   exit(0)   wvWare failed or did not produce usable HTML
sub main
{
    my ($argc, @argv) = @_;

    # fileparse() is called by its fully qualified name below; load the module
    # explicitly instead of relying on another module having loaded it.
    require File::Basename;

    # Fewer than two arguments also covers a lone -h/--help request.
    if ($argc < 2 || $argc > 5) {
        # Strip any leading directory part. $0 is used unchanged when it
        # contains no path separator.
        (my $progname = $0) =~ s{^.*[/\\]}{}s;

        print STDERR "\n";
        print STDERR "Usage: $progname <input-filename> <output-filestem> [<fail-log-file>] [<verbosity>] [<timeout>]\n";
        print STDERR "\n";

        exit(-1);
    }

    my $input_filename  = $argv[0];
    my $output_filestem = $argv[1];
    my $faillogfile     = ($argc >= 3) ? $argv[2] : "";
    my $verbosity       = ($argc >= 4) ? $argv[3] : 0;
    my $timeout         = ($argc >= 5) ? $argv[4] : 0;

    ## SET THE ENVIRONMENT AS DONE IN SETUP.BASH/BAT OF GNOME-LIB

    if (!defined $ENV{'GEXTGNOME'}) {
        my $extdir = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "ext");

        # Prefer gnome-lib-minimal, fall back to gnome-lib.
        my $gnome_dir = FileUtils::filenameConcatenate($extdir, "gnome-lib-minimal");
        if (-d $gnome_dir) {
            $ENV{'GEXTGNOME'} = $gnome_dir;
        } else {
            $gnome_dir = FileUtils::filenameConcatenate($extdir, "gnome-lib");
            if (-d $gnome_dir) {
                $ENV{'GEXTGNOME'} = $gnome_dir;
            } elsif ($verbosity > 2) {
                print STDERR "No gnome-lib(-minimal) ext folder detected. Trying to run wvware without its libraries....\n";
            }
        }

        # Now set the other related env vars,
        # IF we've found the gnome-lib dir installed in the ext folder.
        if (defined $ENV{'GEXTGNOME'}) {
            $ENV{'GEXTGNOME_INSTALLED'} = FileUtils::filenameConcatenate($ENV{'GEXTGNOME'}, $ENV{'GSDLOS'});

            util::envvar_prepend("PATH", FileUtils::filenameConcatenate($ENV{'GEXTGNOME_INSTALLED'}, "bin"));

            # util's prepend will create LD/DYLD_LIB_PATH if it doesn't exist yet
            my $gextlib = FileUtils::filenameConcatenate($ENV{'GEXTGNOME_INSTALLED'}, "lib");
            if ($ENV{'GSDLOS'} eq "linux") {
                util::envvar_prepend("LD_LIBRARY_PATH", $gextlib);
            } elsif ($ENV{'GSDLOS'} eq "darwin") {
                util::envvar_prepend("DYLD_LIBRARY_PATH", $gextlib);
            }
        }

        # Above largely mimics the setup.bash of the gnome-lib-minimal.
        # Not doing the devel-srcpack that gnome-lib-minimal's setup.bash used to set.
        # Not exporting GSDLEXTS variable either.
    }

    # If no GEXTGNOME, maybe they do not need gnome-lib to run wvware.
    # RUN WVWARE

    my $wvWare = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    # On linux a self-contained "wv" folder (with its own bin and lib) takes
    # precedence over the plain wvWare binary when it is present.
    my $wvware_folder = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv");
    if (-d $wvware_folder && $ENV{'GSDLOS'} eq "linux") {
        util::envvar_prepend("PATH", FileUtils::filenameConcatenate($wvware_folder, "bin"));
        util::envvar_prepend("LD_LIBRARY_PATH", FileUtils::filenameConcatenate($wvware_folder, "lib"));
        $wvWare = FileUtils::filenameConcatenate($wvware_folder, "bin", "wvWare");
    }

    # Don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly.
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "etc",
                                                 "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into
    # folder docname-without-extension_files. This folder should be at
    # the same level as the html file generated from the doc.
    # wvWare will take care of proper interlinking.

    # This step is necessary for replace_srcdoc_with_html.pl which will
    # move the html and associated files into the import folder. We
    # want to ensure that the associated files won't overwrite similarly
    # named items already in import. Hence we put them in a folder first
    # (to which the html links properly) and that will allow
    # replace_srcdoc_with_html.pl to move them safely to /import.

    # To do all this, we need to use wvWare's --dir and --basename options
    # where dir is the full path to the image folder directory and
    # basename is the full path to the image folder appended to the name
    # which is to be prepended to every image file:
    # eg. if the images were to have names like sample0.jpg to sampleN.jpg,
    # then the basename is "/full/path/to/imgdir/sample".
    # In this case, basename is the full path to and name of the document.
    # HOWEVER: basename always takes full path, not relative url, so
    # the greenstone browser is unable to display the images (absolute paths
    # cause it to give an "external link" message).
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
    #   "added --dir option to wvHtml so that pictures can be placed in
    #    a seperate directory"
    #   "running wvWare through IMP to view word documents as html. It gets
    #    invoked like this:
    #    wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"

    # toppath is the folder containing the input document
    # docname is the name of the input document without its extension
    # the suffix (extension) is thrown away
    my ($docname, $toppath)
        = File::Basename::fileparse($input_filename, '\.[^\.]+$');

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = FileUtils::filenameConcatenate($toppath, $docname."_files");

    # Ensure this image directory exists;
    # if it exists already, just delete and recreate it.
    if (-e $assoc_dir) {
        FileUtils::removeFilesRecursive($assoc_dir);
    }
    FileUtils::makeDirectory($assoc_dir);

    # Value for wvWare's --basename: the document name inside the image folder.
    my $img_basenames = FileUtils::filenameConcatenate($assoc_dir, $docname);

    my $cmd = "";

    # The timeout ends up in a shell command line, so only accept plain digits.
    if ($timeout) {
        if ($timeout =~ m/^\d+$/) {
            $cmd = "ulimit -t $timeout;";
        } else {
            print STDERR "Ignoring non-numeric timeout value: $timeout\n";
        }
    }

    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing wv converter:|$!|\n";

        # Echo whatever wvWare wrote to its error file, both to STDERR and
        # (when available) to the fail log.
        if (-s "$output_filestem.err"
            && open(my $errfile, '<', "$output_filestem.err")) {
            my $faillog = _open_fail_log($faillogfile);

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if (defined $faillog);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if (defined $faillog);
            }

            close($faillog) if (defined $faillog);
            close($errfile);
        }
        exit(0); # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
        my $first_line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }

        if ($first_line && $first_line =~ m/DOCTYPE HTML/) {
            FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";

            # Remove the images directory if it is still empty after the html
            # was generated (in case there were no images in the word document).
            if (FileUtils::isDirectoryEmpty($assoc_dir)) {
                FileUtils::removeFilesRecursive($assoc_dir);
            } else {
                # There was an image folder (it was generated). Therefore, the
                # html file generated contains absolute links to the images.
                # Replace them with relative links instead, so the folder can
                # be moved elsewhere.
                make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            exit(1);
        }
    }

    # If here, an error of some sort occurred
    FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        # Append wvWare's error output to the fail log before discarding it.
        my $faillog = _open_fail_log($faillogfile);
        if (defined $faillog) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        FileUtils::removeFiles("$output_filestem.err");
    }

    exit(0);
}
287
# Run the script: main() receives the argument count followed by the arguments.
main(scalar(@ARGV), @ARGV);
289
290
291# Method to work with doc_to_html - Word docs might contain images.
292# When such word docs are converted with wvWare, we make it generate a
293# <filename>_files folder with the associated images, while the html file
294# <filename> refers to the images using absolute paths to <filename>_files.
295# This method reads in that html file and replaces all the absolute paths to
296# the images in <filename>_files with the relative paths to the images from
297# that folder. (I.e. with <filename>_files/<imagename.ext>).
sub make_links_to_assocdir_relative {
    # Rewrites, in place, the html file produced by wvWare so that links into
    # the associated-files folder are relative instead of absolute.
    #
    # Parameters:
    #   toppath        - top-level folder in which the html file resides (unused here)
    #   docname        - name (without extension) of the html file (unused here)
    #   html_file      - full path to the html file: /full/path/docname.html
    #   assoc_dir_path - toppath/docname_files
    #   assoc_dirname  - directory name of the folder with associated imgs: docname_files
    #
    # Returns 1 on success, 0 if the html file could not be read or rewritten.
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string.
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    # With $/ undefined the <...> operator reads the entire file at once.
    my $html_contents = do { local $/ = undef; <$fin> };
    close($fin);
    $html_contents = "" unless defined $html_contents;

    # 2. Replace *all* occurrences of assoc_dir_path in a-href and img-src
    # values with assoc_dirname.

    # wvWare puts %20 in place of each space, so the prefix searched for has
    # to be changed to match. quotemeta then escapes every character that is
    # special in a regular expression (., -, +, brackets, backslashes in
    # Windows paths, ...) so the path is matched literally.
    (my $path_in_html = $assoc_dir_path) =~ s/ /%20/g;
    my $path_re = quotemeta($path_in_html);

    # Look for <a or <img, followed by anything up to the first href= or src=,
    # an optional opening quote and then the associated folder's path.
    # Everything before the path is kept, the path itself is replaced by the
    # folder's name, and the rest up to and including the closing > is kept.
    # s: treat the whole document as a single line; g: replace all occurrences.
    $html_contents =~ s/(<(?:a|img).*?(?:href|src)=(?:\"|\')?)$path_re(.*?(?:\"|\')?.*?>)/$1$assoc_dirname$2/sg;

    # Now post-process the href/src value of each a/img tag (%20 back to a
    # literal space, backslashes to url slashes, remaining % to %25).
    # Only the attribute value itself is handed to the helper: either a quoted
    # string or an unquoted run of non-space characters. Text outside these
    # values is left untouched.
    $html_contents =~ s{(<(?:a|img)\b[^>]*?\b(?:href|src)=)("[^"]*"|'[^']*'|[^\s>]*)}
                       {post_process_assocfile_urls($1, $2, "")}sge;

    # Delete the original file and recreate it with the updated contents.
    FileUtils::removeFiles($html_file);

    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    print {$fout} $html_contents;
    close($fout);
    return 1;
}
378
379
380# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
381# url slashes (/) instead of windows-style backwards slashes, and to convert all %20
382# introduced in link pathnames by wvWare into space again. Converts all percent signs
383# introduced by URL encoding filenames generated into %25 in these url links referencing them
sub post_process_assocfile_urls
{
    # $pre and $post are passed through unchanged; only $text (the link
    # pathname) is rewritten.
    my ($pre, $text, $post) = @_;

    for ($text) {
        s/%20/ /g;      # %20 becomes a space (not an underscore, since underscores mess with incremental rebuild)
        tr{\\}{/};      # windows-style backslashes become url slashes
        s/%/%25/g;      # any percent sign still present is encoded as %25
    }

    return $pre . $text . $post;
}
Note: See TracBrowser for help on using the repository browser.