source: main/trunk/greenstone2/bin/script/wvware.pl@ 24375

Last change on this file since 24375 was 24375, checked in by ak19, 11 years ago

Added in verbosity option when launching wvware.pl, so that an unnecessary message can be suppressed at lower verbosity levels.

  • Property svn:executable set to *
File size: 16.4 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# wvware.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2009 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# Script to set the environment for wvware and then run it
30# Setting the env vars necessary for wvware here locally, won't interfere
31# with the normal environment if they had been set in setup.bash/setup.bat
32
33
BEGIN {
    # Both variables are exported by the Greenstone setup script; refuse to
    # start without them, then make Greenstone's perllib visible to "use".
    my @required_env = (
        [ 'GSDLHOME', "GSDLHOME not set - run the (gs3-)setup script\n" ],
        [ 'GSDLOS',   "GSDLOS not set - run (gs3-)setup script\n" ],
    );
    foreach my $requirement (@required_env) {
        my ($env_name, $complaint) = @$requirement;
        if (!defined $ENV{$env_name}) {
            die $complaint;
        }
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
39
40
41use strict;
42use util;
43
# Are we running on WinNT or Win2000 (or later)?
# The Win32 module can only be loaded on Windows; anywhere else the eval
# fails and yields undef, which is normalised to 0 below.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;
47
# Sets up the environment needed by wvWare and runs it to convert a Word
# document to HTML.
#
# Arguments: ($argc, @argv) where @argv is
#   <input-filename> <output-filestem> [<fail-log-file>] [<verbosity>] [<timeout>]
#
# Never returns; always terminates the process:
#   exit(-1)  wrong number of arguments (usage is printed to STDERR)
#   exit(1)   "<output-filestem>.html" was produced and its first line
#             contains "DOCTYPE HTML"
#   exit(0)   wvWare failed or produced no usable HTML
# NOTE(review): 1 = success / 0 = failure is the inverse of the usual shell
# convention; presumably the calling script relies on it - confirm before
# changing.
sub main
{
    my ($argc, @argv) = @_;

    # fileparse() is called fully qualified below; make sure the module is
    # loaded instead of relying on another module having pulled it in.
    require File::Basename;

    # A lone help flag ($argc == 1) is already covered by "$argc < 2".
    if ($argc < 2 || $argc > 5) {
        # Fall back to $0 itself when it contains no directory separator.
        my ($progname) = ($0 =~ m/([^\/\\]+)$/);
        $progname = $0 unless defined $progname;

        print STDERR "\n";
        print STDERR "Usage: $progname <input-filename> <output-filestem> [<fail-log-file>] [<verbosity>] [<timeout>]\n";
        print STDERR "\n";

        exit(-1);
    }

    my $input_filename  = $argv[0];
    my $output_filestem = $argv[1];
    my $faillogfile = ($argc >= 3) ? $argv[2] : "";
    my $verbosity   = ($argc >= 4) ? $argv[3] : 0;
    my $timeout     = ($argc >= 5) ? $argv[4] : 0;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    ## SET THE ENVIRONMENT AS DONE IN SETUP.BASH/BAT OF GNOME-LIB
    # Only done when GEXTGNOME is not already set, so a normal environment
    # prepared by setup.bash/setup.bat is left alone.

    if (!defined $ENV{'GEXTGNOME'}) {
        my $extdir = &util::filename_cat($ENV{'GSDLHOME'}, "ext");

        # Prefer gnome-lib-minimal, fall back to gnome-lib.
        foreach my $ext_name ("gnome-lib-minimal", "gnome-lib") {
            my $gnome_dir = &util::filename_cat($extdir, $ext_name);
            if (-d $gnome_dir) {
                $ENV{'GEXTGNOME'} = $gnome_dir;
                last;
            }
        }
        if (!defined $ENV{'GEXTGNOME'} && $verbosity > 2) {
            print STDERR "No gnome-lib(-minimal) ext folder detected. Trying to run wvware without its libraries....\n";
        }

        # Now set the other related env vars,
        # IF we've found the gnome-lib dir installed in the ext folder.
        if (defined $ENV{'GEXTGNOME'}) {
            $ENV{'GEXTGNOME_INSTALLED'} = &util::filename_cat($ENV{'GEXTGNOME'}, $ENV{'GSDLOS'});

            &util::envvar_prepend("PATH", &util::filename_cat($ENV{'GEXTGNOME_INSTALLED'}, "bin"));

            # util's prepend will create LD/DYLD_LIB_PATH if it doesn't exist yet
            my $gextlib = &util::filename_cat($ENV{'GEXTGNOME_INSTALLED'}, "lib");
            if ($ENV{'GSDLOS'} eq "linux") {
                &util::envvar_prepend("LD_LIBRARY_PATH", $gextlib);
            } elsif ($ENV{'GSDLOS'} eq "darwin") {
                &util::envvar_prepend("DYLD_LIBRARY_PATH", $gextlib);
            }
        }

        # Above largely mimics the setup.bash of the gnome-lib-minimal.
        # Not doing the devel-srcpack that gnome-lib-minimal's setup.bash used to set.
        # Not exporting GSDLEXTS variable either.
    }

    # If no GEXTGNOME, maybe they do not need gnome-lib to run wvware.
    # RUN WVWARE

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    # On linux a self-contained "wv" folder (with its own bin and lib) takes
    # precedence when present.
    my $wvware_folder = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv");
    if (-d $wvware_folder && $ENV{'GSDLOS'} eq "linux") {
        &util::envvar_prepend("PATH", &util::filename_cat($wvware_folder, "bin"));
        &util::envvar_prepend("LD_LIBRARY_PATH", &util::filename_cat($wvware_folder, "lib"));
        $wvWare = &util::filename_cat($wvware_folder, "bin", "wvWare");
    }

    # Don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly.
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into
    # folder docname-without-extension_files. This folder should be at
    # the same level as the html file generated from the doc.
    # wvWare will take care of proper interlinking.
    #
    # This step is necessary for replace_srcdoc_with_html.pl which will
    # move the html and associated files into the import folder. We
    # want to ensure that the associated files won't overwrite similarly
    # named items already in import. Hence we put them in a folder first
    # (to which the html links properly) and that will allow
    # replace_srcdoc_with_html.pl to move them safely to /import.
    #
    # To do all this, we need to use wvWare's --dir and --basename options
    # where dir is the full path to the image folder directory and
    # basename is the full path to the image folder appended to the name
    # which is to be prepended to every image file:
    # eg. if the images were to have names like sample0.jpg to sampleN.jpg,
    # then the basename is "/full/path/to/imgdir/sample".
    # In this case, basename is the full path to and name of the document.
    # HOWEVER: basename always takes full path, not relative url, so
    # the greenstone browser is unable to display the images (absolute paths
    # cause it to give an "external link" message). The links are therefore
    # made relative again after a successful conversion (see below).
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");

    # Ensure this image directory exists;
    # if it exists already, just delete and recreate.
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # Prefix (full path plus document name) handed to wvWare's --basename
    # for naming the image files.
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    my $cmd = "";

    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$html_file\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing wv converter:|$!|\n";
        if (-s $err_file) {
            # Echo wvWare's error output to STDERR and, when a fail log was
            # given and can be appended to, to that log as well.
            my $fail_fh;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($fail_fh, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $err_fh, '<', $err_file)) {
                while (my $line = <$err_fh>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$fail_fh} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$fail_fh} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($err_fh);
            }
            close($fail_fh) if ($write_to_fail_log);
        }
        exit(0); # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s $html_file) { # if file has non-zero size (i.e. it has contents)
        my $first_line;
        if (open(my $html_fh, '<', $html_file)) {
            $first_line = <$html_fh>;
            close($html_fh);
        }
        if ($first_line && $first_line =~ m/DOCTYPE HTML/) {
            &util::rm($err_file) if -e $err_file;

            # Remove the images directory if it is still empty after the html
            # was generated (in case there were no images in the word document).
            if (&util::is_dir_empty($assoc_dir)) {
                &util::rm_r($assoc_dir);
            } else { # there was an image folder (it was generated)
                # Therefore, the html file generated contains absolute links to the images.
                # Replace them with relative links instead, so the folder can be moved elsewhere.
                &make_links_to_assocdir_relative($toppath, $docname, $html_file, $assoc_dir, $docname."_files");
            }
            exit(1);
        }
    }

    # If here, an error of some sort occurred: discard the unusable html and
    # move whatever wvWare wrote to its error file into the fail log.
    &util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
            if (open(my $err_fh, '<', $err_file)) {
                while (my $line = <$err_fh>) {
                    print {$fail_fh} $line;
                }
                close($err_fh);
            }
            close($fail_fh);
        }
        &util::rm($err_file);
    }

    exit(0);
}
287
# Script entry point: main expects the argument count followed by the arguments.
my @cmdline_args = @ARGV;
&main(scalar(@cmdline_args), @cmdline_args);
289
290
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder (i.e. with <filename>_files/<imagename.ext>), then tidies up
# the href/src values through post_process_assocfile_urls.
#
# Parameters:
#   toppath        - top-level folder in which the html file resides (unused here)
#   docname        - name (without extension) of the html file (unused here)
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - directory name of the folder with associated imgs: docname_files
#
# Returns 1 when the html file was rewritten, 0 (after reporting on STDERR)
# when it could not be read or written.
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string.
    my $html_contents;
    {
        my $fin;
        unless (open($fin, '<', $html_file)) {
            print STDERR "wvware.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
            return 0;
        }
        local $/ = undef;       # no record separator: read the entire file at once
        $html_contents = <$fin>;
        close($fin);
    }
    $html_contents = "" unless defined $html_contents;

    # 2. Replace *all* occurrences of assoc_dir_path at the start of href and
    #    src values with assoc_dirname.
    # wvWare puts %20 in place of spaces, so the prefix we look for needs the
    # same treatment. The path is then matched literally: quotemeta escapes
    # every regex metacharacter (., -, [, ], (, ), +, backslashes, ...).
    (my $url_prefix = $assoc_dir_path) =~ s/ /%20/g;
    my $prefix_re = quotemeta($url_prefix);

    # Looks for <a or <img, followed by anything up to the first href= or
    # src= that is directly followed (after an optional quote) by the
    # associated folder's path. Everything before the path and everything
    # after it up to the first > is retained; only the path is replaced.
    # /s lets . match newlines too, so a tag may span lines; /g replaces
    # every occurrence.
    $html_contents =~ s/(<(?:a|img).*?(?:href|src)=["']?)$prefix_re(.*?>)/$1$assoc_dirname$2/sg;

    # 3. Post-process the value of each href/src attribute of <a> and <img>
    #    tags (%20 back to spaces, backslashes to slashes, % to %25).
    # Only the attribute value itself is handed over: a double-quoted,
    # single-quoted or bare value, so that text outside the link is never
    # touched.
    $html_contents =~ s{
        (<(?:a|img)\b[^>]*?(?:href|src)=)
        (?: "([^"]*)" | '([^']*)' | ([^\s"'>]+) )
    }{
        my ($quote, $url) = defined $2 ? ('"', $2)
                          : defined $3 ? ("'", $3)
                          :              ('',  $4);
        &post_process_assocfile_urls($1 . $quote, $url, $quote);
    }gex;

    # 4. Write the updated contents back. Opening with '>' truncates the
    #    existing file, so it does not have to be deleted first (and is not
    #    lost if it cannot be opened for writing).
    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "wvware.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }
    print {$fout} $html_contents;
    unless (close($fout)) {
        # buffered write errors only surface at close
        print STDERR "wvware.pl: Unable to finish writing $html_file...ERROR: $!\n";
        return 0;
    }
    return 1;
}
376
377
# Utility routine that cleans up one img src/href link pathname for the
# HTML plugin and returns the reassembled text.
#
#   pre  - text preceding the pathname (returned unchanged)
#   text - the link pathname itself
#   post - text following the pathname (returned unchanged)
#
# Applied to the pathname, in this order:
#   1. every %20 (introduced by wvWare) becomes a literal space - not an
#      underscore, since underscores mess with incremental rebuild;
#   2. every windows-style backslash becomes a url slash (/);
#   3. every remaining percent sign becomes %25.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for my $pathname ($text) {      # alias, so the edits land in $text
        $pathname =~ s/%20/ /g;
        $pathname =~ tr{\\}{/};
        $pathname =~ s/%/%25/g;
    }

    return join("", $pre, $text, $post);
}
Note: See TracBrowser for help on using the repository browser.