source: main/trunk/greenstone2/bin/script/wvware.pl@ 24599

Last change on this file since 24599 was 24599, checked in by ak19, 13 years ago

Tidied starting comments.

  • Property svn:executable set to *
File size: 16.4 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2009 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28# wvware.pl: Script to set the environment for wvware and then run it
29# Setting the env vars necessary for wvware here locally, won't interfere
30# with the normal environment if they had been set in setup.bash/setup.bat
31
32
# Compile-time guard: the Greenstone environment must have been set up
# before this script can locate its own perl library directory.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set - run the (gs3-)setup script\n";
    }
    if (!defined $ENV{'GSDLOS'}) {
        die "GSDLOS not set - run (gs3-)setup script\n";
    }

    # Put GSDLHOME/perllib at the front of the module search path
    my $perllib_dir = $ENV{'GSDLHOME'} . "/perllib";
    unshift(@INC, $perllib_dir);
}
38
39
40use strict;
41use util;
42
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded the eval yields undef, which is
# then normalised to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;
46
# main($argc, @argv)
#
# Sets up the environment needed by wvWare (gnome-lib extension and the
# bundled wv folder, when present), runs wvWare on a Word document to
# produce <output-filestem>.html and then checks the result.
#
#   $argc - number of command line arguments
#   @argv - <input-filename> <output-filestem>
#           [<fail-log-file>] [<verbosity>] [<timeout>]
#
# This routine never returns; it always terminates the script:
#   exit(-1) - wrong number of arguments (usage printed to STDERR)
#   exit(1)  - an HTML file whose first line contains "DOCTYPE HTML"
#              was generated
#   exit(0)  - wvWare failed or produced no usable HTML
sub main
{
    my ($argc, @argv) = @_;

    # fileparse() is used further down; load the module here rather than
    # relying on some other module having pulled it in already.
    require File::Basename;

    # Any argument count outside 2..5 (including a lone -h/--help) gets
    # the usage message.
    if ($argc < 2 || $argc > 5) {
        my ($progname) = ($0 =~ m/^.*[\/|\\](.*?)$/);
        # $0 contains no path separator when run from the current directory
        $progname = $0 unless defined $progname;

        print STDERR "\n";
        print STDERR "Usage: $progname <input-filename> <output-filestem> [<fail-log-file>] [<verbosity>] [<timeout>]\n";
        print STDERR "\n";

        exit(-1);
    }

    my $input_filename  = $argv[0];
    my $output_filestem = $argv[1];
    my $faillogfile     = ($argc >= 3) ? $argv[2] : "";
    my $verbosity       = ($argc >= 4) ? $argv[3] : 0;
    my $timeout         = ($argc >= 5) ? $argv[4] : 0;

    ## SET THE ENVIRONMENT AS DONE IN SETUP.BASH/BAT OF GNOME-LIB

    if (!defined $ENV{'GEXTGNOME'}) {
        my $extdir = &util::filename_cat($ENV{'GSDLHOME'}, "ext");

        # prefer gnome-lib-minimal, fall back to gnome-lib
        my $gnome_dir = &util::filename_cat($extdir, "gnome-lib-minimal");
        if (-d $gnome_dir) {
            $ENV{'GEXTGNOME'} = $gnome_dir;
        } else {
            $gnome_dir = &util::filename_cat($extdir, "gnome-lib");
            if (-d $gnome_dir) {
                $ENV{'GEXTGNOME'} = $gnome_dir;
            } elsif ($verbosity > 2) {
                print STDERR "No gnome-lib(-minimal) ext folder detected. Trying to run wvware without its libraries....\n";
            }
        }

        # now set the other related env vars,
        # IF we've found the gnome-lib dir installed in the ext folder
        if (defined $ENV{'GEXTGNOME'}) {
            $ENV{'GEXTGNOME_INSTALLED'} = &util::filename_cat($ENV{'GEXTGNOME'}, $ENV{'GSDLOS'});

            &util::envvar_prepend("PATH", &util::filename_cat($ENV{'GEXTGNOME_INSTALLED'}, "bin"));

            # util's prepend will create LD/DYLD_LIB_PATH if it doesn't exist yet
            my $gextlib = &util::filename_cat($ENV{'GEXTGNOME_INSTALLED'}, "lib");
            if ($ENV{'GSDLOS'} eq "linux") {
                &util::envvar_prepend("LD_LIBRARY_PATH", $gextlib);
            } elsif ($ENV{'GSDLOS'} eq "darwin") {
                &util::envvar_prepend("DYLD_LIBRARY_PATH", $gextlib);
            }
        }

        # Above largely mimics the setup.bash of the gnome-lib-minimal.
        # Not doing the devel-srcpack that gnome-lib-minimal's setup.bash used to set
        # Not exporting GSDLEXTS variable either
    }

    # if no GEXTGNOME, maybe they do not need gnome-lib to run wvware
    # RUN WVWARE

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    # On linux a self-contained "wv" folder (with its own bin and lib) takes
    # precedence when it exists. (Not done for darwin/DYLD_LIBRARY_PATH.)
    my $wvware_folder = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv");
    if (-d $wvware_folder && $ENV{'GSDLOS'} eq "linux") {
        &util::envvar_prepend("PATH", &util::filename_cat($wvware_folder, "bin"));
        &util::envvar_prepend("LD_LIBRARY_PATH", &util::filename_cat($wvware_folder, "lib"));
        $wvWare = &util::filename_cat($wvware_folder, "bin", "wvWare");
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added to work with replace_srcdoc_with_html.pl:
    # wvWare is told to put any associated (image) files of the word doc
    # into a folder named <docname>_files next to the generated html, so
    # that moving the html and its files into the import folder cannot
    # overwrite similarly named items already there.
    #
    # This uses wvWare's --dir and --basename options, where dir is the
    # full path to the image folder and basename is that path plus the
    # name that is to be prepended to every image file, e.g. for images
    # sample0.jpg .. sampleN.jpg the basename is "/full/path/to/imgdir/sample".
    # HOWEVER: basename always takes a full path, not a relative url, so
    # the generated html links to the images with absolute paths (which the
    # greenstone browser reports as "external link"). Those links are
    # rewritten to relative ones after a successful conversion below.
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # prefix (full path) given to every image file wvWare writes out
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    my $cmd = "";

    # The timeout is interpolated into a shell command line, so only a
    # plain non-negative integer is accepted.
    if ($timeout) {
        if ($timeout =~ m/^\d+$/) {
            $cmd = "ulimit -t $timeout;";
        } else {
            print STDERR "Ignoring invalid timeout value: $timeout\n";
        }
    }
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing wv converter:|$!|\n";
        if (-s "$output_filestem.err") {
            # the fail log is optional; carry on without it if it can't be opened
            my $faillog_fh;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog_fh, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $err_fh, '<', "$output_filestem.err")) {
                while (my $line = <$err_fh>) {
                    # echo every non-blank line of wvWare's error output
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog_fh} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog_fh} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($err_fh);
            } else {
                print STDERR "Unable to open $output_filestem.err for reading: $!\n";
            }
            close($faillog_fh) if ($write_to_fail_log);
        }
        exit(0); # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
        # only the first line is inspected
        my $first_line;
        if (open(my $html_fh, '<', "$output_filestem.html")) {
            $first_line = <$html_fh>;
            close($html_fh);
        }
        if ($first_line && $first_line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            # Remove the images directory if it is still empty after the html
            # was generated (in case there were no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                &util::rm_r($assoc_dir);
            } else { # there was an image folder (it was generated)
                # Therefore, the html file generated contains absolute links to the images
                # Replace them with relative links instead, so the folder can be moved elsewhere
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            exit(1);
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        # append wvWare's error output to the fail log (when one was given)
        # before removing the error file
        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            if (open(my $errlog_fh, '<', "$output_filestem.err")) {
                while (my $line = <$errlog_fh>) {
                    print {$faillog_fh} $line;
                }
                close($errlog_fh);
            }
            close($faillog_fh);
        }
        &util::rm("$output_filestem.err");
    }

    exit(0);
}
286
# Script entry point: hand the argument count and the arguments to main()
main(scalar(@ARGV), @ARGV);
288
289
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Parameters:
#   $toppath        - top-level folder in which the html file resides (unused here)
#   $docname        - name (without extension) of the html file (unused here)
#   $html_file      - full path to the html file: /full/path/docname.html
#   $assoc_dir_path - toppath/docname_files
#   $assoc_dirname  - directory name of the folder with associated imgs: docname_files
#
# Returns 1 when the html file was rewritten, 0 when it could not be read
# or written (a message is printed to STDERR in that case).
sub make_links_to_assocdir_relative {
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    # With $/ undefined, <...> reads the entire file at once ("slurping")
    my $html_contents = do { local $/ = undef; <$fin> };
    close($fin);
    $html_contents = "" unless defined $html_contents;

    # 2. Replace *all* occurrences of the assoc_dir_path in a hrefs and
    #    img src values with assoc_dirname.
    #
    # wvWare puts %20 in place of spaces, so the prefix we look for needs the
    # same treatment. The path is then matched literally: quotemeta escapes
    # every character that is special in a regular expression (., -, [, ],
    # parentheses, +, Windows style backslashes, ...).
    (my $url_prefix = $assoc_dir_path) =~ s/ /%20/g;
    my $prefix_re = quotemeta($url_prefix);

    # Looks for <a or <img, followed by anything up to the first href= or
    # src= (optionally followed by " or ') whose value starts with the
    # associated folder's path. Everything before the path is retained, the
    # path is replaced by the folder's directory name, and the rest up to and
    # including the next > is retained. s: . also matches newlines,
    # g: replace all occurrences.
    $html_contents =~ s{(<(?:a|img).*?(?:href|src)=["']?)$prefix_re(.*?>)}{$1$assoc_dirname$2}sg;

    # 3. Post-process the urls that now point into the associated folder
    #    (%20 -> space etc., see post_process_assocfile_urls). Only the
    #    attribute value itself is handed over - either everything up to the
    #    matching closing quote, or for an unquoted value everything up to
    #    the next whitespace, quote or > - so that the rest of the document
    #    is left untouched.
    my $dirname_re = quotemeta($assoc_dirname);
    $html_contents =~ s{\b((?:href|src)=)(?:(["'])($dirname_re.*?)\2|($dirname_re[^\s"'>]*))}
                       {defined $3
                            ? &post_process_assocfile_urls($1 . $2, $3, $2)
                            : &post_process_assocfile_urls($1, $4, "")}gse;

    # 4. Write the updated contents back. Opening for writing truncates the
    #    existing file, so it does not need to be deleted first (and is not
    #    lost if it turns out it can't be opened for writing).
    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }
    print {$fout} $html_contents;
    unless (close($fout)) {
        # buffered write errors only surface when the handle is closed
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }
    return 1;
}
375
376
# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
# url slashes (/) instead of windows-style backwards slashes, and to convert all %20
# introduced in link pathnames by wvWare into space again. Any percent sign that is
# left after that is turned into %25.
#
#   $pre  - text preceding the link pathname, returned unchanged
#   $text - the link pathname to be cleaned up
#   $post - text following the link pathname, returned unchanged
#
# Returns the three parts joined back together.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {
        # Convert %20s to space and not underscore, since underscores
        # mess with incremental rebuild
        s/%20/ /g;
        # windows-style backslashes become url slashes
        tr{\\}{/};
        # must come after the %20 step, or the %20s would be re-encoded too
        s/%/%25/g;
    }

    return $pre . $text . $post;
}
Note: See TracBrowser for help on using the repository browser.