source: main/trunk/greenstone2/bin/script/wvware.pl@ 24371

Last change on this file since 24371 was 24371, checked in by ak19, 13 years ago

Ticket 779: the new wvware.pl script sets the environment for what wvware needs, by setting the LD_LIB_PATH to gnome-lib-minimal in the extension folder, if this exists. wvware.pl is called by gsConvert to run wvware (also checked with the replace src doc with html menu option on rightclick) and the perl script can be launched from the command prompt to do the conversion as well.

  • Property svn:executable set to *
File size: 16.3 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# wvware.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2009 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# Script to set the environment for wvware and then run it
30# Setting the env vars necessary for wvware here locally, won't interfere
31# with the normal environment if they had been set in setup.bash/setup.bat
32
33
BEGIN {
    # Refuse to start unless both Greenstone variables are present; the
    # messages point the user at the setup script that defines them.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set - run the (gs3-)setup script\n";
    }
    if (!defined $ENV{'GSDLOS'}) {
        die "GSDLOS not set - run (gs3-)setup script\n";
    }

    # Put Greenstone's perllib first on the module search path so that
    # "use util" below resolves to it.
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
39
40
41use strict;
42use util;
43
# Flag: true when running on WinNT/Win2000 or later. The eval yields undef
# when the Win32 module cannot be loaded, in which case the flag is 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;
47
# Sets up the environment wvWare needs, runs wvWare on a Word document and
# checks the result.
#
# Arguments (as passed by the call at the bottom of this block):
#   $argc - number of command-line arguments
#   @argv - <input-filename> <output-filestem> [<fail-log-file>] [timeout]
#
# This routine never returns; it always terminates the process:
#   exit(-1) - wrong number of arguments (usage is printed to STDERR)
#   exit(1)  - <output-filestem>.html was generated and looks like HTML
#   exit(0)  - the conversion failed (the caller can then try another
#              converter); wvWare's error output is appended to the fail
#              log when one was given
sub main
{
    my ($argc, @argv) = @_;

    # Anything other than 2-4 arguments (this includes a lone -h/--help)
    # gets the usage message.
    if ($argc < 2 || $argc > 4) {
        # Strip any leading directories, whichever separator is in use.
        # This pattern always matches, so $progname is never undefined.
        my ($progname) = ($0 =~ m/([^\/\\]*)$/);

        print STDERR "\n";
        print STDERR "Usage: $progname <input-filename> <output-filestem> [<fail-log-file>] [timeout]\n";
        print STDERR "\n";

        exit(-1);
    }

    my $input_filename  = $argv[0];
    my $output_filestem = $argv[1];
    my $faillogfile     = "";
    my $timeout         = 0;
    if ($argc >= 3) {
        $faillogfile = $argv[2];
    }
    if ($argc >= 4) {
        $timeout = $argv[3];
        # The timeout is pasted into a shell command below, so only a plain
        # non-negative integer is accepted; anything else is ignored.
        if (!defined $timeout || $timeout !~ m/\A\d+\z/) {
            if (defined $timeout && $timeout ne "") {
                print STDERR "wvware.pl: ignoring non-numeric timeout '$timeout'\n";
            }
            $timeout = 0;
        }
    }

    ## SET THE ENVIRONMENT AS DONE IN SETUP.BASH/BAT OF GNOME-LIB

    if (!defined $ENV{'GEXTGNOME'}) {
        # Look for the GNOME support library extension: prefer
        # ext/gnome-lib-minimal, fall back to ext/gnome-lib.
        my $extdir    = &util::filename_cat($ENV{'GSDLHOME'}, "ext");
        my $gnome_dir = &util::filename_cat($extdir, "gnome-lib-minimal");
        if (-d $gnome_dir) {
            $ENV{'GEXTGNOME'} = $gnome_dir;
        } else {
            $gnome_dir = &util::filename_cat($extdir, "gnome-lib");
            if (-d $gnome_dir) {
                $ENV{'GEXTGNOME'} = $gnome_dir;
            } else {
                print STDERR "**** No gnome-lib(-minimal) ext folder detected.\n";
                print STDERR "**** Trying to run wvware without its libraries....\n";
            }
        }

        # Now set the other related env vars,
        # IF we've found the gnome-lib dir installed in the ext folder.
        if (defined $ENV{'GEXTGNOME'}) {
            $ENV{'GEXTGNOME_INSTALLED'} = &util::filename_cat($ENV{'GEXTGNOME'}, $ENV{'GSDLOS'});

            &util::envvar_prepend("PATH", &util::filename_cat($ENV{'GEXTGNOME_INSTALLED'}, "bin"));

            # util's prepend will create LD/DYLD_LIB_PATH if it doesn't exist yet
            my $gextlib = &util::filename_cat($ENV{'GEXTGNOME_INSTALLED'}, "lib");
            if ($ENV{'GSDLOS'} eq "linux") {
                &util::envvar_prepend("LD_LIBRARY_PATH", $gextlib);
            } elsif ($ENV{'GSDLOS'} eq "darwin") {
                &util::envvar_prepend("DYLD_LIBRARY_PATH", $gextlib);
            }
        }

        # Above largely mimics the setup.bash of the gnome-lib-minimal.
        # Not doing the devel-srcpack that gnome-lib-minimal's setup.bash used to set
        # Not exporting GSDLEXTS variable either
    }

    # print STDERR "@@@@@ GEXTGNOME: ".$ENV{'GEXTGNOME'}."\n\tINSTALL".$ENV{'GEXTGNOME_INSTALLED'}."\n";
    # print STDERR "\tPATH".$ENV{'PATH'}."\n\tLD_PATH".$ENV{'LD_LIBRARY_PATH'}."\n";

    # if no GEXTGNOME, maybe they do not need gnome-lib to run wvware
    # RUN WVWARE

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    # On linux, a bin/linux/wv folder (when present) carries its own
    # binary and libraries; use those in preference to bin/linux/wvWare.
    # (The darwin equivalent, DYLD_LIBRARY_PATH, is deliberately not set.)
    my $wvware_folder = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv");
    if (-d $wvware_folder && $ENV{'GSDLOS'} eq "linux") {
        &util::envvar_prepend("PATH", &util::filename_cat($wvware_folder, "bin"));
        &util::envvar_prepend("LD_LIBRARY_PATH", &util::filename_cat($wvware_folder, "lib"));
        $wvWare = &util::filename_cat($wvware_folder, "bin", "wvWare");
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into
    # folder docname-without-extension_files. This folder should be at
    # the same level as the html file generated from the doc.
    # wvWare will take care of proper interlinking.

    # This step is necessary for replace_srcdoc_with_html.pl which will
    # move the html and associated files into the import folder. We
    # want to ensure that the associated files won't overwrite similarly
    # named items already in import. Hence we put them in a folder first
    # (to which the html links properly) and that will allow
    # replace_srcdoc_with_html.pl to move them safely to /import.

    # To do all this, we need to use wvWare's --dir and --basename options
    # where dir is the full path to the image folder directory and
    # basename is the full path to the image folder appended to the name
    # which is to be prepended to every image file:
    # eg. if the images were to have names like sample0.jpg to sampleN.jpg,
    # then the basename is "/full/path/to/imgdir/sample".
    # In this case, basename is the full path to and name of the document.
    # HOWEVER: basename always takes full path, not relative url, so
    # the greenstone browser is unable to display the images (absolute paths
    # cause it to give an "external link" message)
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
    # "added --dir option to wvHtml so that pictures can be placed in
    # a seperate directory"
    # "running wvWare through IMP to view word documents as html. It gets
    # invoked like this:
    # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"

    # Load File::Basename explicitly rather than relying on another module
    # having pulled it in.
    require File::Basename;

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");
    #print "assoc_dir: ".$assoc_dir."\n"; # same as "$output_filestem._files"

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # Prefix (full path) that wvWare prepends to every image file it writes.
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    #print STDERR "****toppath: $toppath\n****docname: $docname\n;
    #print STDERR "****img_basenames: $img_basenames\n" if($img_basenames);
    #print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir);

    # NOTE(review): file names are only wrapped in double quotes, so names
    # containing ", $ or backquotes are still interpreted by the shell.
    my $cmd = "";

    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # print STDERR "***** wvware.pl launching wvware with CMD:\n\t$cmd\n";

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing wv converter:|$!|\n";
        if (-s "$output_filestem.err") {
            # Echo wvWare's error output to STDERR and, when a fail log
            # was given and can be opened, append it there as well.
            my $write_to_fail_log = 0;
            my $faillog_fh;
            if ($faillogfile ne "" && open($faillog_fh, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            my $err_fh;
            if (open($err_fh, '<', "$output_filestem.err")) {
                while (my $line = <$err_fh>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print $faillog_fh "$line" if ($write_to_fail_log);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog_fh " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                } # while reading the .err file
                close($err_fh);
            } else {
                print STDERR "wvware.pl: unable to read $output_filestem.err: $!\n";
            }
            close($faillog_fh) if ($write_to_fail_log);
        }
        exit(0); # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
        # Only the first line is inspected, for the DOCTYPE declaration.
        my $line;
        my $html_fh;
        if (open($html_fh, '<', "$output_filestem.html")) {
            $line = <$html_fh>;
            close($html_fh);
        }
        if ($line && $line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            # Remove the images directory if it is still empty after the html
            # was generated (in case there were no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                #print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n";
                &util::rm_r($assoc_dir);
            } else { # there was an image folder (it was generated)
                # Therefore, the html file generated contains absolute links to the images
                # Replace them with relative links instead, so the folder can be moved elsewhere
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            exit(1);
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        # Append wvWare's error output to the fail log before discarding it.
        my $faillog_fh;
        if ($faillogfile ne "" && open($faillog_fh, '>>', $faillogfile)) {
            my $errlog_fh;
            if (open($errlog_fh, '<', "$output_filestem.err")) {
                while (my $errline = <$errlog_fh>) {print $faillog_fh $errline;}
                close($errlog_fh);
            }
            close($faillog_fh);
        }
        &util::rm("$output_filestem.err");
    }

    exit(0);
}

&main(scalar(@ARGV),@ARGV);
285
286
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Parameters:
#   toppath        - top-level folder in which the html file resides (unused here)
#   docname        - name (without extension) of the html file (unused here)
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - directory name of the folder with associated imgs: docname_files
#
# Returns 1 when the file was rewritten, 0 when it could not be read or
# written (a message is printed to STDERR in that case).
sub make_links_to_assocdir_relative {
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "wvware.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    my $html_contents;
    {
        local $/ = undef;          # slurp mode: read the entire file at once
        $html_contents = <$fin>;
    }
    close($fin);
    $html_contents = "" unless defined $html_contents;

    # 2. Replace *all* occurrences of the assoc_dir_path at the start of a
    #    href/src value with assoc_dirname.
    # wvWare writes %20 in place of a space, so the prefix we look for must
    # use %20 too. quotemeta then escapes every character that is special in
    # a regular expression (backslashes, dots, brackets, parentheses, ...),
    # so the path is always matched literally.
    my $path_as_written = $assoc_dir_path;
    $path_as_written =~ s/ /%20/g;
    my $safe_reg_expression = quotemeta($path_as_written);

    # Looks for <a or <img, followed by anything up to the next href= or src=
    # with an optional opening quote, followed by the associated folder's
    # path, then everything up to the first > that ends the tag. The parts
    # around the folder path are kept; the path itself becomes the bare
    # folder name. /s lets . match newlines, /g replaces every occurrence.
    $html_contents =~ s/(<(?:a|img).*?(?:href|src)=(?:\"|\')?)$safe_reg_expression(.*?(?:\"|\')?.*?>)/$1$assoc_dirname$2/sg;

    # Now tidy the href/src value of every <a>/<img> tag (e.g. %20 back to a
    # literal space). The match is confined to one tag and to the attribute
    # value itself (quoted, or unquoted up to whitespace or >), so the rest
    # of the document is left untouched.
    $html_contents =~ s/(<(?:a|img)\b[^>]*?\b(?:href|src)=)("[^"]*"|'[^']*'|[^\s>]*)/&post_process_assocfile_urls($1, $2, "")/sge;

    #print STDERR "****assoc_dirname: $assoc_dirname***\n";
    #print STDERR "****safe_reg_expression: $safe_reg_expression***\n";

    # 3. Write the updated contents back. Opening with '>' truncates the
    #    existing file, so it is not deleted beforehand: if the open fails,
    #    the original html is still there.
    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "wvware.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }
    print $fout $html_contents;
    # Buffered write errors only surface when the handle is closed.
    unless (close($fout)) {
        print STDERR "wvware.pl: Unable to finish writing $html_file...ERROR: $!\n";
        return 0;
    }
    return 1;
}
372
373
# Cleans up the link text of an img src/href value and glues it back between
# the text that preceded and followed it.
#   $pre  - text before the link value; returned unchanged
#   $text - the link value; every %20 becomes a space, every backslash a
#           forward slash, and then every remaining % becomes %25
#   $post - text after the link value; returned unchanged
# Returns the concatenation of $pre, the cleaned $text and $post.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    # Order matters: %20 has to be turned into a space (not an underscore,
    # which interferes with incremental rebuild) before the remaining
    # percent signs are encoded.
    for ($text) {
        s{%20}{ }g;
        s{\\}{/}g;
        s{%}{%25}g;
    }

    return $pre . $text . $post;
}
Note: See TracBrowser for help on using the repository browser.