root/main/trunk/greenstone2/bin/script/wvware.pl @ 24829

Revision 24829, 16.5 KB (checked in by ak19, 8 years ago)

Changes to bat files and perl code to deal with brackets in (Windows) filepath. Also checked winmake.bat files to see if changes were needed there. These changes go together with the commits 24826 to 24828 for gems.bat, and commit 24820 on makegs2.bat.

  • Property svn:executable set to *
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2009 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27
28# wvware.pl: Script to set the environment for wvware and then run it
29# Setting the env vars necessary for wvware here locally, won't interfere
30# with the normal environment if they had been set in setup.bash/setup.bat
31
32
BEGIN {
    # Both variables are exported by the Greenstone setup script; without
    # them we can neither locate perllib nor pick the per-OS binaries.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set - run the (gs3-)setup script\n";
    }
    if (!defined $ENV{'GSDLOS'}) {
        die "GSDLOS not set - run (gs3-)setup script\n";
    }

    # Make Greenstone's own perl modules (e.g. util) loadable.
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
38
39
40use strict;
41use util;
42
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded (i.e. on any
# non-Windows platform), in which case the flag defaults to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT() };
$is_winnt_2000 = 0 unless defined($is_winnt_2000);
46
sub main
{
    # Entry point: sets up the environment that wvWare needs, runs wvWare on
    # the input Word document, and checks whether usable HTML was produced.
    #
    # Arguments:
    #   $argc - number of command line arguments
    #   @argv - <input-filename> <output-filestem> [<fail-log-file>]
    #           [<verbosity>] [<timeout>]
    #
    # Never returns; the process exit code tells the caller what happened:
    #   exit(1)  - conversion succeeded, <output-filestem>.html is usable
    #   exit(0)  - conversion failed (the caller can try another converter)
    #   exit(-1) - wrong number of arguments, usage was printed
    my ($argc, @argv) = @_;

    # Anything other than 2..5 arguments gets the usage message. This also
    # covers a lone -h/--help, since that is a single argument.
    if ($argc < 2 || $argc > 5) {
        # Last path component of $0; fall back to $0 itself when it contains
        # no directory part, so the usage line never interpolates undef.
        my ($progname) = ($0 =~ m/([^\/\\]+)$/);
        $progname = $0 unless defined $progname;

        print STDERR "\n";
        print STDERR "Usage: $progname <input-filename> <output-filestem> [<fail-log-file>] [<verbosity>] [<timeout>]\n";
        print STDERR "\n";

        exit(-1);
    }

    my $input_filename  = $argv[0];
    my $output_filestem = $argv[1];
    my $faillogfile = "";
    my $verbosity   = 0;
    my $timeout     = 0;

    if ($argc >= 3) {
        $faillogfile = $argv[2];
    }
    if ($argc >= 4) {
        $verbosity = $argv[3];
    }
    if ($argc >= 5) {
        $timeout = $argv[4];
    }

    # The two files wvWare's output gets redirected into.
    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    ## SET THE ENVIRONMENT AS DONE IN SETUP.BASH/BAT OF GNOME-LIB

    if (!defined $ENV{'GEXTGNOME'}) {
        # Look for the GNOME support library extension, preferring the
        # minimal variant over the full one.
        my $extdir    = &util::filename_cat($ENV{'GSDLHOME'}, "ext");
        my $gnome_dir = &util::filename_cat($extdir, "gnome-lib-minimal");
        if (-d $gnome_dir) {
            $ENV{'GEXTGNOME'} = $gnome_dir;
        } else {
            $gnome_dir = &util::filename_cat($extdir, "gnome-lib");
            if (-d $gnome_dir) {
                $ENV{'GEXTGNOME'} = $gnome_dir;
            } elsif ($verbosity > 2) {
                print STDERR "No gnome-lib(-minimal) ext folder detected. Trying to run wvware without its libraries....\n";
            }
        }

        # Now set the other related env vars,
        # IF we've found the gnome-lib dir installed in the ext folder
        if (defined $ENV{'GEXTGNOME'}) {
            $ENV{'GEXTGNOME_INSTALLED'} = &util::filename_cat($ENV{'GEXTGNOME'}, $ENV{'GSDLOS'});

            &util::envvar_prepend("PATH", &util::filename_cat($ENV{'GEXTGNOME_INSTALLED'}, "bin"));

            # util's prepend will create LD/DYLD_LIB_PATH if it doesn't exist yet
            my $gextlib = &util::filename_cat($ENV{'GEXTGNOME_INSTALLED'}, "lib");
            if ($ENV{'GSDLOS'} eq "linux") {
                &util::envvar_prepend("LD_LIBRARY_PATH", $gextlib);
            } elsif ($ENV{'GSDLOS'} eq "darwin") {
                &util::envvar_prepend("DYLD_LIBRARY_PATH", $gextlib);
            }
        }

        # Above largely mimics the setup.bash of the gnome-lib-minimal.
        # Not doing the devel-srcpack that gnome-lib-minimal's setup.bash used to set
        # Not exporting GSDLEXTS variable either
    }

#    print STDERR "@@@@@ GEXTGNOME: ".$ENV{'GEXTGNOME'}."\n\tINSTALL".$ENV{'GEXTGNOME_INSTALLED'}."\n";
#    print STDERR "\tPATH".$ENV{'PATH'}."\n\tLD_PATH".$ENV{'LD_LIBRARY_PATH'}."\n";

    # if no GEXTGNOME, maybe they do not need gnome-lib to run wvware
    # RUN WVWARE

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    # On linux a self-contained "wv" folder (own bin and lib) takes
    # precedence over the plain binary when it is present.
    my $wvware_folder = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv");
    if (-d $wvware_folder && $ENV{'GSDLOS'} eq "linux") {
        &util::envvar_prepend("PATH", &util::filename_cat($wvware_folder, "bin"));
        &util::envvar_prepend("LD_LIBRARY_PATH", &util::filename_cat($wvware_folder, "lib"));
        $wvWare = &util::filename_cat($wvware_folder, "bin", "wvWare");
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into
    # folder docname-without-extension_files. This folder should be at
    # the same level as the html file generated from the doc.
    # wvWare will take care of proper interlinking.

    # This step is necessary for replace_srcdoc_with_html.pl which will
    # move the html and associated files into the import folder. We
    # want to ensure that the associated files won't overwrite similarly
    # named items already in import. Hence we put them in a folder first
    # (to which the html links properly) and that will allow
    # replace_srcdoc_with_html.pl to move them safely to /import.

    # To do all this, we need to use wvWare's --dir and --basename options
    # where dir is the full path to the image folder directory and
    # basename is the full path to the image folder appended to the name
    # which is to be prepended to every image file:
    # eg. if the images were to have names like sample0.jpg to sampleN.jpg,
    # then the basename is "/full/path/to/imgdir/sample".
    # In this case, basename is the full path to and name of the document.
    # HOWEVER: basename always takes full path, not relative url, so
    # the greenstone browser is unable to display the images (absolute paths
    # cause it to give an "external link" message). The absolute links are
    # therefore rewritten to relative ones after a successful conversion.
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    # Load File::Basename explicitly rather than relying on another module
    # having pulled it in already.
    require File::Basename;
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # every image file name gets prefixed with the document name
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    my $cmd = "";

    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$html_file\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

#    print STDERR "***** wvware.pl launching wvware with CMD:\n\t$cmd\n";

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing wv converter:|$!|\n";
        if (-s $err_file) {
            # Optional fail log, opened for appending. A failed open leaves
            # a (closed) glob in the scalar, so reset it to undef explicitly.
            my $fail_fh;
            if ($faillogfile ne "" && !open($fail_fh, '>>', $faillogfile)) {
                undef $fail_fh;
            }

            # Echo wvWare's error output to STDERR (and the fail log).
            if (open(my $err_fh, '<', $err_file)) {
                while (my $line = <$err_fh>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$fail_fh} "$line" if ($fail_fh);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$fail_fh} " (given an invalid .DOC file?)\n"
                        if ($fail_fh);
                }
                close($err_fh);
            }
            close($fail_fh) if ($fail_fh);
        }
        exit(0); # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s $html_file) { # if file has non-zero size (i.e. it has contents)
        # Only the first line is needed to recognise wvWare's html output.
        my $first_line;
        if (open(my $html_fh, '<', $html_file)) {
            $first_line = <$html_fh>;
            close($html_fh);
        }
        if ($first_line && $first_line =~ m/DOCTYPE HTML/) {
            &util::rm($err_file) if -e $err_file;

            # Remove the images directory if it was still empty after
            # the html was generated (in case there were no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                &util::rm_r($assoc_dir);
            } else { # there was an image folder (it was generated)
                # Therefore, the html file generated contains absolute links to the images
                # Replace them with relative links instead, so the folder can be moved elsewhere
                &make_links_to_assocdir_relative($toppath, $docname, $html_file, $assoc_dir, $docname."_files");
            }
            exit(1);
        }
    }

    # If here, an error of some sort occurred
    &util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        # Append wvWare's error output to the fail log, when one was given
        # and can be opened.
        if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
            if (open(my $err_fh, '<', $err_file)) {
                while (my $line = <$err_fh>) {
                    print {$fail_fh} $line;
                }
                close($err_fh);
            }
            close($fail_fh);
        }
        &util::rm($err_file);
    }

    exit(0);
}
286
# Run the script: main() receives the argument count followed by the arguments.
main(scalar(@ARGV), @ARGV);
288
289
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Parameters:
#   toppath        - top-level folder in which the html file resides (unused here)
#   docname        - name (without extension) of the html file (unused here)
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - directory name of the folder with associated imgs: docname_files
#
# Returns 1 when the html file was rewritten, 0 when it could not be read or
# written (a message is printed to STDERR in that case).
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "wvware.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    # With $/ undefined the <...> operator reads the entire file at once.
    my $html_contents = do { local $/ = undef; <$fin> };
    close($fin);
    $html_contents = "" unless defined $html_contents;

    # 2. Replace *all* occurrences of the assoc_dir_path at the start of
    # href and src values with assoc_dirname.

    # wvWare writes %20 in place of a space, so the prefix we look for has to
    # use %20 too. quotemeta then escapes every character that is special in
    # a regular expression (., -, [, ], (, ), +, Windows backslashes, ...).
    (my $url_prefix = $assoc_dir_path) =~ s/ /%20/g;
    my $prefix_re = quotemeta($url_prefix);

    # Start of an <a ...> or <img ...> tag up to and including the first
    # href= or src= inside that same tag.
    my $attr_start = qr/<(?:a|img)\b[^>]*?\b(?:href|src)=/;

    # The value may be wrapped in " or ' or not be quoted at all.
    $html_contents =~ s/($attr_start["']?)$prefix_re/$1$assoc_dirname/g;

    # 3. Post-process the href/src values: %20 back to a literal space,
    # backslashes to url slashes, remaining % to %25. Only the attribute
    # value itself is handed to post_process_assocfile_urls, so the rest of
    # the document is left untouched. A quoted value runs up to its matching
    # closing quote; an unquoted one ends at whitespace or the end of the tag.
    $html_contents =~ s{($attr_start)(?:(["'])([^>]*?)\2|([^\s"'>]*))}
                       {defined($3) ? &post_process_assocfile_urls($1 . $2, $3, $2)
                                    : &post_process_assocfile_urls($1, $4, "")}ge;

    # delete the original file and recreate it
    # (util::rm is handed a copy of the name, as before)
    my $copy_of_filename = $html_file;
    &util::rm($copy_of_filename);

    # Recreate the original file for writing the updated contents
    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "wvware.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    # write out the updated contents; buffered write errors only show up
    # at close, so check both
    my $written = print {$fout} $html_contents;
    my $closed  = close($fout);
    unless ($written && $closed) {
        print STDERR "wvware.pl: Unable to write relative links to $html_file...ERROR: $!\n";
        return 0;
    }
    return 1;
}
377
378
# Utility routine that normalises the pathname of an img src/href link for
# the HTML plugin. Given the text before the pathname, the pathname itself
# and the text after it, the pathname is changed as follows (in this order):
#   - every %20 (introduced by wvWare) becomes a literal space again; not an
#     underscore, since underscores mess with incremental rebuild
#   - windows-style backslashes become url slashes (/)
#   - every remaining percent sign becomes %25
# Returns the three pieces joined back together.
sub post_process_assocfile_urls
{
    my ($before, $pathname, $after) = @_;

    for ($pathname) {
        s/%20/ /g;
        tr{\\}{/};
        s/%/%25/g;
    }

    return $before . $pathname . $after;
}
Note: See TracBrowser for help on using the browser.