Changeset 24371

Show
Ignore:
Timestamp:
08.08.2011 21:10:36 (8 years ago)
Author:
ak19
Message:

Ticket 779: the new wvware.pl script sets up the environment that wvware needs by setting LD_LIBRARY_PATH to gnome-lib-minimal in the extension folder, if this exists. gsConvert.pl calls wvware.pl to run wvware (also checked with the "replace src doc with html" right-click menu option), and the perl script can also be launched from the command prompt to do the conversion.

Location:
main/trunk/greenstone2/bin/script
Files:
1 added
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r24362 r24371  
    520520    my ($input_filename, $output_filestem) = @_; 
    521521 
    522     my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare"); 
    523  
    524     if ( -d "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv" && $ENV{'GSDLOS'} eq "linux" ) { 
    525         $ENV{'PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/bin:$ENV{'PATH'}"; 
    526         $ENV{'LD_LIBRARY_PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/lib:$ENV{'LD_LIBRARY_PATH'}"; 
    527         $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare"); 
    528     } 
    529  
    530     # don't include path on windows (to avoid having to play about 
    531     # with quoting when GSDLHOME might contain spaces) but assume 
    532     # that the PATH is set up correctly 
    533     $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i); 
    534  
    535     my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",  
    536                       "packages", "wv", "wvHtml.xml"); 
     522    my $wvware_status = 0; 
    537523     
    538     # Added the following to work with replace_srcdoc_with_html.pl: 
    539     # Make wvWare put any associated (image) files of the word doc into 
    540     # folder docname-without-extension_files. This folder should be at 
    541     # the same level as the html file generated from the doc.  
    542     # wvWare will take care of proper interlinking.  
    543  
    544     # This step is necessary for replace_srcdoc_with_html.pl which will  
    545     # move the html and associated files into the import folder. We 
    546     # want to ensure that the associated files won't overwrite similarly 
    547     # named items already in import. Hence we put them in a folder first 
    548     # (to which the html links properly) and that will allow 
    549     # replace_srcdoc_with_html.pl to move them safely to /import. 
    550  
    551     # To do all this, we need to use wvWare's --dir and --basename options 
    552     # where dir is the full path to the image folder directory and 
    553     # basename is the full path to the image folder appended to the name  
    554     # which is to be prepended to every image file: 
    555     # eg. if the images were to have names like sample0.jpg to sampleN.jpg, 
    556     # then the basename is "/full/path/to/imgdir/sample".  
    557     # In this case, basename is the full path to and name of the document. 
    558     # HOWEVER: basename always takes full path, not relative url, so 
    559     # the greenstone browser is unable to display the images (absolute paths 
    560     # cause it to give an "external link" message) 
    561     # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html 
    562     # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html 
    563     # "added --dir option to wvHtml so that pictures can be placed in 
    564     # a seperate directory" 
    565     # "running wvWare through IMP to view word documents as html. It gets 
    566     # invoked like this: 
    567     # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output" 
    568      
    569     # toppath is the folder where html is generated 
    570     # docname is the name (without extension) of the html to be generated 
    571     # suffix (extension) is thrown away 
    572     my ($docname, $toppath)  
    573     = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 
    574  
    575     # We want the image folder generated to have the same name as windows 
    576     # would generate ($windows_scripting) when it converts from word to html. 
    577     # That is, foldername=docname_files 
    578     my $assoc_dir = &util::filename_cat($toppath, $docname."_files"); 
    579     #print "assoc_dir: ".$assoc_dir."\n";  # same as "$output_filestem._files" 
    580      
    581     # ensure this image directory exists 
    582     # if it exists already, just delete and recreate 
    583     if(-e $assoc_dir) {  
    584     &util::rm_r($assoc_dir); 
    585     }   
    586     &util::mk_dir($assoc_dir); 
    587  
    588     # the images are all going to be called image0, image1,..., imageN 
    589     my $img_basenames = &util::filename_cat($assoc_dir, $docname); 
    590      
    591     #print STDERR "****toppath: $toppath\n****docname: $docname\n; 
    592     #print STDERR "****img_basenames: $img_basenames\n" if($img_basenames); 
    593     #print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir); 
    594  
    595     my $cmd = ""; 
    596     if ($timeout) {$cmd = "ulimit -t $timeout;";} 
    597     # wvWare's --dir and --basename options for image directory.  
    598     # Replaced the next line with the *2 lines* following it: 
    599                # $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\""; 
    600     $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";  
    601     $cmd .= " --charset utf-8 --config \"$wv_conf\""; 
    602     $cmd .= " \"$input_filename\" > \"$output_filestem.html\""; 
    603  
    604     # redirecting STDERR is a bad idea on windows 95/98 
    605     $cmd .= " 2> \"$output_filestem.err\"" 
    606     if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 
    607     # execute the command 
    608     $!=0; 
    609     if (system($cmd)!=0) 
    610     { 
    611     print STDERR "Error executing wv converter:$!\n"; 
    612     if (-s "$output_filestem.err") { 
    613         open (ERRFILE, "<$output_filestem.err"); 
    614  
    615         my $write_to_fail_log=0; 
    616         if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 
    617         {$write_to_fail_log=1;} 
    618  
    619         my $line; 
    620         while ($line=<ERRFILE>) { 
    621         if ($line =~ m/\w/) { 
    622             print STDERR "$line"; 
    623             print FAILLOG "$line" if ($write_to_fail_log); 
    624         } 
    625         if ($line !~ m/startup error/) {next;} 
    626         print STDERR " (given an invalid .DOC file?)\n"; 
    627         print FAILLOG " (given an invalid .DOC file?)\n" 
    628         if ($write_to_fail_log); 
    629          
    630         } # while ERRFILE 
    631         close FAILLOG if ($write_to_fail_log); 
    632     } 
    633     return 0; # we can try any_to_text 
    634     } 
    635  
    636     # Was the conversion successful? 
    637  
    638     if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents) 
    639     open(TMP, "$output_filestem.html"); 
    640     my $line = <TMP>; 
    641     close(TMP); 
    642     if ($line && $line =~ m/DOCTYPE HTML/) { 
    643         &util::rm("$output_filestem.err") if -e "$output_filestem.err";     
    644  
    645         # Inserted this code to remove the images directory if it was still empty after  
    646         # the html was generated (in case there were no images in the word document) 
    647         if (&util::is_dir_empty($assoc_dir)) { 
    648         #print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n"; 
    649         &util::rm_r($assoc_dir); 
    650         } else { # there was an image folder (it was generated) 
    651         # Therefore, the html file generated contains absolute links to the images 
    652         # Replace them with relative links instead, so the folder can be moved elsewhere 
    653         &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");    
    654         } 
    655         return 1; 
    656     } 
    657     } 
    658      
    659     # If here, an error of some sort occurred 
    660     &util::rm("$output_filestem.html") if -e "$output_filestem.html"; 
    661     if (-e "$output_filestem.err") { 
    662     if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) { 
    663         open (ERRLOG,"$output_filestem.err"); 
    664         while (<ERRLOG>) {print FAILLOG $_;} 
    665         close FAILLOG; 
    666         close ERRLOG; 
    667     } 
    668     &util::rm("$output_filestem.err"); 
    669     } 
    670      
    671     return 0; 
    672 } 
    673  
    674 # Method to work with doc_to_html - Word docs might contain images. 
    675 # When such word docs are converted with wvWare, we make it generate a  
    676 # <filename>_files folder with the associated images, while the html file 
    677 # <filename> refers to the images using absolute paths to <filename>_files. 
    678 # This method reads in that html file and replaces all the absolute paths to  
    679 # the images in <filename>_files with the relative paths to the images from 
    680 # that folder. (I.e. with <filename>_files/<imagename.ext>). 
    681 sub make_links_to_assocdir_relative{ 
    682     # toppath is the top-level folder in which the html file we're going to be fixing resides 
    683     # docname is just the name (without extension) of the html file 
    684     # html_file is the full path to the html file: /full/path/docname.html 
    685     # assoc_dir_path is toppath/docname_files 
    686     # assoc_dirname is the directory name of the folder with associated imgs: docname_files 
    687     my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_; 
    688  
    689     # 1. Read all the contents of the html into a string 
    690     # open the original file for reading 
    691     unless(open(FIN, "<$html_file")) {  
    692     print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n"; 
    693     return 0; 
    694     } 
    695     # From http://perl.plover.com/local.html 
    696     # "It's cheaper to read the file all at once, without all the splitting and reassembling.  
    697     # (Some people call this slurping the file.) Perl has a special feature to support this:  
    698     # If the $/ variable is undefined, the <...> operator will read the entire file all at once" 
    699     my $html_contents; 
    700     { 
    701     local $/ = undef;        # Read entire file at once 
    702     $html_contents = <FIN>;  # Now file is read in as one single 'line' 
    703     } 
    704     close(FIN); # close the file 
    705     #print STDERR $html_contents; 
    706     
    707     # 2. Replace (substitute) *all* occurrences of the assoc_dir_path in a hrefs and img src 
    708     # values with assoc_dirname 
    709     # At the end: g means substitute all occurrences (global), while s makes . match 
    710     # newlines as well, so the whole of html_contents is treated as a single line and 
    711     # matches that span several lines can be replaced too. 
    712  
    713     # we can't just replace $assoc_dir_path with $assoc_dir 
    714     # $assoc_dir_path represents a regular expression that needs to be replaced 
    715     # if it contains ., -, [, ], or Windows style backslashes in paths  -- which all have special 
    716     # meaning in Perl regular expressions -- we need to escape these first 
    717     my $safe_reg_expression = $assoc_dir_path; 
    718     $safe_reg_expression =~ s/\\/\\\\/g; 
    719     $safe_reg_expression =~ s/\./\\./g; 
    720     $safe_reg_expression =~ s/\-/\\-/g; 
    721     $safe_reg_expression =~ s/\[/\\[/g; 
    722     $safe_reg_expression =~ s/\]/\\]/g; 
    723     $safe_reg_expression =~ s/ /%20/g; # wvWare put %20 in place of space, so we need to change our prefix to match 
    724  
    725     # The following regular expression substitution looks for <a or <image, followed by any other  
    726     # attributes and values until it comes to the FIRST (indicated by ?) href= or src=  
    727     # followed by " or ' or no quotes at all around the path, followed by the associated folder's pathname  
    728     # followed by characters (for the img filename), then finally the optional closing quotes  
    729     # in " or ' form, followed by any other attributes and values until the first > to end the tag. 
    730     # The substitution: all the parts preceding associated folder's pathname are retained, 
    731     # the associated folder path name is replaced by associated folder directory name 
    732     # and the rest upto and including the closing > tag is retained. 
    733     # The sg at the end of the pattern match treats all of html_contents as a single line (s)  
    734     # and performs a global replace (g) meaning that all occurrences that match in that single line 
    735     # are substituted. 
    736     $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)$safe_reg_expression(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg; 
    737                #$html_contents =~ s/$safe_reg_expression/$assoc_dirname/gs; # this works, used as fall-back 
    738     # now replace any %20 chars in filenames of href or src attributes to use literal space ' '. Calls a function for this 
    739     $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&post_process_assocfile_urls($1, $5, $6)/sge; 
    740  
    741     #print STDERR "****assoc_dirname: $assoc_dirname***\n"; 
    742     #print STDERR "****safe_reg_expression: $safe_reg_expression***\n"; 
    743     
    744     # delete the original file and recreate it 
    745     my $copy_of_filename = $html_file; 
    746     &util::rm($copy_of_filename); # deleted the file 
    747  
    748     # Recreate the original file for writing the updated contents 
    749     unless(open(FOUT, ">$html_file")) {  # open it as a new file for writing 
    750     print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n"; 
    751     return 0; 
    752     } 
    753  
    754     # write out the updated contents and close the file 
    755     print FOUT $html_contents; 
    756     close(FOUT); 
    757     return 1; 
    758 } 
    759  
    760 # Utility routine to make sure HTML plugin gets img src/href link pathnames that contain  
    761 # url slashes (/) instead of windows-style backwards slashes, and to convert all %20  
    762 # introduced in link pathnames by wvWare into space again. Converts all percent signs 
    763 # introduced by URL encoding filenames generated into %25 in these url links referencing them 
    764 sub post_process_assocfile_urls 
    765 { 
    766     my ($pre, $text, $post) = @_; 
    767  
    768     $text =~ s/%20/ /g; # Convert %20s to space and not underscore since underscores mess with incremental rebuild  
    769     # $text =~ s/%20/_/g; # reinstated this line, since we no longer replace spaces with %20. We replace them with underscores 
    770     $text =~ s/\\/\//g; 
    771     $text =~ s/%/%25/g; 
    772  
    773     return "$pre$text$post"; 
     524    # need to ensure that the path to perl is quoted (in case there's spaces in it) 
     525    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl $input_filename $output_filestem $faillogfile $timeout";     
     526 
     527#    print STDERR "***** wvware launch cmd = $launch_cmd\n"; 
     528 
     529    $wvware_status = system($launch_cmd)/256; 
     530    return $wvware_status; 
    774531} 
    775532