Changeset 24371


Ignore:
Timestamp:
2011-08-08T21:10:36+12:00 (11 years ago)
Author:
ak19
Message:

Ticket 779: the new wvware.pl script sets up the environment that wvWare needs by pointing LD_LIBRARY_PATH at gnome-lib-minimal in the extension folder, if that folder exists. gsConvert.pl now calls wvware.pl to run wvWare (also checked with the "replace source document with HTML" right-click menu option), and the Perl script can also be launched directly from the command prompt to do the conversion.

Location:
main/trunk/greenstone2/bin/script
Files:
1 added
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r24362 r24371  
    520520    my ($input_filename, $output_filestem) = @_;
    521521
    522     my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");
    523 
    524     if ( -d "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv" && $ENV{'GSDLOS'} eq "linux" ) {
    525         $ENV{'PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/bin:$ENV{'PATH'}";
    526         $ENV{'LD_LIBRARY_PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/lib:$ENV{'LD_LIBRARY_PATH'}";
    527         $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
    528     }
    529 
    530     # don't include path on windows (to avoid having to play about
    531     # with quoting when GSDLHOME might contain spaces) but assume
    532     # that the PATH is set up correctly
    533     $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
    534 
    535     my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
    536                       "packages", "wv", "wvHtml.xml");
     522    my $wvware_status = 0;
    537523   
    538     # Added the following to work with replace_srcdoc_with_html.pl:
    539     # Make wvWare put any associated (image) files of the word doc into
    540     # folder docname-without-extension_files. This folder should be at
    541     # the same level as the html file generated from the doc.
    542     # wvWare will take care of proper interlinking.
    543 
    544     # This step is necessary for replace_srcdoc_with_html.pl which will
    545     # move the html and associated files into the import folder. We
    546     # want to ensure that the associated files won't overwrite similarly
    547     # named items already in import. Hence we put them in a folder first
    548     # (to which the html links properly) and that will allow
    549     # replace_srcdoc_with_html.pl to move them safely to /import.
    550 
    551     # To do all this, we need to use wvWare's --dir and --basename options
    552     # where dir is the full path to the image folder directory and
    553     # basename is the full path to the image folder appended to the name
    554     # which is to be prepended to every image file:
    555     # eg. if the images were to have names like sample0.jpg to sampleN.jpg,
    556     # then the basename is "/full/path/to/imgdir/sample".
    557     # In this case, basename is the full path to and name of the document.
    558     # HOWEVER: basename always takes full path, not relative url, so
    559     # the greenstone browser is unable to display the images (absolute paths
    560     # cause it to give an "external link" message)
    561     # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    562     # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
    563     # "added --dir option to wvHtml so that pictures can be placed in
    564     # a seperate directory"
    565     # "running wvWare through IMP to view word documents as html. It gets
    566     # invoked like this:
    567     # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"
    568    
    569     # toppath is the folder where html is generated
    570     # docname is the name (without extension) of the html to be generated
    571     # suffix (extension) is thrown away
    572     my ($docname, $toppath)
    573     = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    574 
    575     # We want the image folder generated to have the same name as windows
    576     # would generate ($windows_scripting) when it converts from word to html.
    577     # That is, foldername=docname_files
    578     my $assoc_dir = &util::filename_cat($toppath, $docname."_files");
    579     #print "assoc_dir: ".$assoc_dir."\n";  # same as "$output_filestem._files"
    580    
    581     # ensure this image directory exists
    582     # if it exists already, just delete and recreate
    583     if(-e $assoc_dir) {
    584     &util::rm_r($assoc_dir);
    585     } 
    586     &util::mk_dir($assoc_dir);
    587 
    588     # the images are all going to be called image0, image1,..., imageN
    589     my $img_basenames = &util::filename_cat($assoc_dir, $docname);
    590    
    591     #print STDERR "****toppath: $toppath\n****docname: $docname\n;
    592     #print STDERR "****img_basenames: $img_basenames\n" if($img_basenames);
    593     #print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir);
    594 
    595     my $cmd = "";
    596     if ($timeout) {$cmd = "ulimit -t $timeout;";}
    597     # wvWare's --dir and --basename options for image directory.
    598     # Replaced the next line with the *2 lines* following it:
    599                # $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    600     $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    601     $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    602     $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
    603 
    604     # redirecting STDERR is a bad idea on windows 95/98
    605     $cmd .= " 2> \"$output_filestem.err\""
    606     if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    607     # execute the command
    608     $!=0;
    609     if (system($cmd)!=0)
    610     {
    611     print STDERR "Error executing wv converter:$!\n";
    612     if (-s "$output_filestem.err") {
    613         open (ERRFILE, "<$output_filestem.err");
    614 
    615         my $write_to_fail_log=0;
    616         if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
    617         {$write_to_fail_log=1;}
    618 
    619         my $line;
    620         while ($line=<ERRFILE>) {
    621         if ($line =~ m/\w/) {
    622             print STDERR "$line";
    623             print FAILLOG "$line" if ($write_to_fail_log);
    624         }
    625         if ($line !~ m/startup error/) {next;}
    626         print STDERR " (given an invalid .DOC file?)\n";
    627         print FAILLOG " (given an invalid .DOC file?)\n"
    628         if ($write_to_fail_log);
    629        
    630         } # while ERRFILE
    631         close FAILLOG if ($write_to_fail_log);
    632     }
    633     return 0; # we can try any_to_text
    634     }
    635 
    636     # Was the conversion successful?
    637 
    638     if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
    639     open(TMP, "$output_filestem.html");
    640     my $line = <TMP>;
    641     close(TMP);
    642     if ($line && $line =~ m/DOCTYPE HTML/) {
    643         &util::rm("$output_filestem.err") if -e "$output_filestem.err";   
    644 
    645         # Inserted this code to remove the images directory if it was still empty after
    646         # the html was generated (in case there were no images in the word document)
    647         if (&util::is_dir_empty($assoc_dir)) {
    648         #print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n";
    649         &util::rm_r($assoc_dir);
    650         } else { # there was an image folder (it was generated)
    651         # Therefore, the html file generated contains absolute links to the images
    652         # Replace them with relative links instead, so the folder can be moved elsewhere
    653         &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");   
    654         }
    655         return 1;
    656     }
    657     }
    658    
    659     # If here, an error of some sort occurred
    660     &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    661     if (-e "$output_filestem.err") {
    662     if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
    663         open (ERRLOG,"$output_filestem.err");
    664         while (<ERRLOG>) {print FAILLOG $_;}
    665         close FAILLOG;
    666         close ERRLOG;
    667     }
    668     &util::rm("$output_filestem.err");
    669     }
    670    
    671     return 0;
    672 }
    673 
    674 # Method to work with doc_to_html - Word docs might contain images.
    675 # When such word docs are converted with wvWare, we make it generate a
    676 # <filename>_files folder with the associated images, while the html file
    677 # <filename> refers to the images using absolute paths to <filename>_files.
    678 # This method reads in that html file and replaces all the absolute paths to
    679 # the images in <filename>_files with the relative paths to the images from
    680 # that folder. (I.e. with <filename>_files/<imagename.ext>).
    681 sub make_links_to_assocdir_relative{
    682     # toppath is the top-level folder in which the html file we're going to be fixing resides
    683     # docname is just the name (without extension) of the html file
    684     # html_file is the full path to the html file: /full/path/docname.html
    685     # assoc_dir_path is toppath/docname_files
    686     # assoc_dirname is the directory name of the folder with associated imgs: docname_files
    687     my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;
    688 
    689     # 1. Read all the contents of the html into a string
    690     # open the original file for reading
    691     unless(open(FIN, "<$html_file")) {
    692     print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
    693     return 0;
    694     }
    695     # From http://perl.plover.com/local.html
    696     # "It's cheaper to read the file all at once, without all the splitting and reassembling.
    697     # (Some people call this slurping the file.) Perl has a special feature to support this:
    698     # If the $/ variable is undefined, the <...> operator will read the entire file all at once"
    699     my $html_contents;
    700     {
    701     local $/ = undef;        # Read entire file at once
    702     $html_contents = <FIN>;  # Now file is read in as one single 'line'
    703     }
    704     close(FIN); # close the file
    705     #print STDERR $html_contents;
    706    
    707     # 2. Replace (substitute) *all* occurrences of the assoc_dir_path in a href and img src
    708     # values with assoc_dirname
    709     # At the end: g means substitute all occurrences (global), while s makes . match
    710     # newline characters too, so a single match can span several lines of the html. Together
    711     # they allow every occurrence in the whole file to be replaced.
    712 
    713     # we can't just replace $assoc_dir_path with $assoc_dir
    714     # $assoc_dir_path represents a regular expression that needs to be replaced
    715     # if it contains ., -, [, ], or Windows style backslashes in paths  -- which all have special
    716     # meaning in Perl regular expressions -- we need to escape these first
    717     my $safe_reg_expression = $assoc_dir_path;
    718     $safe_reg_expression =~ s/\\/\\\\/g;
    719     $safe_reg_expression =~ s/\./\\./g;
    720     $safe_reg_expression =~ s/\-/\\-/g;
    721     $safe_reg_expression =~ s/\[/\\[/g;
    722     $safe_reg_expression =~ s/\]/\\]/g;
    723     $safe_reg_expression =~ s/ /%20/g; # wvWare put %20 in place of space, so we need to change our prefix to match
    724 
    725     # The following regular expression substitution looks for <a or <img, followed by any other
    726     # attributes and values until it comes to the FIRST (indicated by ?) href= or src=
    727     # followed by ", ' or no quotes at all around the path, followed by the associated folder's pathname
    728     # followed by characters (for the img filename), then finally the optional closing quotes
    729     # in " or ' form, followed by any other attributes and values until the first > to end the tag.
    730     # The substitution: all the parts preceding associated folder's pathname are retained,
    731     # the associated folder path name is replaced by associated folder directory name
    732     # and the rest upto and including the closing > tag is retained.
    733     # The sg at the end of the pattern match treats all of html_contents as a single line (s)
    734     # and performs a global replace (g) meaning that all occurrences that match in that single line
    735     # are substituted.
    736     $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)$safe_reg_expression(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg;
    737                #$html_contents =~ s/$safe_reg_expression/$assoc_dirname/gs; # this works, used as fall-back
    738     # now replace any %20 chars in filenames of href or src attributes to use literal space ' '. Calls a function for this
    739     $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&post_process_assocfile_urls($1, $5, $6)/sge;
    740 
    741     #print STDERR "****assoc_dirname: $assoc_dirname***\n";
    742     #print STDERR "****safe_reg_expression: $safe_reg_expression***\n";
    743    
    744     # delete the original file and recreate it
    745     my $copy_of_filename = $html_file;
    746     &util::rm($copy_of_filename); # deleted the file
    747 
    748     # Recreate the original file for writing the updated contents
    749     unless(open(FOUT, ">$html_file")) {  # open it as a new file for writing
    750     print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
    751     return 0;
    752     }
    753 
    754     # write out the updated contents and close the file
    755     print FOUT $html_contents;
    756     close(FOUT);
    757     return 1;
    758 }
    759 
    760 # Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
    761 # url slashes (/) instead of windows-style backwards slashes, and to convert all %20
    762 # introduced in link pathnames by wvWare into space again. Converts all percent signs
    763 # introduced by URL encoding filenames generated into %25 in these url links referencing them
    764 sub post_process_assocfile_urls
    765 {
    766     my ($pre, $text, $post) = @_;
    767 
    768     $text =~ s/%20/ /g; # Convert %20s to space and not underscore since underscores mess with incremental rebuild
    769     # $text =~ s/%20/_/g; # reinstated this line, since we no longer replace spaces with %20. We replace them with underscores
    770     $text =~ s/\\/\//g;
    771     $text =~ s/%/%25/g;
    772 
    773     return "$pre$text$post";
     524    # need to ensure that the path to perl is quoted (in case there's spaces in it)
     525    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl $input_filename $output_filestem $faillogfile $timeout";   
     526
     527#    print STDERR "***** wvware launch cmd = $launch_cmd\n";
     528
     529    $wvware_status = system($launch_cmd)/256;
     530    return $wvware_status;
    774531}
    775532
Note: See TracChangeset for help on using the changeset viewer.