Changeset 15120

Show
Ignore:
Timestamp:
20.03.2008 21:08:15 (11 years ago)
Author:
ak19
Message:

Changes to work with new script replace_srcdoc_with_html.pl: added to subroutine doc_to_html, created new subs make_links_to_assocdir_relative and is_dir_empty

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/bin/script/gsConvert.pl

    r12704 r15120  
    226226    } 
    227227    } 
    228  
    229228    return &convertAnything($input_filename, $output_filestem, $output_type); 
    230229} 
     
    506505                      "packages", "wv", "wvHtml.xml"); 
    507506     
     507    # Added the following to work with replace_srcdoc_with_html.pl: 
     508    # Make wvWare put any associated (image) files of the word doc into 
     509    # folder docname-without-extension_files. This folder should be at 
     510    # the same level as the html file generated from the doc.  
     511    # wvWare will take care of proper interlinking.  
     512 
     513    # This step is necessary for replace_srcdoc_with_html.pl which will  
     514    # move the html and associated files into the import folder. We 
     515    # want to ensure that the associated files won't overwrite similarly 
     516    # named items already in import. Hence we put them in a folder first 
     517    # (to which the html links properly) and that will allow 
     518    # replace_srcdoc_with_html.pl to move them safely to /import. 
     519 
     520    # To do all this, we need to use wvWare's --dir and --basename options 
     521    # where dir is the full path to the image folder directory and 
     522    # basename is the full path to the image folder appended to the name  
     523    # which is to be prepended to every image file: 
     524    # eg. if the images were to have names like sample0.jpg to sampleN.jpg, 
     525    # then the basename is "/full/path/to/imgdir/sample".  
     526    # In this case, basename is the full path to and name of the document. 
     527    # HOWEVER: basename always takes full path, not relative url, so 
     528    # the greenstone browser is unable to display the images (absolute paths 
     529    # cause it to give an "external link" message) 
     530    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html 
     531    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html 
     532    # "added --dir option to wvHtml so that pictures can be placed in 
     533    # a seperate directory" 
     534    # "running wvWare through IMP to view word documents as html. It gets 
     535    # invoked like this: 
     536    # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output" 
     537     
     538    # toppath is the folder where html is generated 
     539    # docname is the name (without extension) of the html to be generated 
     540    # suffix (extension) is thrown away 
     541    my ($docname, $toppath)  
     542    = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 
     543 
     544    # We want the image folder generated to have the same name as windows 
     545    # would generate ($windows_scripting) when it converts from word to html. 
     546    # That is, foldername=docname_files 
     547    my $assoc_dir = &util::filename_cat($toppath, $docname."_files"); 
     548    #print "assoc_dir: ".$assoc_dir."\n";  # same as "$output_filestem._files" 
     549     
     550    # ensure this image directory exists 
     551    # if it exists already, just delete and recreate 
     552    if(-e $assoc_dir) {  
     553    &util::rm_r($assoc_dir); 
     554    }   
     555    &util::mk_dir($assoc_dir); 
     556 
     557    # the images are all going to be called <docname>0, <docname>1, ..., <docname>N 
     558    my $img_basenames = &util::filename_cat($assoc_dir, $docname); 
     559     
     560    #print STDERR "****toppath: $toppath\n****docname: $docname\n"; 
     561    #print STDERR "****img_basenames: $img_basenames\n" if($img_basenames); 
     562    #print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir); 
     563 
    508564    my $cmd = ""; 
    509565    if ($timeout) {$cmd = "ulimit -t $timeout;";} 
    510     $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\""; 
     566    # wvWare's --dir and --basename options for image directory.  
     567    # Replaced the next line with the *2 lines* following it: 
     568               # $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\""; 
     569    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";  
     570    $cmd .= " --charset utf-8 --config \"$wv_conf\""; 
    511571    $cmd .= " \"$input_filename\" > \"$output_filestem.html\""; 
    512      
     572 
    513573    # redirecting STDERR is a bad idea on windows 95/98 
    514574    $cmd .= " 2> \"$output_filestem.err\"" 
     
    545605    # Was the conversion successful? 
    546606 
    547     if (-s "$output_filestem.html") { 
     607    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents) 
    548608    open(TMP, "$output_filestem.html"); 
    549609    $line = <TMP>; 
    550610    close(TMP); 
    551611    if ($line && $line =~ /DOCTYPE HTML/) { 
    552         &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 
     612        &util::rm("$output_filestem.err") if -e "$output_filestem.err";     
     613 
     614        # Inserted this code to remove the images directory if it was still empty after  
     615        # the html was generated (in case there were no images in the word document) 
     616        if(is_dir_empty($assoc_dir)) { 
     617        print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n"; 
     618        &util::rm_r($assoc_dir); 
     619        } else { # there was an image folder (it was generated) 
     620        # Therefore, the html file generated contains absolute links to the images 
     621        # If the folder contains images 
     622        # Replace them with relative links instead, so it can be moved elsewhere 
     623        make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");     
     624        } 
    553625        return 1; 
    554626    } 
     
    568640     
    569641    return 0; 
     642} 
     643 
     644# Method to work with doc_to_html - Word docs might contain images. 
     645# When such word docs are converted with wvWare, we make it generate a  
     646# <filename>_files folder with the associated images, while the html file 
     647# <filename> refers to the images using absolute paths to <filename>_files. 
     648# This method reads in that html file and replaces all the absolute paths to  
     649# the images in <filename>_files with the relative paths to the images from 
     650# that folder. (I.e. with <filename>_files/<imagename.ext>). 
     651sub make_links_to_assocdir_relative{ 
     652    # toppath is the top-level folder in which the html file we're going to be fixing resides 
     653    # docname is just the name (without extension) of the html file 
     654    # html_file is the full path to the html file: /full/path/docname.html 
     655    # assoc_dir_path is toppath/docname_files 
     656    # assoc_dirname is the directory name of the folder with associated imgs: docname_files 
     657    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_; 
     658 
     659    # 1. Read all the contents of the html into a string 
     660    # open the original file for reading 
     661    unless(open(FIN, "<$html_file")) {  
     662    print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR\n"; 
     663    return; 
     664    } 
     665    # From http://perl.plover.com/local.html 
     666    # "It's cheaper to read the file all at once, without all the splitting and reassembling.  
     667    # (Some people call this slurping the file.) Perl has a special feature to support this:  
     668    # If the $/ variable is undefined, the <...> operator will read the entire file all at once" 
     669    $/ = undef;                 # Read entire file at once 
     670    my $html_contents = <FIN>;  # Now file is read in as one single 'line' 
     671    close(FIN); # close the file 
     672    print STDERR $html_contents; 
     673    
     674    # 2. Replace (substitute) *all* ocurrences of the assoc_dir_path in a hrefs and img src 
     675    # values with assoc_dirname 
     676    # At the end: g means substitute all occurrences (global), while s at the end means treat  
     677    # all new lines as a regular space. This interacts with g to consider all the lines  
     678    # together as a single line so that multi-occurrences can be replaced. 
     679     
     680    # The following regular expression substitution looks for <a or <image, followed by any other  
     681    # attributes and values until it comes to the FIRST (indicated by ?) href= or src=  
     682    # followed by " or ' no quotes at all around path, followed by the associated folder's pathname  
     683    # followed by characters (for the img filename), then finally the optional closing quotes  
     684    # in " or ' form, followed by any other attributes and values until the first > to end the tag. 
     685    # The substitution: all the parts preceding associated folder's pathname are retained, 
     686    # the associated folder path name is replaced by associated folder directory name 
     687    # and the rest upto and including the closing > tag is retained. 
     688    # The sg at the end of the pattern match treats all of html_contents as a single line (s)  
     689    # and performs a global replace (g) meaning that all occurrences that match in that single line 
     690    # are substituted. 
     691    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)$assoc_dir_path(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg; 
     692    #$html_contents =~ s/$assoc_dir_path/$assoc_dirname/gs; # this works, used as fall-back 
     693 
     694    # delete the original file and recreate it 
     695    my $copy_of_filename = $html_file; 
     696    &util::rm($copy_of_filename); # deleted the file 
     697 
     698    # Recreate the original file for writing the updated contents 
     699    unless(open(FOUT, ">$html_file")) {  # open it as a new file for writing 
     700    print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR\n"; 
     701    return; 
     702    } 
     703    # write out the updated contents and close the file 
     704    print FOUT $html_contents; 
     705    close(FOUT); 
     706    
     707} 
     708 
     709# A method to check if directory is empty (note that an empty directory still has non-zero size!!!)  
     710# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831 
     711sub is_dir_empty 
     712{ 
     713    my ($path) = @_; 
     714    opendir DIR, $path; 
     715    while(my $entry = readdir DIR) { 
     716        next if($entry =~ /^\.\.?$/); 
     717        closedir DIR; 
     718        return 0; 
     719    } 
     720    closedir DIR; 
     721    return 1; 
    570722} 
    571723 
     
    12501402      return 0; 
    12511403    } 
    1252      
     1404 
     1405    print STDERR "\n**** In any to text****\n\n"; 
    12531406    open(IN, "<$input_filename") || return 0; 
    12541407    binmode(IN);