Context Navigation

← Previous Changeset
Next Changeset →

Changeset 15120

Timestamp:

2008-03-20T21:08:15+13:00 (16 years ago)

Author:

ak19

Message:

Changes to work with new script replace_srcdoc_with_html.pl: added to subroutine doc_to_html, created new subs make_links_to_assocdir_relative and is_dir_empty

File:

: 1 edited

gsdl/trunk/bin/script/gsConvert.pl (modified) (5 diffs)

Legend:

: Unmodified
: Added
: Removed

gsdl/trunk/bin/script/gsConvert.pl

-              r12704
+              r15120
+    }
+    }
     return &convertAnything($input_filename, $output_filestem, $output_type);
+}
 …
                       "packages", "wv", "wvHtml.xml");
+    # Added the following to work with replace_srcdoc_with_html.pl:
+    # Make wvWare put any associated (image) files of the word doc into
+    # folder docname-without-extension_files. This folder should be at
+    # the same level as the html file generated from the doc.
+    # wvWare will take care of proper interlinking.
+    # This step is necessary for replace_srcdoc_with_html.pl which will
+    # move the html and associated files into the import folder. We
+    # want to ensure that the associated files won't overwrite similarly
+    # named items already in import. Hence we put them in a folder first
+    # (to which the html links properly) and that will allow
+    # replace_srcdoc_with_html.pl to move them safely to /import.
+    # To do all this, we need to use wvWare's --dir and --basename options
+    # where dir is the full path to the image folder directory and
+    # basename is the full path to the image folder appended to the name
+    # which is to be prepended to every image file:
+    # eg. if the images were to have names like sample0.jpg to sampleN.jpg,
+    # then the basename is "/full/path/to/imgdir/sample".
+    # In this case, basename is the full path to and name of the document.
+    # HOWEVER: basename always takes full path, not relative url, so
+    # the greenstone browser is unable to display the images (absolute paths
+    # cause it to give an "external link" message)
+    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
+    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
+    # "added --dir option to wvHtml so that pictures can be placed in
+    # a seperate directory"
+    # "running wvWare through IMP to view word documents as html. It gets
+    # invoked like this:
+    # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"
+    # toppath is the folder where html is generated
+    # docname is the name (without extension) of the html to be generated
+    # suffix (extension) is thrown away
+    my ($docname, $toppath)
+    = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
+    # We want the image folder generated to have the same name as windows
+    # would generate ($windows_scripting) when it converts from word to html.
+    # That is, foldername=docname_files
+    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");
+    #print "assoc_dir: ".$assoc_dir."\n";  # same as "$output_filestem._files"
+    # ensure this image directory exists
+    # if it exists already, just delete and recreate
+    if(-e $assoc_dir) {
+    &util::rm_r($assoc_dir);
+    }
+    &util::mk_dir($assoc_dir);
+    # the images are all going to be called image0, image1,..., imageN
+    my $img_basenames = &util::filename_cat($assoc_dir, $docname);
+    #print STDERR "****toppath: $toppath\n****docname: $docname\n;
+    #print STDERR "****img_basenames: $img_basenames\n" if($img_basenames);
+    #print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir);
     my $cmd = "";
     if ($timeout) {$cmd = "ulimit -t $timeout;";}
+    $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
+    # wvWare's --dir and --basename options for image directory.
+    # Replaced the next line with the *2 lines* following it:
+               # $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
+    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
+    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
     $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
     # redirecting STDERR is a bad idea on windows 95/98
     $cmd .= " 2> \"$output_filestem.err\""
 …
     # Was the conversion successful?
     if (-s "$output_filestem.html") {
+    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
     open(TMP, "$output_filestem.html");
     $line = <TMP>;
     close(TMP);
     if ($line && $line =~ /DOCTYPE HTML/) {
+        &util::rm("$output_filestem.err") if -e "$output_filestem.err";
+        &util::rm("$output_filestem.err") if -e "$output_filestem.err";
+        # Inserted this code to remove the images directory if it was still empty after
+        # the html was generated (in case there were no images in the word document)
+        if(is_dir_empty($assoc_dir)) {
+        print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n";
+        &util::rm_r($assoc_dir);
+        } else { # there was an image folder (it was generated)
+        # Therefore, the html file generated contains absolute links to the images
+        # If the folder contains images
+        # Replace them with relative links instead, so it can be moved elsewhere
+        make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
+        }
         return 1;
+    }
 …
     return 0;
+}
+# Helper for doc_to_html - Word docs might contain images.
+# When such word docs are converted with wvWare, we make it generate a
+# <filename>_files folder with the associated images, while the html file
+# <filename> refers to the images using absolute paths to <filename>_files.
+# This sub reads in that html file and replaces the absolute paths to the
+# images in <filename>_files with paths relative to the html file
+# (i.e. with <filename>_files/<imagename.ext>), then rewrites the file.
+#
+# Parameters:
+#   $toppath        - top-level folder in which the html file resides
+#                     (accepted for the caller's convenience; not used here)
+#   $docname        - name (without extension) of the html file
+#                     (accepted for the caller's convenience; not used here)
+#   $html_file      - full path to the html file: /full/path/docname.html
+#   $assoc_dir_path - full path of the associated folder: toppath/docname_files
+#   $assoc_dirname  - bare directory name of that folder: docname_files
+#
+# On failure to open the html file for reading or writing, a message is
+# printed to STDERR and the sub returns without touching anything further.
+sub make_links_to_assocdir_relative {
+    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;
+
+    # 1. Read all the contents of the html into a single string.
+    my $fin;
+    unless (open($fin, '<', $html_file)) {
+        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR\n";
+        return;
+    }
+    # Slurp the file. $/ is localised so that the record separator is only
+    # undefined for this one read; assigning it globally would make every
+    # later line-by-line read in the script return a whole file.
+    my $html_contents = do { local $/; <$fin> };
+    close($fin);
+    $html_contents = "" unless defined $html_contents;
+
+    # 2. Replace *all* occurrences of the assoc_dir_path in a href and img src
+    # values with assoc_dirname.
+    # The pattern looks for <a or <img, followed by any other attributes and
+    # values until the first href= or src=, an optional opening " or ',
+    # the associated folder's full path, the rest of the value (the image
+    # filename), an optional closing quote and anything up to the first >
+    # that ends the tag. Everything except the folder path is retained.
+    # \Q...\E makes the path match literally: it may contain characters such
+    # as '.', '+', '(' or backslashes that are special in a regex.
+    # Modifiers: s lets . match newlines so a tag may span several lines,
+    # g substitutes every occurrence.
+    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)\Q$assoc_dir_path\E(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg;
+
+    # 3. Delete the original file and recreate it with the updated contents.
+    # NOTE(review): a copy of the name is handed to util::rm, as in the
+    # original code - presumably to protect $html_file; confirm against util.pm.
+    my $copy_of_filename = $html_file;
+    &util::rm($copy_of_filename);
+
+    my $fout;
+    unless (open($fout, '>', $html_file)) {
+        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR\n";
+        return;
+    }
+    print $fout $html_contents;
+    # Buffered write errors only surface at close, so report them.
+    close($fout)
+        or print STDERR "gsConvert.pl: Error closing $html_file after writing relative links: $!\n";
+}
+# Checks whether a directory is empty (note that an empty directory still
+# has non-zero size, so -s cannot be used for this).
+# Based on code from
+# http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
+#
+# Parameter: $path - the directory to inspect.
+# Returns 1 if the directory contains nothing besides '.' and '..',
+# 0 if it contains at least one other entry. If the directory cannot be
+# opened, a message is printed to STDERR and 0 is returned: a folder whose
+# contents could not be inspected must not be reported as empty, since the
+# caller removes empty folders.
+sub is_dir_empty
+{
+    my ($path) = @_;
+
+    my $dh;
+    unless (opendir($dh, $path)) {
+        print STDERR "gsConvert.pl: Unable to open directory $path to check whether it is empty: $!\n";
+        return 0;
+    }
+
+    my $is_empty = 1;
+    # Test with defined() so that an entry literally named "0" is not
+    # mistaken for the end of the listing.
+    while (defined(my $entry = readdir($dh))) {
+        # skip only the exact '.' and '..' entries
+        next if ($entry eq "." || $entry eq "..");
+        $is_empty = 0;
+        last;
+    }
+    closedir($dh);
+    return $is_empty;
+}
 …
       return 0;
+    }
+    print STDERR "\n**** In any to text****\n\n";
     open(IN, "<$input_filename") || return 0;
     binmode(IN);

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 15120

Legend:

gsdl/trunk/bin/script/gsConvert.pl

Download in other formats: