Changeset 15152

Show
Ignore:
Timestamp:
29.03.2008 17:53:51 (11 years ago)
Author:
ak19
Message:

Regular expression in make_links_to_assocdir_relative is corrected and a utility subroutine is added

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/bin/script/gsConvert.pl

    r15120 r15152  
    614614        # Inserted this code to remove the images directory if it was still empty after  
    615615        # the html was generated (in case there were no images in the word document) 
    616         if(is_dir_empty($assoc_dir)) { 
    617         print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n"; 
     616        if(&is_dir_empty($assoc_dir)) { 
     617        #print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n"; 
    618618        &util::rm_r($assoc_dir); 
    619619        } else { # there was an image folder (it was generated) 
     
    621621        # If the folder contains images 
    622622        # Replace them with relative links instead, so it can be moved elsewhere 
    623         make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");     
     623        &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");    
    624624        } 
    625625        return 1; 
     
    640640     
    641641    return 0; 
     642} 
     643 
     644 
     645# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)  
     646# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831 
     647sub is_dir_empty 
     648{ 
     649    my ($path) = @_; 
     650    opendir DIR, $path; 
     651    while(my $entry = readdir DIR) { 
     652        next if($entry =~ /^\.\.?$/); 
     653        closedir DIR; 
     654        return 0; 
     655    } 
     656    closedir DIR; 
     657    return 1; 
    642658} 
    643659 
     
    661677    unless(open(FIN, "<$html_file")) {  
    662678    print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR\n"; 
    663     return; 
     679    return 0; 
    664680    } 
    665681    # From http://perl.plover.com/local.html 
     
    667683    # (Some people call this slurping the file.) Perl has a special feature to support this:  
    668684    # If the $/ variable is undefined, the <...> operator will read the entire file all at once" 
    669     $/ = undef;                 # Read entire file at once 
    670     my $html_contents = <FIN>;  # Now file is read in as one single 'line' 
     685    my $html_contents; 
     686    { 
     687    local $/ = undef;        # Read entire file at once 
     688    $html_contents = <FIN>;  # Now file is read in as one single 'line' 
     689    } 
    671690    close(FIN); # close the file 
    672     print STDERR $html_contents; 
     691    #print STDERR $html_contents; 
    673692    
    674693    # 2. Replace (substitute) *all* occurrences of the assoc_dir_path in a hrefs and img src 
     
    677696    # all new lines as a regular space. This interacts with g to consider all the lines  
    678697    # together as a single line so that multi-occurrences can be replaced. 
    679      
     698 
     699    # we can't just replace $assoc_dir_path with $assoc_dir 
     700    # $assoc_dir_path represents a regular expression that needs to be replaced 
     701    # if it contains ., -, [ or ] -- which all have special meaning in Perl regular expressions -- 
     702    # we need to escape these first 
     703    my $safe_reg_expression = $assoc_dir_path; 
     704    $safe_reg_expression =~ s/\./\\./g; 
     705    $safe_reg_expression =~ s/\-/\\-/g; 
     706    $safe_reg_expression =~ s/\[/\\[/g; 
     707    $safe_reg_expression =~ s/\]/\\]/g; 
     708    $safe_reg_expression =~ s/ /%20/g; # wvWare put %20 in place of space, so we need to change our prefix to match 
     709 
    680710    # The following regular expression substitution looks for <a or <image, followed by any other  
    681711    # attributes and values until it comes to the FIRST (indicated by ?) href= or src=  
     
    689719    # and performs a global replace (g) meaning that all occurrences that match in that single line 
    690720    # are substituted. 
    691     $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)$assoc_dir_path(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg; 
    692     #$html_contents =~ s/$assoc_dir_path/$assoc_dirname/gs; # this works, used as fall-back 
    693  
     721    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)$safe_reg_expression(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg; 
     722               #$html_contents =~ s/$safe_reg_expression/$assoc_dirname/gs; # this works, used as fall-back 
     723    # now replace any %20 chars in filenames of href or src attributes to use literal space ' '. Calls a function for this 
     724    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&percent_twenty_to_space($1, $5, $6)/sge; 
     725    
     726    #print STDERR "assoc_dirname: ****$assoc_dirname***\n"; 
     727    #print STDERR "safe_reg_expression: ****$safe_reg_expression***\n"; 
     728    
    694729    # delete the original file and recreate it 
    695730    my $copy_of_filename = $html_file; 
     
    699734    unless(open(FOUT, ">$html_file")) {  # open it as a new file for writing 
    700735    print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR\n"; 
    701     return; 
     736    return 0; 
    702737    } 
    703738    # write out the updated contents and close the file 
    704739    print FOUT $html_contents; 
    705740    close(FOUT); 
    706     
    707 } 
    708  
    709 # A method to check if directory is empty (note that an empty directory still has non-zero size!!!)  
    710 # Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831 
    711 sub is_dir_empty 
     741    return 1; 
     742} 
     743 
     744# Utility routine to convert all %20 introduced by wvWare in link pathnames into space again 
     745sub percent_twenty_to_space 
    712746{ 
    713     my ($path) = @_; 
    714     opendir DIR, $path; 
    715     while(my $entry = readdir DIR) { 
    716         next if($entry =~ /^\.\.?$/); 
    717         closedir DIR; 
    718         return 0; 
    719     } 
    720     closedir DIR; 
    721     return 1; 
    722 } 
    723  
     747    my ($pre, $text, $post) = @_; 
     748 
     749    $text =~ s/%20/ /g; 
     750 
     751    return "$pre$text$post"; 
     752} 
    724753 
    725754# Attempt to convert a word document to html with the word2html scripting program