Changeset 16435

Show
Ignore:
Timestamp:
16.07.2008 17:03:13 (11 years ago)
Author:
ak19
Message:

1. Correction to sub make_links_to_assocdir_relative, which processes the associated directories generated for Word documents that have been converted to HTML, so that it now works on Windows: it now handles Windows-style backslashes in paths. 2. Minor cosmetic changes: regular expressions whose operator type was not specified now have an explicit m prepended to them.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/bin/script/gsConvert.pl

    r15168 r16435  
    215215 
    216216    my $success = 0; 
    217     if (!$output_type || ($output_type =~ /html/i)){ 
     217    if (!$output_type || ($output_type =~ m/html/i)){ 
    218218    if ($windows_scripting) { 
    219219        $success = &native_doc_to_html($input_filename, $output_filestem); 
     
    238238 
    239239    # Attempt specialised conversion to HTML 
    240     if (!$output_type || ($output_type =~ /html/i)) { 
     240    if (!$output_type || ($output_type =~ m/html/i)) { 
    241241 
    242242    if ($windows_scripting) { 
     
    266266   
    267267    # Attempt simple conversion to HTML 
    268     if (!$output_type || ($output_type =~ /html/i)) { 
     268    if (!$output_type || ($output_type =~ m/html/i)) { 
    269269    $success = &any_to_html($input_filename, $output_filestem); 
    270270    if ($success) { 
     
    274274 
    275275    # Convert to text 
    276     if (!$output_type || ($output_type =~ /text/i)) { 
     276    if (!$output_type || ($output_type =~ m/text/i)) { 
    277277    $success = &any_to_text($input_filename, $output_filestem); 
    278278    if ($success) { 
     
    293293    $output_type =~ s/.*\-(.*)/$1/i; 
    294294    # Attempt coversion to Image 
    295     if ($output_type =~ /jp?g|gif|png/i) { 
     295    if ($output_type =~ m/jp?g|gif|png/i) { 
    296296    $success = &pdf_to_img($dirname, $input_filename, $output_filestem, $output_type); 
    297297    if ($success){ 
     
    301301 
    302302    # Attempt conversion to HTML 
    303     if (!$output_type || ($output_type =~ /html/i)) { 
     303    if (!$output_type || ($output_type =~ m/html/i)) { 
    304304    $success = &pdf_to_html($dirname, $input_filename, $output_filestem); 
    305305    if ($success) { 
     
    309309 
    310310    # Attempt conversion to TEXT 
    311     if (!$output_type || ($output_type =~ /text/i)) { 
     311    if (!$output_type || ($output_type =~ m/text/i)) { 
    312312    $success = &pdf_to_text($dirname, $input_filename, $output_filestem); 
    313313    if ($success) { 
     
    329329    $output_type =~ s/.*\-(.*)/$1/i; 
    330330    # Attempt coversion to Image 
    331     if ($output_type =~ /jp?g|gif|png/i) { 
     331    if ($output_type =~ m/jp?g|gif|png/i) { 
    332332    $success = &ps_to_img($dirname, $input_filename, $output_filestem, $output_type); 
    333333    if ($success){ 
     
    337337 
    338338    # Attempt conversion to TEXT 
    339     if (!$output_type || ($output_type =~ /text/i)) { 
     339    if (!$output_type || ($output_type =~ m/text/i)) { 
    340340    $success = &ps_to_text($input_filename, $output_filestem); 
    341341    if ($success) { 
     
    352352 
    353353    my $ppt_convert_type = ""; 
    354     #if (!$output_type || $windows_scripting ||($output_type !~ /html/i) ||($output_type !~ /text/i)){ 
    355     if ($windows_scripting && ($output_type !~ /html/i) && ($output_type !~ /text/i)){ 
    356     if ($output_type =~ /gif/i) { 
     354    #if (!$output_type || $windows_scripting || ($output_type !~ m/html/i) || ($output_type !~ m/text/i)){ 
     355    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)){ 
     356    if ($output_type =~ m/gif/i) { 
    357357        $ppt_convert_type = "-g"; 
    358     } elsif ($output_type =~ /jp?g/i){ 
     358    } elsif ($output_type =~ m/jp?g/i){ 
    359359        $ppt_convert_type = "-j"; 
    360     } elsif ($output_type =~ /png/i){ 
     360    } elsif ($output_type =~ m/png/i){ 
    361361        $ppt_convert_type = "-p"; 
    362362    } 
    363363    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", 
    364364                       $ENV{'GSDLOS'}, "pptextract"); 
    365     $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ /^windows$/i); 
     365    $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i); 
    366366             
    367367    $cmd = ""; 
     
    374374        $cmd .=  "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\""; 
    375375        $cmd .= " 2>\"$output_filestem.err\"" 
    376         if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 
     376        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 
    377377        if (system($cmd) !=0) { 
    378378        print STDERR "Powerpoint VB Scripting convert failed\n"; 
     
    381381        } 
    382382    } 
    383     } elsif (!$output_type || ($output_type =~ /html/i)) { 
     383    } elsif (!$output_type || ($output_type =~ m/html/i)) { 
    384384    # Attempt conversion to HTML 
    385     #if (!$output_type || ($output_type =~ /html/i)) { 
     385    #if (!$output_type || ($output_type =~ m/html/i)) { 
    386386    # formulate the command 
    387387    $cmd = ""; 
     
    389389    $cmd .= " \"$input_filename\" \"$output_filestem.html\""; 
    390390    $cmd .= " 2>\"$output_filestem.err\"" 
    391         if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 
     391        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 
    392392 
    393393    # execute the command 
     
    416416 
    417417    # Attempt conversion to HTML 
    418     if (!$output_type || ($output_type =~ /html/i)) { 
     418    if (!$output_type || ($output_type =~ m/html/i)) { 
    419419    # formulate the command 
    420420    $cmd = ""; 
     
    422422    $cmd .= " \"$input_filename\" \"$output_filestem.html\""; 
    423423    $cmd .= " 2>\"$output_filestem.err\"" 
    424         if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 
     424        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 
    425425     
    426426     
     
    463463    if ($first) { 
    464464        # check to see if this is an rtf file 
    465         if ($line =~ /^\{\\rtf/) { 
     465        if ($line =~ m/^\{\\rtf/) { 
    466466        close(CHK); 
    467467        return "rtf"; 
     
    471471     
    472472    # is this is a word 6/7/8 document? 
    473     if ($line =~ /Word\.Document\.([678])/) { 
     473    if ($line =~ m/Word\.Document\.([678])/) { 
    474474        close(CHK); 
    475475        return "word$1"; 
     
    500500    # with quoting when GSDLHOME might contain spaces) but assume 
    501501    # that the PATH is set up correctly 
    502     $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i); 
     502    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i); 
    503503 
    504504    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",  
     
    573573    # redirecting STDERR is a bad idea on windows 95/98 
    574574    $cmd .= " 2> \"$output_filestem.err\"" 
    575     if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 
     575    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 
    576576    # execute the command 
    577577    $!=0; 
     
    588588        my $line; 
    589589        while ($line=<ERRFILE>) { 
    590         if ($line =~ /\w/) { 
     590        if ($line =~ m/\w/) { 
    591591            print STDERR "$line"; 
    592592            print FAILLOG "$line" if ($write_to_fail_log); 
     
    609609    $line = <TMP>; 
    610610    close(TMP); 
    611     if ($line && $line =~ /DOCTYPE HTML/) { 
     611    if ($line && $line =~ m/DOCTYPE HTML/) { 
    612612        &util::rm("$output_filestem.err") if -e "$output_filestem.err";     
    613613 
    614614        # Inserted this code to remove the images directory if it was still empty after  
    615615        # the html was generated (in case there were no images in the word document) 
    616         if(&is_dir_empty($assoc_dir)) { 
     616        if (&util::is_dir_empty($assoc_dir)) { 
    617617        #print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n"; 
    618618        &util::rm_r($assoc_dir); 
    619619        } else { # there was an image folder (it was generated) 
    620620        # Therefore, the html file generated contains absolute links to the images 
    621         # If the folder contains images 
    622         # Replace them with relative links instead, so it can be moved elsewhere 
     621        # Replace them with relative links instead, so the folder can be moved elsewhere 
    623622        &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");    
    624623        } 
     
    640639     
    641640    return 0; 
    642 } 
    643  
    644  
    645 # A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)  
    646 # Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831 
    647 sub is_dir_empty 
    648 { 
    649     my ($path) = @_; 
    650     opendir DIR, $path; 
    651     while(my $entry = readdir DIR) { 
    652         next if($entry =~ /^\.\.?$/); 
    653         closedir DIR; 
    654         return 0; 
    655     } 
    656     closedir DIR; 
    657     return 1; 
    658641} 
    659642 
     
    699682    # we can't just replace $assoc_dir_path with $assoc_dir 
    700683    # $assoc_dir_path represents a regular expression that needs to be replaced 
    701     # if it contains ., -, [ or ] -- which all have special meaning in Perl regular expressions -- 
    702     # we need to escape these first 
     684    # if it contains ., -, [, ], or Windows style backslashes in paths  -- which all have special 
     685    # meaning in Perl regular expressions -- we need to escape these first 
    703686    my $safe_reg_expression = $assoc_dir_path; 
     687    $safe_reg_expression =~ s/\\/\\\\/g; 
    704688    $safe_reg_expression =~ s/\./\\./g; 
    705689    $safe_reg_expression =~ s/\-/\\-/g; 
     
    722706               #$html_contents =~ s/$safe_reg_expression/$assoc_dirname/gs; # this works, used as fall-back 
    723707    # now replace any %20 chars in filenames of href or src attributes to use literal space ' '. Calls a function for this 
    724     $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&percent_twenty_to_space($1, $5, $6)/sge; 
    725     
    726     #print STDERR "assoc_dirname: ****$assoc_dirname***\n"; 
    727     #print STDERR "safe_reg_expression: ****$safe_reg_expression***\n"; 
     708    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&post_process_assocfile_urls($1, $5, $6)/sge; 
     709 
     710    print STDERR "****assoc_dirname: $assoc_dirname***\n"; 
     711    print STDERR "****safe_reg_expression: $safe_reg_expression***\n"; 
    728712    
    729713    # delete the original file and recreate it 
     
    736720    return 0; 
    737721    } 
     722 
    738723    # write out the updated contents and close the file 
    739724    print FOUT $html_contents; 
     
    742727} 
    743728 
    744 # Utility routine to convert all %20 introduced by wvWare in link pathnames into space again 
    745 sub percent_twenty_to_space 
     729# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain  
     730# url slashes (/) instead of windows-style backwards slashes, and to convert all %20  
     731# introduced in link pathnames by wvWare into space again  
     732sub post_process_assocfile_urls 
    746733{ 
    747734    my ($pre, $text, $post) = @_; 
    748735 
    749736    $text =~ s/%20/ /g; 
     737    $text =~ s/\\/\//g; 
    750738 
    751739    return "$pre$text$post"; 
     
    759747                       $ENV{'GSDLOS'}, "word2html"); 
    760748 
    761     $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ /^windows$/i); 
     749    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i); 
    762750    if (-e "$output_filestem.html") { 
    763751    print STDERR "*** The conversion file has existed\n"; 
     
    773761    # redirecting STDERR 
    774762    $cmd .= " 2> \"$output_filestem.err\"" 
    775     if ($ENV {'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 
     763    if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 
    776764     
    777765    # execute the command 
     
    789777        my $line; 
    790778        while ($line=<ERRFILE>) { 
    791         if ($line =~ /\w/) { 
     779        if ($line =~ m/\w/) { 
    792780            print STDERR "$line"; 
    793781            print FAILLOG "$line" if ($write_to_fail_log); 
     
    809797    $line = <TMP>; 
    810798    close(TMP); 
    811     if ($line && $line =~ /html/) { 
     799    if ($line && $line =~ m/html/) { 
    812800        &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 
    813801        return 1; 
     
    843831 
    844832    $cmd .= " 2>\"$output_filestem.err\"" 
    845         if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 
     833        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 
    846834 
    847835 
     
    865853 
    866854        if ($past_header == 0) { 
    867         if ($line =~ /<body>/) {$past_header=1;} 
     855        if ($line =~ m/<body>/) {$past_header=1;} 
    868856        next; 
    869857        } 
    870858 
    871859        $line =~ s/<[^>]+>//g; 
    872         if ($line =~ /\w/ && $past_header) {  # we found some content... 
     860        if ($line =~ m/\w/ && $past_header) {  # we found some content... 
    873861        $was_successful=1; 
    874862        last; 
     
    898886 
    899887        # print out header info from src html. 
    900         while (defined($_ = <HTMLSRC>) && $_ =~ /\w/) { 
     888        while (defined($_ = <HTMLSRC>) && $_ =~ m/\w/) { 
    901889        print HTML "$_"; 
    902890        } 
     
    962950    $cmd .= " \"$input_filename\" \"$output_filestem\""; 
    963951     
    964     if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) { 
     952    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) { 
    965953    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\""; 
    966954    } else { 
     
    10291017    $output_type =~ s/.*\_(.*)/$1/i; 
    10301018     $cmd .= "perl -S pdftoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\""; 
    1031     if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) { 
     1019    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) { 
    10321020    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\""; 
    10331021    } else { 
     
    10861074    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\""; 
    10871075 
    1088     if ($ENV{'GSDLOS'} !~ /^windows$/i) { 
     1076    if ($ENV{'GSDLOS'} !~ m/^windows$/i) { 
    10891077    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\""; 
    10901078    } else { 
     
    11051093    my $seen_text=0; 
    11061094    while (($seen_text==0) && ($line=<EXTR_TEXT>)) { 
    1107         if ($line=~ /\w/) {$seen_text=1;} 
     1095        if ($line=~ m/\w/) {$seen_text=1;} 
    11081096    } 
    11091097    close EXTR_TEXT; 
     
    11571145    # if we're on windows we'll fall straight through without attempting 
    11581146    # to use gs 
    1159     if ($ENV{'GSDLOS'} =~ /^windows$/i) { 
     1147    if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 
    11601148    $error = "Windows does not support gs"; 
    11611149 
     
    11811169        # possible for the actual text to start with this, but.... 
    11821170        open PSOUT, "$output_filestem.text"; 
    1183         if (<PSOUT> =~ /^Error: (.*)/) { 
     1171        if (<PSOUT> =~ m/^Error: (.*)/) { 
    11841172        $error="interpreter error - \"$1\""; 
    11851173        } 
     
    12231211 
    12241212    # Make sure this is a ps file... 
    1225     if ($text !~ /^%!/) { 
     1213    if ($text !~ m/^%!/) { 
    12261214        print STDERR "Bad postscript header: not '%!'\n"; 
    12271215        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) 
     
    13341322    $output_type =~ s/.*\_(.*)/$1/i; 
    13351323    $cmd .= "perl -S pstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\""; 
    1336     if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) { 
     1324    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) { 
    13371325    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\""; 
    13381326    } else { 
     
    14061394    $line =~ s/</&lt;/g; 
    14071395    $line =~ s/>/&gt;/g; 
    1408     if ($line =~ /^\s*$/) { 
     1396    if ($line =~ m/^\s*$/) { 
    14091397        print HTML "<p>"; 
    14101398    } else { 
     
    14471435    # delete any string less than 10 characters long 
    14481436    $line =~ s/^.{0,9}$/\n/mg; 
    1449     while ($line =~ /^.{1,9}$/m) { 
     1437    while ($line =~ m/^.{1,9}$/m) { 
    14501438        $line =~ s/^.{0,9}$/\n/mg; 
    14511439        $line =~ s/\n+/\n/sg; 
     
    14571445 
    14581446    # output whatever is left 
    1459     if ($line =~ /[^\n ]/) { 
     1447    if ($line =~ m/[^\n ]/) { 
    14601448        print OUT $line; 
    14611449        ++$output_line_count;