Changeset 16435


Ignore:
Timestamp:
2008-07-16T17:03:13+12:00 (14 years ago)
Author:
ak19
Message:
  1. Corrected sub make_links_to_assocdir_relative, which processes the associated directories generated for Word documents that have been converted to HTML, so that it now works on Windows: it now handles Windows-style backslashes in paths. 2. Minor cosmetic change: regular expressions that did not specify the type of regex now have an explicit m prepended to them.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/bin/script/gsConvert.pl

    r15168 r16435  
    215215
    216216    my $success = 0;
    217     if (!$output_type || ($output_type =~ /html/i)){
     217    if (!$output_type || ($output_type =~ m/html/i)){
    218218    if ($windows_scripting) {
    219219        $success = &native_doc_to_html($input_filename, $output_filestem);
     
    238238
    239239    # Attempt specialised conversion to HTML
    240     if (!$output_type || ($output_type =~ /html/i)) {
     240    if (!$output_type || ($output_type =~ m/html/i)) {
    241241
    242242    if ($windows_scripting) {
     
    266266 
    267267    # Attempt simple conversion to HTML
    268     if (!$output_type || ($output_type =~ /html/i)) {
     268    if (!$output_type || ($output_type =~ m/html/i)) {
    269269    $success = &any_to_html($input_filename, $output_filestem);
    270270    if ($success) {
     
    274274
    275275    # Convert to text
    276     if (!$output_type || ($output_type =~ /text/i)) {
     276    if (!$output_type || ($output_type =~ m/text/i)) {
    277277    $success = &any_to_text($input_filename, $output_filestem);
    278278    if ($success) {
     
    293293    $output_type =~ s/.*\-(.*)/$1/i;
    294294    # Attempt conversion to Image
    295     if ($output_type =~ /jp?g|gif|png/i) {
     295    if ($output_type =~ m/jp?g|gif|png/i) {
    296296    $success = &pdf_to_img($dirname, $input_filename, $output_filestem, $output_type);
    297297    if ($success){
     
    301301
    302302    # Attempt conversion to HTML
    303     if (!$output_type || ($output_type =~ /html/i)) {
     303    if (!$output_type || ($output_type =~ m/html/i)) {
    304304    $success = &pdf_to_html($dirname, $input_filename, $output_filestem);
    305305    if ($success) {
     
    309309
    310310    # Attempt conversion to TEXT
    311     if (!$output_type || ($output_type =~ /text/i)) {
     311    if (!$output_type || ($output_type =~ m/text/i)) {
    312312    $success = &pdf_to_text($dirname, $input_filename, $output_filestem);
    313313    if ($success) {
     
    329329    $output_type =~ s/.*\-(.*)/$1/i;
    330330    # Attempt conversion to Image
    331     if ($output_type =~ /jp?g|gif|png/i) {
     331    if ($output_type =~ m/jp?g|gif|png/i) {
    332332    $success = &ps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    333333    if ($success){
     
    337337
    338338    # Attempt conversion to TEXT
    339     if (!$output_type || ($output_type =~ /text/i)) {
     339    if (!$output_type || ($output_type =~ m/text/i)) {
    340340    $success = &ps_to_text($input_filename, $output_filestem);
    341341    if ($success) {
     
    352352
    353353    my $ppt_convert_type = "";
    354     #if (!$output_type || $windows_scripting ||($output_type !~ /html/i) ||($output_type !~ /text/i)){
    355     if ($windows_scripting && ($output_type !~ /html/i) && ($output_type !~ /text/i)){
    356     if ($output_type =~ /gif/i) {
     354    #if (!$output_type || $windows_scripting || ($output_type !~ m/html/i) || ($output_type !~ m/text/i)){
     355    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)){
     356    if ($output_type =~ m/gif/i) {
    357357        $ppt_convert_type = "-g";
    358     } elsif ($output_type =~ /jp?g/i){
     358    } elsif ($output_type =~ m/jp?g/i){
    359359        $ppt_convert_type = "-j";
    360     } elsif ($output_type =~ /png/i){
     360    } elsif ($output_type =~ m/png/i){
    361361        $ppt_convert_type = "-p";
    362362    }
    363363    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
    364364                       $ENV{'GSDLOS'}, "pptextract");
    365     $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ /^windows$/i);
     365    $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
    366366           
    367367    $cmd = "";
     
    374374        $cmd .=  "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
    375375        $cmd .= " 2>\"$output_filestem.err\""
    376         if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
     376        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    377377        if (system($cmd) !=0) {
    378378        print STDERR "Powerpoint VB Scripting convert failed\n";
     
    381381        }
    382382    }
    383     } elsif (!$output_type || ($output_type =~ /html/i)) {
     383    } elsif (!$output_type || ($output_type =~ m/html/i)) {
    384384    # Attempt conversion to HTML
    385     #if (!$output_type || ($output_type =~ /html/i)) {
     385    #if (!$output_type || ($output_type =~ m/html/i)) {
    386386    # formulate the command
    387387    $cmd = "";
     
    389389    $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
    390390    $cmd .= " 2>\"$output_filestem.err\""
    391         if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
     391        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    392392
    393393    # execute the command
     
    416416
    417417    # Attempt conversion to HTML
    418     if (!$output_type || ($output_type =~ /html/i)) {
     418    if (!$output_type || ($output_type =~ m/html/i)) {
    419419    # formulate the command
    420420    $cmd = "";
     
    422422    $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
    423423    $cmd .= " 2>\"$output_filestem.err\""
    424         if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
     424        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    425425   
    426426   
     
    463463    if ($first) {
    464464        # check to see if this is an rtf file
    465         if ($line =~ /^\{\\rtf/) {
     465        if ($line =~ m/^\{\\rtf/) {
    466466        close(CHK);
    467467        return "rtf";
     
    471471   
    472472    # is this a Word 6/7/8 document?
    473     if ($line =~ /Word\.Document\.([678])/) {
     473    if ($line =~ m/Word\.Document\.([678])/) {
    474474        close(CHK);
    475475        return "word$1";
     
    500500    # with quoting when GSDLHOME might contain spaces) but assume
    501501    # that the PATH is set up correctly
    502     $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);
     502    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
    503503
    504504    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
     
    573573    # redirecting STDERR is a bad idea on windows 95/98
    574574    $cmd .= " 2> \"$output_filestem.err\""
    575     if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
     575    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    576576    # execute the command
    577577    $!=0;
     
    588588        my $line;
    589589        while ($line=<ERRFILE>) {
    590         if ($line =~ /\w/) {
     590        if ($line =~ m/\w/) {
    591591            print STDERR "$line";
    592592            print FAILLOG "$line" if ($write_to_fail_log);
     
    609609    $line = <TMP>;
    610610    close(TMP);
    611     if ($line && $line =~ /DOCTYPE HTML/) {
     611    if ($line && $line =~ m/DOCTYPE HTML/) {
    612612        &util::rm("$output_filestem.err") if -e "$output_filestem.err";   
    613613
    614614        # Inserted this code to remove the images directory if it was still empty after
    615615        # the html was generated (in case there were no images in the word document)
    616         if(&is_dir_empty($assoc_dir)) {
     616        if (&util::is_dir_empty($assoc_dir)) {
    617617        #print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n";
    618618        &util::rm_r($assoc_dir);
    619619        } else { # there was an image folder (it was generated)
    620620        # Therefore, the html file generated contains absolute links to the images
    621         # If the folder contains images
    622         # Replace them with relative links instead, so it can be moved elsewhere
     621        # Replace them with relative links instead, so the folder can be moved elsewhere
    623622        &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");   
    624623        }
     
    640639   
    641640    return 0;
    642 }
    643 
    644 
    645 # A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
    646 # Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
    647 sub is_dir_empty
    648 {
    649     my ($path) = @_;
    650     opendir DIR, $path;
    651     while(my $entry = readdir DIR) {
    652         next if($entry =~ /^\.\.?$/);
    653         closedir DIR;
    654         return 0;
    655     }
    656     closedir DIR;
    657     return 1;
    658641}
    659642
     
    699682    # we can't just replace $assoc_dir_path with $assoc_dir
    700683    # $assoc_dir_path represents a regular expression that needs to be replaced
    701     # if it contains ., -, [ or ] -- which all have special meaning in Perl regular expressions --
    702     # we need to escape these first
     684    # if it contains ., -, [, ], or Windows style backslashes in paths  -- which all have special
     685    # meaning in Perl regular expressions -- we need to escape these first
    703686    my $safe_reg_expression = $assoc_dir_path;
     687    $safe_reg_expression =~ s/\\/\\\\/g;
    704688    $safe_reg_expression =~ s/\./\\./g;
    705689    $safe_reg_expression =~ s/\-/\\-/g;
     
    722706               #$html_contents =~ s/$safe_reg_expression/$assoc_dirname/gs; # this works, used as fall-back
    723707    # now replace any %20 chars in filenames of href or src attributes to use literal space ' '. Calls a function for this
    724     $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&percent_twenty_to_space($1, $5, $6)/sge;
    725    
    726     #print STDERR "assoc_dirname: ****$assoc_dirname***\n";
    727     #print STDERR "safe_reg_expression: ****$safe_reg_expression***\n";
     708    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&post_process_assocfile_urls($1, $5, $6)/sge;
     709
     710    print STDERR "****assoc_dirname: $assoc_dirname***\n";
     711    print STDERR "****safe_reg_expression: $safe_reg_expression***\n";
    728712   
    729713    # delete the original file and recreate it
     
    736720    return 0;
    737721    }
     722
    738723    # write out the updated contents and close the file
    739724    print FOUT $html_contents;
     
    742727}
    743728
    744 # Utility routine to convert all %20 introduced by wvWare in link pathnames into space again
    745 sub percent_twenty_to_space
     729# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
     730# url slashes (/) instead of windows-style backwards slashes, and to convert all %20
     731# introduced in link pathnames by wvWare into space again
     732sub post_process_assocfile_urls
    746733{
    747734    my ($pre, $text, $post) = @_;
    748735
    749736    $text =~ s/%20/ /g;
     737    $text =~ s/\\/\//g;
    750738
    751739    return "$pre$text$post";
     
    759747                       $ENV{'GSDLOS'}, "word2html");
    760748
    761     $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ /^windows$/i);
     749    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
    762750    if (-e "$output_filestem.html") {
    763751    print STDERR "*** The conversion file has existed\n";
     
    773761    # redirecting STDERR
    774762    $cmd .= " 2> \"$output_filestem.err\""
    775     if ($ENV {'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
     763    if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    776764   
    777765    # execute the command
     
    789777        my $line;
    790778        while ($line=<ERRFILE>) {
    791         if ($line =~ /\w/) {
     779        if ($line =~ m/\w/) {
    792780            print STDERR "$line";
    793781            print FAILLOG "$line" if ($write_to_fail_log);
     
    809797    $line = <TMP>;
    810798    close(TMP);
    811     if ($line && $line =~ /html/) {
     799    if ($line && $line =~ m/html/) {
    812800        &util::rm("$output_filestem.err") if -e "$output_filestem.err";
    813801        return 1;
     
    843831
    844832    $cmd .= " 2>\"$output_filestem.err\""
    845         if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
     833        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    846834
    847835
     
    865853
    866854        if ($past_header == 0) {
    867         if ($line =~ /<body>/) {$past_header=1;}
     855        if ($line =~ m/<body>/) {$past_header=1;}
    868856        next;
    869857        }
    870858
    871859        $line =~ s/<[^>]+>//g;
    872         if ($line =~ /\w/ && $past_header) {  # we found some content...
     860        if ($line =~ m/\w/ && $past_header) {  # we found some content...
    873861        $was_successful=1;
    874862        last;
     
    898886
    899887        # print out header info from src html.
    900         while (defined($_ = <HTMLSRC>) && $_ =~ /\w/) {
     888        while (defined($_ = <HTMLSRC>) && $_ =~ m/\w/) {
    901889        print HTML "$_";
    902890        }
     
    962950    $cmd .= " \"$input_filename\" \"$output_filestem\"";
    963951   
    964     if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
     952    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
    965953    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    966954    } else {
     
    10291017    $output_type =~ s/.*\_(.*)/$1/i;
    10301018     $cmd .= "perl -S pdftoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    1031     if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
     1019    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
    10321020    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    10331021    } else {
     
    10861074    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
    10871075
    1088     if ($ENV{'GSDLOS'} !~ /^windows$/i) {
     1076    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
    10891077    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    10901078    } else {
     
    11051093    my $seen_text=0;
    11061094    while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
    1107         if ($line=~ /\w/) {$seen_text=1;}
     1095        if ($line=~ m/\w/) {$seen_text=1;}
    11081096    }
    11091097    close EXTR_TEXT;
     
    11571145    # if we're on windows we'll fall straight through without attempting
    11581146    # to use gs
    1159     if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     1147    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
    11601148    $error = "Windows does not support gs";
    11611149
     
    11811169        # possible for the actual text to start with this, but....
    11821170        open PSOUT, "$output_filestem.text";
    1183         if (<PSOUT> =~ /^Error: (.*)/) {
     1171        if (<PSOUT> =~ m/^Error: (.*)/) {
    11841172        $error="interpreter error - \"$1\"";
    11851173        }
     
    12231211
    12241212    # Make sure this is a ps file...
    1225     if ($text !~ /^%!/) {
     1213    if ($text !~ m/^%!/) {
    12261214        print STDERR "Bad postscript header: not '%!'\n";
    12271215        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
     
    13341322    $output_type =~ s/.*\_(.*)/$1/i;
    13351323    $cmd .= "perl -S pstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    1336     if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
     1324    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
    13371325    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    13381326    } else {
     
    14061394    $line =~ s/</&lt;/g;
    14071395    $line =~ s/>/&gt;/g;
    1408     if ($line =~ /^\s*$/) {
     1396    if ($line =~ m/^\s*$/) {
    14091397        print HTML "<p>";
    14101398    } else {
     
    14471435    # delete any string less than 10 characters long
    14481436    $line =~ s/^.{0,9}$/\n/mg;
    1449     while ($line =~ /^.{1,9}$/m) {
     1437    while ($line =~ m/^.{1,9}$/m) {
    14501438        $line =~ s/^.{0,9}$/\n/mg;
    14511439        $line =~ s/\n+/\n/sg;
     
    14571445
    14581446    # output whatever is left
    1459     if ($line =~ /[^\n ]/) {
     1447    if ($line =~ m/[^\n ]/) {
    14601448        print OUT $line;
    14611449        ++$output_line_count;
Note: See TracChangeset for help on using the changeset viewer.