Changeset 23363

Show
Ignore:
Timestamp:
01.12.2010 11:42:27 (8 years ago)
Author:
davidb
Message:

Plugin code upgrade to support Greenstone working with filenames under Windows when then go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)

Location:
main/trunk/greenstone2/perllib
Files:
8 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

    r23352 r23363  
    414414    my ($filename_full_path, $block_hash) = @_; 
    415415 
     416    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path); 
     417 
    416418    if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) { 
    417419    $block_hash->{'file_blocks'}->{$filename_full_path} = 1; 
     
    426428    my ($filename, $block_hash) = @_; 
    427429 
     430    $filename = &util::upgrade_if_dos_filename($filename); 
     431 
    428432    if ($self->{'cover_image'}) { 
    429433    my $coverfile = $filename; 
    430434    $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/; 
    431     if (!-e $coverfile) { 
     435    if (!&util::fd_exists($coverfile)) { 
    432436        $coverfile =~ s/jpg$/JPG/; 
    433437    }    
    434     if (-e $coverfile) { 
     438    if (&util::fd_exists($coverfile)) { 
    435439        $block_hash->{'file_blocks'}->{$coverfile} = 1; 
    436440    }  
     
    915919    # UTF-8 version of filename 
    916920    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    917     print STDERR "****** Setting Source Metadata given: $octet_file\n"; 
     921    print STDERR "**** Setting Source Metadata given: $octet_file\n"; 
    918922    } 
    919923     
     
    950954     
    951955    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    952     print STDERR "***** saving Source as:             $url_encoded_filename\n"; 
     956    print STDERR "****** saving Source as:             $url_encoded_filename\n"; 
    953957    } 
    954958     
     
    966970 
    967971    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    968     print STDERR "***** saving SourceFile as:         $renamed_raw_url\n"; 
     972    print STDERR "****** saving SourceFile as:         $renamed_raw_url\n"; 
    969973    } 
    970974} 
     
    10141018    # should we move this to read? What about secondary plugins? 
    10151019    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli); 
    1016     print $outhandle "$self->{'plugin_type'} processing $file\n" 
     1020    my $pp_file = &util::prettyprint_file($base_dir,$file); 
     1021    print $outhandle "$self->{'plugin_type'} processing $pp_file\n" 
    10171022        if $self->{'verbosity'} > 1; 
    10181023 
     
    12961301        my $zip_filename = $metadata->{$field}; 
    12971302        # overwrite the source_path 
    1298         $doc_obj->{'source_path'} = $zip_filename; 
     1303        $doc_obj->set_source_path($zip_filename); 
    12991304        # and set the metadata 
    13001305        $zip_filename = &util::filename_within_collection($zip_filename); 
     
    13371342    my ($doc_obj, $filename) = @_; 
    13381343 
     1344    my $upgraded_filename = &util::upgrade_if_dos_filename($filename); 
     1345 
    13391346    $filename =~ s/\.[^\\\/\.]+$/\.jpg/; 
    1340     if (exists $self->{'covers_missing_cache'}->{$filename}) { 
    1341     # don't stat() for existence eg for multiple document input files 
     1347    $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/; 
     1348 
     1349    if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) { 
     1350    # don't stat() for existence e.g. for multiple document input files 
    13421351    # (eg SplitPlug) 
    13431352    return; 
     
    13461355    my $top_section=$doc_obj->get_top_section(); 
    13471356 
    1348     if (-e $filename) { 
     1357    if (&util::fd_exists($upgraded_filename)) { 
    13491358    $doc_obj->associate_source_file($filename); 
    13501359        $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg"); 
     
    13521361    } else { 
    13531362    my $upper_filename = $filename; 
     1363    my $upgraded_upper_filename = $upgraded_filename; 
     1364 
    13541365    $upper_filename =~ s/jpg$/JPG/; 
    1355     if (-e $upper_filename) { 
     1366    $upgraded_upper_filename =~ s/jpg$/JPG/; 
     1367 
     1368    if (&util::fd_exists($upgraded_upper_filename)) { 
    13561369        $doc_obj->associate_source_file($upper_filename); 
    13571370        $doc_obj->associate_file($upper_filename, "cover.jpg", 
     
    13611374        # file doesn't exist, so record the fact that it's missing so 
    13621375        # we don't stat() again (stat is slow) 
    1363         $self->{'covers_missing_cache'}->{$filename} = 1; 
     1376        $self->{'covers_missing_cache'}->{$upgraded_filename} = 1; 
    13641377    } 
    13651378    } 
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r23352 r23363  
    394394    ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental 
    395395    # build. so set it manually. 
    396     $doc_obj->{'source_path'} = $filename_full_path; 
     396    $doc_obj->set_source_path($filename_full_path); 
    397397    $doc_obj->set_converted_filename($collect_conv_file); 
    398398 
  • main/trunk/greenstone2/perllib/plugins/DirectoryPlugin.pm

    r23335 r23363  
    257257    my ($block_hash, $filename_full_path) = @_; 
    258258 
     259    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path); 
     260###    print STDERR "*** DirectoryPlugin::file_is_blocked $filename_full_path\n"; 
     261 
    259262    if (defined $block_hash->{'file_blocks'}->{$filename_full_path}) { 
    260263    $self->{'num_blocked'} ++; 
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r23352 r23363  
    186186    # read in file ($text will be in utf8) 
    187187    my $raw_text = ""; 
    188     $self->read_file_no_decoding ($filename_full_path, \$raw_text); 
     188    $self->read_file_no_decoding($filename_full_path, \$raw_text); 
    189189 
    190190    my $textref = \$raw_text; 
     
    192192    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)'; 
    193193    $$textref =~ s/$opencom(.*?)$closecom//gs; 
     194 
     195    # Convert entities to their UTF8 equivalents 
     196    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go; 
     197    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occassion, want it left as utf8 
     198    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go; 
    194199 
    195200    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+"; 
     
    209214 
    210215    # remove quotes from link at start and end if necessary 
    211     if ($link=~/^\"/) { 
    212         $link=~s/^\"//; 
    213         $link=~s/\"$//; 
    214     } 
    215  
    216     $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html 
     216    if ($link =~ m/^\"/) { 
     217        $link =~ s/^\"//; 
     218        $link =~ s/\"$//; 
     219    } 
     220 
     221    $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html  
    217222    # some links may just be anchor names 
    218223    next unless ($link =~ /\S+/); 
     
    242247    } 
    243248 
     249     
     250    my $unicode_url_original_filename = decode("utf8",$url_original_filename); 
     251 
     252##  print STDERR "*****!!! Blocking url original filename = $unicode_url_original_filename\n"; 
     253 
     254    # Allow for possibility of raw byte version (UTF8) and Unicode versions of file 
    244255    $block_hash->{'file_blocks'}->{$url_original_filename} = 1; 
     256    $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1; 
    245257    } 
    246258} 
     
    250262# filename*, it does not URL decode any filename if a file by the name of the *URL-encoded* 
    251263# string already exists in the local folder. 
     264# 
     265# Is the following still true?? 
    252266# Return the original filename corresponding to the parameter URL-encoded filename, and 
    253267# a decoded flag that is set to true iff URL-decoding had to be applied. 
     
    312326    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});  
    313327    ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental 
    314     # build. so set it manually. 
    315     $doc_obj->{'source_path'} = $filename_full_path; 
     328    # build. So set it manually. 
     329    $doc_obj->set_source_path($filename_full_path); 
    316330    my $collect_conv_file = &util::filename_within_collection($tidy_filename); 
    317331    $doc_obj->set_converted_filename($collect_conv_file); 
     
    387401    my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname); 
    388402 
    389     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    390         print STDERR "***!! file = $file\n"; 
    391         print STDERR "***!! utf8_file = $utf8_file\n"; 
    392     } 
     403#   if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
     404#       print STDERR "***!! file = $file\n"; 
     405#       print STDERR "***!! utf8_file = $utf8_file\n"; 
     406#   } 
    393407 
    394408 
     
    764778    ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) { 
    765779 
    766  
    767780    # If web page didn't give encoding, then default to utf8 
     781    my $content_encoding= $self->{'content_encoding'} || "utf8"; 
     782 
    768783    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    769         print STDERR "*** Web page didn't give encoding, defaulting to UTF8!\n"; 
    770         print STDERR "*****  looking up $file\n"; 
    771     } 
    772  
    773     my $content_encoding= $self->{'content_encoding'} || "utf8"; 
     784        print STDERR "*** Encoding with $content_encoding href: $href\n"; 
     785    } 
     786 
    774787    $href = encode($content_encoding,$href); 
    775788 
     
    807820 
    808821    $filename = &util::filename_cat($base_dir, $filename); 
     822 
    809823    if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) { 
    810824    # we are processing a tidytmp file - want paths to be in import 
     
    827841    $filename = encode($content_encoding, $opt_decode_utf8_filename); 
    828842 
    829  
    830843    # some special processing if the intended filename was converted to utf8, but 
    831844    # the actual file still needs to be renamed 
    832     if (!-e $filename) { 
     845    if (!&util::fd_exists($filename)) { 
    833846    # try the original filename stored in map 
    834847    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    835         print STDERR "***###!! orig filename did not exist: $filename\n"; 
    836     } 
     848        print STDERR "******!! orig filename did not exist: $filename\n"; 
     849    } 
     850 
     851##  print STDERR "**** trying to look up utf8_filename: $utf8_filename\n"; 
    837852 
    838853    my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename}; 
    839854 
    840855    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    841         print STDERR "**** Trying for $original_filename\n"; 
     856        print STDERR "******   From lookup utf8_filename, now trying for: $original_filename\n"; 
    842857    } 
    843858 
    844859    if (defined $original_filename && -e $original_filename) { 
    845860        if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    846             print STDERR "*** found match\n"; 
     861            print STDERR "******   Found match!\n"; 
    847862        } 
    848863        $filename = $original_filename; 
     
    891906    $newname = &util::rename_file($newname, $self->{'file_rename_method'}); 
    892907 
     908### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n"; 
    893909    $doc_obj->associate_file($filename, $newname, undef, $section); 
    894910 
     
    12961312    $self->SUPER::read_file($filename, $encoding, $language, $textref); 
    12971313 
    1298     # Convert entities to their UTF8 equivalents 
     1314    # Convert entities to their Unicode code-point equivalents 
    12991315    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go; 
    13001316    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo; 
  • main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm

    r23352 r23363  
    563563    $self->add_dummy_text($doc_obj, $topsection); 
    564564    } 
    565  
    566  
    567565} 
    568566 
  • main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm

    r23352 r23363  
    341341    ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental 
    342342    # build. so set it manually. 
    343     $doc_obj->{'source_path'} = $filename_full_path; 
     343    $doc_obj->set_source_path($filename_full_path); 
    344344    $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $file)); 
    345345     
  • main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm

    r23352 r23363  
    122122    # should we move this to read? What about secondary plugins? 
    123123    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli); 
    124     print $outhandle "$self->{'plugin_type'} processing $file\n" 
     124    my $pp_file = &util::prettyprint_file($base_dir,$file); 
     125    print $outhandle "$self->{'plugin_type'} processing $pp_file\n" 
    125126        if $self->{'verbosity'} > 1; 
    126127 
  • main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm

    r22818 r23363  
    640640    $dir = "" unless defined $dir; 
    641641         
    642      
    643     my $real_filename = $assoc_file_rec->[0]; 
     642    my $utf8_real_filename = $assoc_file_rec->[0]; 
     643 
    644644    # for some reasons the image associate file has / before the full path 
    645     $real_filename =~ s/^\\(.*)/$1/i; 
     645    $utf8_real_filename =~ s/^\\(.*)/$1/i; 
     646 
     647    my $real_filename = &util::utf8_to_real_filename($utf8_real_filename); 
     648 
    646649    if (-e $real_filename) { 
    647650 
     
    653656                     "gsdlassocfile", 
    654657                     "$afile:$assoc_file_rec->[2]:$dir"); 
    655     } elsif ($self->{'verbosity'} > 2) { 
     658    } elsif ($self->{'verbosity'} > 1) { 
    656659        print $outhandle "BasePlugout::process couldn't copy the associated file " . 
    657660        "$real_filename to $afile\n"; 
     
    688691    # for some reasons the image associate file has / before the full path 
    689692    $real_filename =~ s/^\\(.*)/$1/i; 
    690     if (-e $real_filename) { 
     693 
     694    my $raw_filename; 
     695    if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 
     696        # Need to generate DOS version of filename to test in '-e $raw_filename' below 
     697        require Win32; 
     698        my $unicode_filename = Encode::decode("utf8",$real_filename); 
     699###     print STDERR "***### files to field: ", &unicode::debug_unicode_string($unicode_filename),"\n"; 
     700 
     701        $raw_filename = Win32::GetShortPathName($unicode_filename); 
     702    } 
     703    else { 
     704        $raw_filename = $real_filename; 
     705    } 
     706 
     707    if (-e $raw_filename) { 
    691708 
    692709#       if (defined $collect_dir) { 
     
    701718        $reverse_lookups->{$real_filename} = 1; 
    702719        } 
    703         push(@{$oid_files->{$field}},$full_file); 
     720###     push(@{$oid_files->{$field}},$full_file); 
     721        push(@{$oid_files->{$field}},$raw_filename); 
    704722    } 
    705723    else {