Ignore:
Timestamp:
2010-12-01T11:42:27+13:00 (13 years ago)
Author:
davidb
Message:

Plugin code upgrade to support Greenstone working with filenames under Windows when then go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)

Location:
main/trunk/greenstone2/perllib
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

    r23352 r23363  
    414414    my ($filename_full_path, $block_hash) = @_;
    415415
     416    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
     417
    416418    if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) {
    417419    $block_hash->{'file_blocks'}->{$filename_full_path} = 1;
     
    426428    my ($filename, $block_hash) = @_;
    427429
     430    $filename = &util::upgrade_if_dos_filename($filename);
     431
    428432    if ($self->{'cover_image'}) {
    429433    my $coverfile = $filename;
    430434    $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
    431     if (!-e $coverfile) {
     435    if (!&util::fd_exists($coverfile)) {
    432436        $coverfile =~ s/jpg$/JPG/;
    433437    }   
    434     if (-e $coverfile) {
     438    if (&util::fd_exists($coverfile)) {
    435439        $block_hash->{'file_blocks'}->{$coverfile} = 1;
    436440    }
     
    915919    # UTF-8 version of filename
    916920    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    917     print STDERR "****** Setting Source Metadata given: $octet_file\n";
     921    print STDERR "**** Setting Source Metadata given: $octet_file\n";
    918922    }
    919923   
     
    950954   
    951955    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    952     print STDERR "***** saving Source as:             $url_encoded_filename\n";
     956    print STDERR "****** saving Source as:             $url_encoded_filename\n";
    953957    }
    954958   
     
    966970
    967971    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    968     print STDERR "***** saving SourceFile as:         $renamed_raw_url\n";
     972    print STDERR "****** saving SourceFile as:         $renamed_raw_url\n";
    969973    }
    970974}
     
    10141018    # should we move this to read? What about secondary plugins?
    10151019    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    1016     print $outhandle "$self->{'plugin_type'} processing $file\n"
     1020    my $pp_file = &util::prettyprint_file($base_dir,$file);
     1021    print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
    10171022        if $self->{'verbosity'} > 1;
    10181023
     
    12961301        my $zip_filename = $metadata->{$field};
    12971302        # overwrite the source_path
    1298         $doc_obj->{'source_path'} = $zip_filename;
     1303        $doc_obj->set_source_path($zip_filename);
    12991304        # and set the metadata
    13001305        $zip_filename = &util::filename_within_collection($zip_filename);
     
    13371342    my ($doc_obj, $filename) = @_;
    13381343
     1344    my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
     1345
    13391346    $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
    1340     if (exists $self->{'covers_missing_cache'}->{$filename}) {
    1341     # don't stat() for existence eg for multiple document input files
     1347    $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/;
     1348
     1349    if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) {
     1350    # don't stat() for existence e.g. for multiple document input files
    13421351    # (eg SplitPlug)
    13431352    return;
     
    13461355    my $top_section=$doc_obj->get_top_section();
    13471356
    1348     if (-e $filename) {
     1357    if (&util::fd_exists($upgraded_filename)) {
    13491358    $doc_obj->associate_source_file($filename);
    13501359        $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
     
    13521361    } else {
    13531362    my $upper_filename = $filename;
     1363    my $upgraded_upper_filename = $upgraded_filename;
     1364
    13541365    $upper_filename =~ s/jpg$/JPG/;
    1355     if (-e $upper_filename) {
     1366    $upgraded_upper_filename =~ s/jpg$/JPG/;
     1367
     1368    if (&util::fd_exists($upgraded_upper_filename)) {
    13561369        $doc_obj->associate_source_file($upper_filename);
    13571370        $doc_obj->associate_file($upper_filename, "cover.jpg",
     
    13611374        # file doesn't exist, so record the fact that it's missing so
    13621375        # we don't stat() again (stat is slow)
    1363         $self->{'covers_missing_cache'}->{$filename} = 1;
     1376        $self->{'covers_missing_cache'}->{$upgraded_filename} = 1;
    13641377    }
    13651378    }
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r23352 r23363  
    394394    ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
    395395    # build. so set it manually.
    396     $doc_obj->{'source_path'} = $filename_full_path;
     396    $doc_obj->set_source_path($filename_full_path);
    397397    $doc_obj->set_converted_filename($collect_conv_file);
    398398
  • main/trunk/greenstone2/perllib/plugins/DirectoryPlugin.pm

    r23335 r23363  
    257257    my ($block_hash, $filename_full_path) = @_;
    258258
     259    $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path);
     260###    print STDERR "*** DirectoryPlugin::file_is_blocked $filename_full_path\n";
     261
    259262    if (defined $block_hash->{'file_blocks'}->{$filename_full_path}) {
    260263    $self->{'num_blocked'} ++;
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r23352 r23363  
    186186    # read in file ($text will be in utf8)
    187187    my $raw_text = "";
    188     $self->read_file_no_decoding ($filename_full_path, \$raw_text);
     188    $self->read_file_no_decoding($filename_full_path, \$raw_text);
    189189
    190190    my $textref = \$raw_text;
     
    192192    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
    193193    $$textref =~ s/$opencom(.*?)$closecom//gs;
     194
     195    # Convert entities to their UTF8 equivalents
     196    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
     197    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occassion, want it left as utf8
     198    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
    194199
    195200    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
     
    209214
    210215    # remove quotes from link at start and end if necessary
    211     if ($link=~/^\"/) {
    212         $link=~s/^\"//;
    213         $link=~s/\"$//;
    214     }
    215 
    216     $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
     216    if ($link =~ m/^\"/) {
     217        $link =~ s/^\"//;
     218        $link =~ s/\"$//;
     219    }
     220
     221    $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html 
    217222    # some links may just be anchor names
    218223    next unless ($link =~ /\S+/);
     
    242247    }
    243248
     249   
     250    my $unicode_url_original_filename = decode("utf8",$url_original_filename);
     251
     252##  print STDERR "*****!!! Blocking url original filename = $unicode_url_original_filename\n";
     253
     254    # Allow for possibility of raw byte version (UTF8) and Unicode versions of file
    244255    $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
     256    $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1;
    245257    }
    246258}
     
    250262# filename*, it does not URL decode any filename if a file by the name of the *URL-encoded*
    251263# string already exists in the local folder.
     264#
     265# Is the following still true??
    252266# Return the original filename corresponding to the parameter URL-encoded filename, and
    253267# a decoded flag that is set to true iff URL-decoding had to be applied.
     
    312326    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    313327    ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
    314     # build. so set it manually.
    315     $doc_obj->{'source_path'} = $filename_full_path;
     328    # build. So set it manually.
     329    $doc_obj->set_source_path($filename_full_path);
    316330    my $collect_conv_file = &util::filename_within_collection($tidy_filename);
    317331    $doc_obj->set_converted_filename($collect_conv_file);
     
    387401    my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
    388402
    389     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    390         print STDERR "***!! file = $file\n";
    391         print STDERR "***!! utf8_file = $utf8_file\n";
    392     }
     403#   if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     404#       print STDERR "***!! file = $file\n";
     405#       print STDERR "***!! utf8_file = $utf8_file\n";
     406#   }
    393407
    394408
     
    764778    ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
    765779
    766 
    767780    # If web page didn't give encoding, then default to utf8
     781    my $content_encoding= $self->{'content_encoding'} || "utf8";
     782
    768783    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    769         print STDERR "*** Web page didn't give encoding, defaulting to UTF8!\n";
    770         print STDERR "*****  looking up $file\n";
    771     }
    772 
    773     my $content_encoding= $self->{'content_encoding'} || "utf8";
     784        print STDERR "*** Encoding with $content_encoding href: $href\n";
     785    }
     786
    774787    $href = encode($content_encoding,$href);
    775788
     
    807820
    808821    $filename = &util::filename_cat($base_dir, $filename);
     822
    809823    if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
    810824    # we are processing a tidytmp file - want paths to be in import
     
    827841    $filename = encode($content_encoding, $opt_decode_utf8_filename);
    828842
    829 
    830843    # some special processing if the intended filename was converted to utf8, but
    831844    # the actual file still needs to be renamed
    832     if (!-e $filename) {
     845    if (!&util::fd_exists($filename)) {
    833846    # try the original filename stored in map
    834847    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    835         print STDERR "***###!! orig filename did not exist: $filename\n";
    836     }
     848        print STDERR "******!! orig filename did not exist: $filename\n";
     849    }
     850
     851##  print STDERR "**** trying to look up utf8_filename: $utf8_filename\n";
    837852
    838853    my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};
    839854
    840855    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    841         print STDERR "**** Trying for $original_filename\n";
     856        print STDERR "******   From lookup utf8_filename, now trying for: $original_filename\n";
    842857    }
    843858
    844859    if (defined $original_filename && -e $original_filename) {
    845860        if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    846             print STDERR "*** found match\n";
     861            print STDERR "******   Found match!\n";
    847862        }
    848863        $filename = $original_filename;
     
    891906    $newname = &util::rename_file($newname, $self->{'file_rename_method'});
    892907
     908### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
    893909    $doc_obj->associate_file($filename, $newname, undef, $section);
    894910
     
    12961312    $self->SUPER::read_file($filename, $encoding, $language, $textref);
    12971313
    1298     # Convert entities to their UTF8 equivalents
     1314    # Convert entities to their Unicode code-point equivalents
    12991315    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
    13001316    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
  • main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm

    r23352 r23363  
    563563    $self->add_dummy_text($doc_obj, $topsection);
    564564    }
    565 
    566 
    567565}
    568566
  • main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm

    r23352 r23363  
    341341    ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
    342342    # build. so set it manually.
    343     $doc_obj->{'source_path'} = $filename_full_path;
     343    $doc_obj->set_source_path($filename_full_path);
    344344    $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $file));
    345345   
  • main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm

    r23352 r23363  
    122122    # should we move this to read? What about secondary plugins?
    123123    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    124     print $outhandle "$self->{'plugin_type'} processing $file\n"
     124    my $pp_file = &util::prettyprint_file($base_dir,$file);
     125    print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
    125126        if $self->{'verbosity'} > 1;
    126127
  • main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm

    r22818 r23363  
    640640    $dir = "" unless defined $dir;
    641641       
    642    
    643     my $real_filename = $assoc_file_rec->[0];
     642    my $utf8_real_filename = $assoc_file_rec->[0];
     643
    644644    # for some reasons the image associate file has / before the full path
    645     $real_filename =~ s/^\\(.*)/$1/i;
     645    $utf8_real_filename =~ s/^\\(.*)/$1/i;
     646
     647    my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
     648
    646649    if (-e $real_filename) {
    647650
     
    653656                     "gsdlassocfile",
    654657                     "$afile:$assoc_file_rec->[2]:$dir");
    655     } elsif ($self->{'verbosity'} > 2) {
     658    } elsif ($self->{'verbosity'} > 1) {
    656659        print $outhandle "BasePlugout::process couldn't copy the associated file " .
    657660        "$real_filename to $afile\n";
     
    688691    # for some reasons the image associate file has / before the full path
    689692    $real_filename =~ s/^\\(.*)/$1/i;
    690     if (-e $real_filename) {
     693
     694    my $raw_filename;
     695    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
     696        # Need to generate DOS version of filename to test in '-e $raw_filename' below
     697        require Win32;
     698        my $unicode_filename = Encode::decode("utf8",$real_filename);
     699###     print STDERR "***### files to field: ", &unicode::debug_unicode_string($unicode_filename),"\n";
     700
     701        $raw_filename = Win32::GetShortPathName($unicode_filename);
     702    }
     703    else {
     704        $raw_filename = $real_filename;
     705    }
     706
     707    if (-e $raw_filename) {
    691708
    692709#       if (defined $collect_dir) {
     
    701718        $reverse_lookups->{$real_filename} = 1;
    702719        }
    703         push(@{$oid_files->{$field}},$full_file);
     720###     push(@{$oid_files->{$field}},$full_file);
     721        push(@{$oid_files->{$field}},$raw_filename);
    704722    }
    705723    else {
Note: See TracChangeset for help on using the changeset viewer.