Ignore:
Timestamp:
2010-11-19T13:29:29+13:00 (13 years ago)
Author:
davidb
Message:

Work done on improving handling of filenames when the actual filename encoding used is not necessarily known. Tested for Linux. Work currently includes some debug statements that will be removed once testing for Windows and Mac is done.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/DirectoryPlugin.pm

    r23277 r23335  
    312312   
    313313    for (my $i = 0; $i < scalar(@dir); $i++) {
    314     my $subfile = $dir[$i];
     314    my $raw_subfile = $dir[$i];
     315    next if ($raw_subfile =~ m/^\.\.?$/);
     316
    315317    my $this_file_base_dir = $base_dir;
    316     next if ($subfile =~ m/^\.\.?$/);
    317 
    318     # Recursively read each $subfile
    319     print $outhandle "DirectoryPlugin block recurring: $subfile\n" if ($verbosity > 2);
     318    my $raw_file_subfile = &util::filename_cat($file, $raw_subfile);
     319
     320    # Recursively read each $raw_subfile
     321    print $outhandle "DirectoryPlugin block recurring: $raw_file_subfile\n" if ($verbosity > 2);
    320322   
    321323    #$count += &plugin::file_block_read ($pluginfo, $this_file_base_dir,
     324
    322325    &plugin::file_block_read ($pluginfo, $this_file_base_dir,
    323                   &util::filename_cat($file, $subfile),
     326                  $raw_file_subfile,
    324327                  $block_hash, $metadata, $gli);
    325328   
     
    390393    }
    391394    @dir = readdir (DIR);
     395    map { $_ = &unicode::raw_filename_to_url_encoded($_) } @dir;
    392396    closedir (DIR);
    393397
     
    436440    for (my $i = 0; $i < scalar(@dir); $i++) {
    437441    my $subfile = $dir[$i];
     442    next if ($subfile =~ m/^\.\.?$/);
     443
    438444    my $this_file_base_dir = $base_dir;
    439     next if ($subfile =~ m/^\.\.?$/);
    440     my $file_subfile = &util::filename_cat($file, $subfile);
    441     my $full_filename = &util::filename_cat($this_file_base_dir, $file_subfile);
    442     if ($self->file_is_blocked($block_hash,$full_filename)) {
    443         print STDERR "DirectoryPlugin: file $full_filename was blocked for metadata_read\n" if ($verbosity > 2);
     445    my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile);
     446
     447    my $raw_file_subfile = &util::filename_cat($file, $raw_subfile);
     448    my $raw_full_filename = &util::filename_cat($this_file_base_dir, $raw_file_subfile);
     449
     450    if ($self->file_is_blocked($block_hash,$raw_full_filename)) {
     451        print STDERR "DirectoryPlugin: file $raw_full_filename was blocked for metadata_read\n" if ($verbosity > 2);
    444452        next;
    445453    }
    446454   
    447     # Recursively read each $subfile
    448     print $outhandle "DirectoryPlugin metadata recurring: $subfile\n" if ($verbosity > 2);
     455    # Recursively read each $raw_subfile
     456    print $outhandle "DirectoryPlugin metadata recurring: $raw_subfile\n" if ($verbosity > 2);
    449457   
    450458    &plugin::metadata_read ($pluginfo, $this_file_base_dir,
    451                 $file_subfile,$block_hash,
     459                $raw_file_subfile,$block_hash,
    452460                \@extrametakeys, \%extrametadata,
    453461                \%extrametafile,
     
    497505        last if (!opendir (DIR, $dirname));
    498506        my @dirnow = readdir (DIR);
     507        map { $_ = &unicode::raw_filename_to_url_encoded($_) } @dirnow;
    499508        closedir (DIR);
    500509
     
    518527
    519528    my $subfile = $dir[$i];
    520     my $this_file_base_dir = $base_dir;
    521529    last if ($maxdocs != -1 && ($count + $total_count) >= $maxdocs);
    522530    next if ($subfile =~ /^\.\.?$/);
    523531
    524     my $file_subfile = &util::filename_cat($file, $subfile);
    525     my $full_filename
    526         = &util::filename_cat($this_file_base_dir,$file_subfile);
    527 
    528     if ($self->file_is_blocked($block_hash,$full_filename)) {
    529         print STDERR "DirectoryPlugin: file $full_filename was blocked for read\n" if ($verbosity > 2);
     532    my $this_file_base_dir = $base_dir;
     533    my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile);
     534
     535    my $raw_file_subfile = &util::filename_cat($file, $raw_subfile);
     536    my $raw_full_filename
     537        = &util::filename_cat($this_file_base_dir,$raw_file_subfile);
     538
     539    if ($self->file_is_blocked($block_hash,$raw_full_filename)) {
     540        print STDERR "DirectoryPlugin: file $raw_full_filename was blocked for read\n" if ($verbosity > 2);
    530541        next;
    531542    }
    532     #print STDERR "processing $full_filename\n";
     543    #print STDERR "processing $raw_full_filename\n";
    533544    # Follow Windows shortcuts
    534     if ($subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {
     545    if ($raw_subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {
    535546        require Win32::Shortcut;
    536         my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $subfile));
     547        my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $raw_subfile));
    537548        if ($shortcut) {
    538549        # The file to be processed is now the target of the shortcut
    539550        $this_file_base_dir = "";
    540551        $file = "";
    541         $subfile = $shortcut->Path;
     552        $raw_subfile = $shortcut->Path;
    542553        }
    543554    }
    544555
    545556    # check for a symlink pointing back to a leading directory
    546     if (-d "$dirname/$subfile" && -l "$dirname/$subfile") {
     557    if (-d "$dirname/$raw_subfile" && -l "$dirname/$raw_subfile") {
    547558        # readlink gives a "fatal error" on systems that don't implement
    548559        # symlinks. This assumes the the -l test above would fail on those.
    549         my $linkdest=readlink "$dirname/$subfile";
     560        my $linkdest=readlink "$dirname/$raw_subfile";
    550561        if (!defined ($linkdest)) {
    551562        # system error - file not found?
     
    555566        if ($linkdest =~ m@^[\./\\]+$@ ||
    556567            index($dirname, $linkdest) != -1) {
    557             warn "DirectoryPlugin: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n";
     568            warn "DirectoryPlugin: Ignoring recursive symlink ($dirname/$raw_subfile -> $linkdest)\n";
    558569            next;
    559570            ;
     
    562573    }
    563574
    564     print $outhandle "DirectoryPlugin: preparing metadata for $subfile\n" if ($verbosity > 2);
    565 
    566     # Make a copy of $in_metadata to pass to $subfile
     575    print $outhandle "DirectoryPlugin: preparing metadata for $raw_subfile\n" if ($verbosity > 2);
     576
     577    # Make a copy of $in_metadata to pass to $raw_subfile
    567578    my $out_metadata = {};
    568579    &metadatautil::combine_metadata_structures($out_metadata, $in_metadata);
    569580
    570581    # check the assocfile_info
    571     if (defined $self->{'assocfile_info'}->{$full_filename}) {
    572         &metadatautil::combine_metadata_structures($out_metadata, $self->{'assocfile_info'}->{$full_filename});
    573     }
    574         ## encode the filename as perl5 doesn't handle unicode filenames   
    575        
    576         my $tmpfile = Encode::encode_utf8($subfile);
     582    if (defined $self->{'assocfile_info'}->{$raw_full_filename}) {
     583        &metadatautil::combine_metadata_structures($out_metadata, $self->{'assocfile_info'}->{$raw_full_filename});
     584    }
     585
     586    # $subfile by this point is url-encoded => all ASCII chars => no need to encode as UTF8
     587
    577588    # Next add metadata read in XML files (if it is supplied)
    578589    if ($additionalmetadata == 1) {
    579590        foreach my $filespec (@extrametakeys) {
    580         ## use the utf8 encoded filename to do the filename comparison
    581         if ($tmpfile =~ /^$filespec$/) {
     591        ## use the url-encoded filename to do the filename comparison
     592
     593        if ($subfile =~ /^$filespec$/) {
    582594            print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
    583595            if ($verbosity > 2);
     
    605617        # Look to see if it's a completely new file
    606618
    607         if (!$block_hash->{'new_files'}->{$full_filename}) {
     619        if (!$block_hash->{'new_files'}->{$raw_full_filename}) {
    608620        # Not a new file, must be an existing file
    609621        # Let' see if it's newer than the last import.pl
    610622
    611623
    612         if (! -d $full_filename) {
    613             if (!$block_hash->{'reindex_files'}->{$full_filename}) {
     624        if (! -d $raw_full_filename) {
     625            if (!$block_hash->{'reindex_files'}->{$raw_full_filename}) {
    614626            # filename has been around for longer than inf_timestamp
    615627            print $outhandle "**** Skipping $subfile\n" if ($verbosity >3);
     
    634646   
    635647    $count += &plugin::read ($pluginfo, $this_file_base_dir,
    636                  $file_subfile, $block_hash,
     648                 $raw_file_subfile, $block_hash,
    637649                 $out_metadata, $processor, $maxdocs, ($total_count + $count), $gli);
    638650    }
Note: See TracChangeset for help on using the changeset viewer.