Ignore:
Timestamp:
2010-11-19T13:29:29+13:00 (13 years ago)
Author:
davidb
Message:

Work done on improving handling of filenames when the actual filename encoding used is not necessarily known. Tested for Linux. Work currently includes some debug statements that will be removed once testing for Windows and Mac is done.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r22951 r23335  
    181181    my @file_blocks;
    182182   
    183     my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
     183    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
     184    $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
    184185
    185186    # read in file ($text will be in utf8)
     
    229230    # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
    230231    my $utf8_link = "";
    231     $self->decode_text($link,$encoding,$language,\$utf8_link);
     232    $self->decode_text($link,$content_encoding,$language,\$utf8_link);
    232233
    233234    $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
     
    272273       
    273274    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
     275
     276    # Lookup content_encoding worked out in file_block pass for this file
     277    # Store it under the local name 'content_encoding' so its nice and
     278    # easy to access
     279    $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
     280
    274281    # get the input file
    275282    my $input_filename = $file;
     
    311318    $self->set_Source_metadata($doc_obj, $filename_no_path);
    312319    }
     320
     321    delete $self->{'store_content_encoding'}->{$filename_full_path};
     322    $self->{'content_encoding'} = undef;
     323
    313324    return ($process_status,$doc_obj);
    314325}
     
    367378    # links, so even if 'file_is_url' is off, still need to store info
    368379
    369     my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
    370     my $utf8_file = $self->filename_to_utf8_metadata($file);
    371     $utf8_file =~ s/&\#095;/_/g;
     380    my ($tailname,$dirname) = &File::Basename::fileparse($file);
     381    print STDERR "***!! file = $file\n";
     382#    my $utf8_file = $self->filename_to_utf8_metadata($file);
     383#    $utf8_file =~ s/&\#095;/_/g;
     384    my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
     385    print STDERR "***!! utf8_file = $utf8_file\n";
     386
    372387    my $web_url = "http://";
    373388    if(defined $dirname) { # local directory
     
    535550    $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    536551    }
     552
    537553    return 1;
    538554}
     
    737753    return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
    738754
    739 
    740755    if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
    741756    ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
     757
     758
     759    # If web page didn't give encoding, then default to utf8
     760    print "*************** looking up $file\n";
     761
     762    my $content_encoding= $self->{'content_encoding'} || "utf8";
     763    $href = encode($content_encoding,$href);
     764
     765    $href = &unicode::raw_filename_to_url_encoded($href);
     766    $href = &unicode::filename_to_url($href);
     767
    742768    &ghtml::urlsafe ($href);
     769    print STDERR "***!!! href=$href\n";   
     770
    743771    return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
    744772    } else {
    745     # link is to some other type of file (eg image) so we'll
     773    # link is to some other type of file (e.g., an image) so we'll
    746774    # need to associate that file
    747775    return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
     
    769797    $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
    770798    }
    771     # Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some
    772     # situations. If the *original* file's name was in URL encoding, the following method will not decode it.
     799
     800    # Replace %XX's in URL with decoded value if required. Note that the
     801    # filename may include the %XX in some situations. If the *original*
     802    # file's name was in URL encoding, the following method will not decode
     803    # it.
    773804    my $utf8_filename = $filename;
    774     $filename = $self->opt_url_decode($utf8_filename);
     805    my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename);
     806
     807    my $content_encoding= $self->{'content_encoding'} || "utf8";
     808
     809    # The filenames that come through the HTML file have been decoded
     810    # into Unicode aware Perl strings.  Need to convert them back
     811    # to their initial raw-byte encoding to match the file that
     812    # exists on the file system
     813    $filename = encode($content_encoding, $opt_decode_utf8_filename);
     814
    775815
    776816    # some special processing if the intended filename was converted to utf8, but
     
    778818    if (!-e $filename) {
    779819    # try the original filename stored in map
    780     my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
     820    print STDERR "***###!! orig filename did not exist: $filename\n";
     821
     822    my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};
     823
     824    print STDERR "**** Trying for $original_filename\n";
     825
    781826    if (defined $original_filename && -e $original_filename) {
     827        print STDERR "*** found match\n";
    782828        $filename = $original_filename;
    783829    }
     
    11181164    $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
    11191165    $title =~ s/^\s+//s; # in case title_sub introduced any...
    1120     $doc_obj->add_utf8_metadata ($section, 'Title', $title);
     1166    print STDERR "**** adding Title: ", Encode::encode("utf8",$title), "\n";
     1167    $doc_obj->add_utf8_metadata ($section, "Title", $title);
    11211168    print $outhandle " extracted Title metadata \"$title\" from $from\n"
    11221169        if ($self->{'verbosity'} > 2);
Note: See TracChangeset for help on using the changeset viewer.