Changeset 23335


Ignore:
Timestamp:
2010-11-19T13:29:29+13:00 (13 years ago)
Author:
davidb
Message:

Work done on improving handling of filenames when the actual filename encoding used is not necessarily known. Tested for Linux. Work currently includes some debug statements that will be removed once testing for Windows and Mac is done.

Location:
main/trunk/greenstone2/perllib/plugins
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

    r23279 r23335  
    3131
    3232use File::Basename;
     33use Encode;
    3334
    3435use encodings;
     
    785786}
    786787
    787 # is there ever only one Source? Sometimes this will be called twice, for images etc that are converted.
     788
     789
     790sub deduce_filename_encoding
     791{
     792    my $self = shift (@_); 
     793    my ($file,$metadata) = @_;
     794
     795    my $gs_filename_encoding = $metadata->{"gs.filename_encoding"};
     796    my $deduced_filename_encoding = undef;
     797   
     798    # Start by looking for manually assigned metadata
     799    if (defined $gs_filename_encoding) {
     800    if (ref ($gs_filename_encoding) eq "ARRAY") {
     801        my $outhandle = $self->{'outhandle'};
     802       
     803        $deduced_filename_encoding = $gs_filename_encoding->[0];
     804       
     805        my $num_vals = scalar(@$gs_filename_encoding);
     806        if ($num_vals>1) {
     807        print $outhandle "Warning: gs.filename_encoding multiply defined for $file\n";
     808        print $outhandle "         Selecting first value: $deduced_filename_encoding\n";
     809        }
     810    }
     811    else {
     812        $deduced_filename_encoding = $gs_filename_encoding;
     813    }
     814    }
     815   
     816#   binmode(STDERR,":utf8");
     817   
     818#   print STDERR "**** file = $file\n";
     819#   print STDERR "**** debug file = ", &unicode::debug_unicode_string($file),"\n";;
     820   
     821#   print STDERR "******* dfe = $deduced_filename_encoding\n";
     822   
     823    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
     824    # Look to see if plugin specifies this value
     825    }
     826   
     827    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
     828    # See if we can determine the file system encoding through locale
     829    # Unix only ?
     830   
     831    # if locale shows us filesystem is utf8, check to see filename is consistent
     832    # => if not, then we have an "alien" filename on our hands
     833    }
     834   
     835   
     836    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
     837    # Last chance, apply textcat to deduce filename encoding
     838    }
     839   
     840    return $deduced_filename_encoding;
     841}
     842
     843
     844
     845
     846# Notionally written to be called once for each document, it is however safe to
     847# call multiple times (as in the case of ImagePlugin) which calls this later on
     848# after the original image has potentially been converted to a *new* source image
     849# format (e.g. TIFF to PNG)
     850
    788851sub set_Source_metadata {
    789852    my $self = shift (@_); 
    790     my ($doc_obj, $filename_no_path, $file_encoding) = @_;
     853    my ($doc_obj, $raw_file, $filename_encoding) = @_;
     854
     855    # 1. Sets the filename (Source) for display encoded as Unicode if possible,
     856    #    and (as a fallback) using %xx if not for non-ascii chars
     857    # 2. Sets the url ref (SourceFile) to the URL encoded version
     858    #    of filename for generated files
    791859
    792860    my $top_section = $doc_obj->get_top_section();
    793861   
    794862    # UTF-8 version of filename
    795     my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
     863    print STDERR "**** setting Source Metadata given: $raw_file\n";
     864
     865##    my $filemeta = $self->filename_to_utf8_metadata($raw_file, $filename_encoding);
     866
     867    my $url_encoded_filename;
     868    if (defined $filename_encoding) {
     869    # => Generate a pretty print version of filename that is mapped to Unicode
     870
     871    # Use filename_encoding to map raw filename to a Perl unicode-aware string
     872    $url_encoded_filename = decode($filename_encoding,$raw_file);
     873
     874    print STDERR "@@@@ pretty print using $filename_encoding: ", encode("utf8",$url_encoded_filename),"\n";
     875    }
     876    else {
     877    # otherwise generate %xx encoded version of filename for char > 127
     878    $url_encoded_filename = &unicode::raw_filename_to_url_encoded($raw_file);
     879    }
     880
     881    print STDERR "***** saving Source as:             $url_encoded_filename\n";
     882
    796883   
    797884    # Source is the UTF8 display name - not necessarily the name of the file on the system
    798     $doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta);
    799 
    800     $filemeta = &util::rename_file($filemeta, $self->{'file_rename_method'});
     885    $doc_obj->set_utf8_metadata_element($top_section, "Source", $url_encoded_filename);
     886
     887    my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
    801888    # If using URL encoding, then SourceFile is the url-reference to url-encoded
    802     # filemeta: it's a url that refers to the actual file on the system
    803     $filemeta = &unicode::filename_to_url($filemeta);
    804 
    805     $doc_obj->set_utf8_metadata_element($top_section, "SourceFile", $filemeta);
     889    # renamed_raw_url: it's a url that refers to the actual file on the system
     890    my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
     891
     892    $doc_obj->set_utf8_metadata_element($top_section, "SourceFile",
     893                    $renamed_raw_url);
     894
     895    print STDERR "***** saving SourceFile as:         $renamed_raw_url\n";
    806896}
    807897   
     
    863953 
    864954
    865     # sets the UTF8 filename (Source) for display and sets the url ref to URL encoded version
    866     # of the UTF8 filename (SourceFile) for generated files
    867     $self->set_Source_metadata($doc_obj, $filename_no_path);
    868 
     955    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata);
     956    $self->set_Source_metadata($doc_obj,$filename_no_path,$filename_encoding);
    869957
    870958    # plugin specific stuff - what args do we need here??
     
    10321120    my $file_derived_title = $file;
    10331121    $file_derived_title =~ s/_/ /g;
    1034     $file_derived_title =~ s/\.[^.]$//;
     1122    $file_derived_title =~ s/\.[^.]+$//;
    10351123
    10361124    return $file_derived_title;
     
    10441132
    10451133    if (!defined $doc_obj->get_metadata_element ($section, "Title")
    1046     or $doc_obj->get_metadata_element($section, "Title") eq "") {
    1047 
    1048     my $file_derived_title = $self->filename_to_utf8_metadata($self->filename_based_title($file));
     1134    || $doc_obj->get_metadata_element($section, "Title") eq "") {
     1135
     1136    my $source_file = $doc_obj->get_metadata_element($section, "Source");
     1137    my $file_derived_title;
     1138    if (defined $source_file) {
     1139        $file_derived_title =  $self->filename_based_title($source_file);
     1140    }
     1141    else {
     1142        # pp = pretty print
     1143        my $pp_file = (defined $source_file) ? $source_file : $file;
     1144
     1145        my $raw_title = $self->filename_based_title($file);
     1146        my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
     1147    }
     1148
     1149
    10491150    if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
    10501151        $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
  • main/trunk/greenstone2/perllib/plugins/DirectoryPlugin.pm

    r23277 r23335  
    312312   
    313313    for (my $i = 0; $i < scalar(@dir); $i++) {
    314     my $subfile = $dir[$i];
     314    my $raw_subfile = $dir[$i];
     315    next if ($raw_subfile =~ m/^\.\.?$/);
     316
    315317    my $this_file_base_dir = $base_dir;
    316     next if ($subfile =~ m/^\.\.?$/);
    317 
    318     # Recursively read each $subfile
    319     print $outhandle "DirectoryPlugin block recurring: $subfile\n" if ($verbosity > 2);
     318    my $raw_file_subfile = &util::filename_cat($file, $raw_subfile);
     319
     320    # Recursively read each $raw_subfile
     321    print $outhandle "DirectoryPlugin block recurring: $raw_file_subfile\n" if ($verbosity > 2);
    320322   
    321323    #$count += &plugin::file_block_read ($pluginfo, $this_file_base_dir,
     324
    322325    &plugin::file_block_read ($pluginfo, $this_file_base_dir,
    323                   &util::filename_cat($file, $subfile),
     326                  $raw_file_subfile,
    324327                  $block_hash, $metadata, $gli);
    325328   
     
    390393    }
    391394    @dir = readdir (DIR);
     395    map { $_ = &unicode::raw_filename_to_url_encoded($_) } @dir;
    392396    closedir (DIR);
    393397
     
    436440    for (my $i = 0; $i < scalar(@dir); $i++) {
    437441    my $subfile = $dir[$i];
     442    next if ($subfile =~ m/^\.\.?$/);
     443
    438444    my $this_file_base_dir = $base_dir;
    439     next if ($subfile =~ m/^\.\.?$/);
    440     my $file_subfile = &util::filename_cat($file, $subfile);
    441     my $full_filename = &util::filename_cat($this_file_base_dir, $file_subfile);
    442     if ($self->file_is_blocked($block_hash,$full_filename)) {
    443         print STDERR "DirectoryPlugin: file $full_filename was blocked for metadata_read\n" if ($verbosity > 2);
     445    my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile);
     446
     447    my $raw_file_subfile = &util::filename_cat($file, $raw_subfile);
     448    my $raw_full_filename = &util::filename_cat($this_file_base_dir, $raw_file_subfile);
     449
     450    if ($self->file_is_blocked($block_hash,$raw_full_filename)) {
     451        print STDERR "DirectoryPlugin: file $raw_full_filename was blocked for metadata_read\n" if ($verbosity > 2);
    444452        next;
    445453    }
    446454   
    447     # Recursively read each $subfile
    448     print $outhandle "DirectoryPlugin metadata recurring: $subfile\n" if ($verbosity > 2);
     455    # Recursively read each $raw_subfile
     456    print $outhandle "DirectoryPlugin metadata recurring: $raw_subfile\n" if ($verbosity > 2);
    449457   
    450458    &plugin::metadata_read ($pluginfo, $this_file_base_dir,
    451                 $file_subfile,$block_hash,
     459                $raw_file_subfile,$block_hash,
    452460                \@extrametakeys, \%extrametadata,
    453461                \%extrametafile,
     
    497505        last if (!opendir (DIR, $dirname));
    498506        my @dirnow = readdir (DIR);
     507        map { $_ = &unicode::raw_filename_to_url_encoded($_) } @dirnow;
    499508        closedir (DIR);
    500509
     
    518527
    519528    my $subfile = $dir[$i];
    520     my $this_file_base_dir = $base_dir;
    521529    last if ($maxdocs != -1 && ($count + $total_count) >= $maxdocs);
    522530    next if ($subfile =~ /^\.\.?$/);
    523531
    524     my $file_subfile = &util::filename_cat($file, $subfile);
    525     my $full_filename
    526         = &util::filename_cat($this_file_base_dir,$file_subfile);
    527 
    528     if ($self->file_is_blocked($block_hash,$full_filename)) {
    529         print STDERR "DirectoryPlugin: file $full_filename was blocked for read\n" if ($verbosity > 2);
     532    my $this_file_base_dir = $base_dir;
     533    my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile);
     534
     535    my $raw_file_subfile = &util::filename_cat($file, $raw_subfile);
     536    my $raw_full_filename
     537        = &util::filename_cat($this_file_base_dir,$raw_file_subfile);
     538
     539    if ($self->file_is_blocked($block_hash,$raw_full_filename)) {
     540        print STDERR "DirectoryPlugin: file $raw_full_filename was blocked for read\n" if ($verbosity > 2);
    530541        next;
    531542    }
    532     #print STDERR "processing $full_filename\n";
     543    #print STDERR "processing $raw_full_filename\n";
    533544    # Follow Windows shortcuts
    534     if ($subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {
     545    if ($raw_subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) {
    535546        require Win32::Shortcut;
    536         my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $subfile));
     547        my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $raw_subfile));
    537548        if ($shortcut) {
    538549        # The file to be processed is now the target of the shortcut
    539550        $this_file_base_dir = "";
    540551        $file = "";
    541         $subfile = $shortcut->Path;
     552        $raw_subfile = $shortcut->Path;
    542553        }
    543554    }
    544555
    545556    # check for a symlink pointing back to a leading directory
    546     if (-d "$dirname/$subfile" && -l "$dirname/$subfile") {
     557    if (-d "$dirname/$raw_subfile" && -l "$dirname/$raw_subfile") {
    547558        # readlink gives a "fatal error" on systems that don't implement
    548559        # symlinks. This assumes the the -l test above would fail on those.
    549         my $linkdest=readlink "$dirname/$subfile";
     560        my $linkdest=readlink "$dirname/$raw_subfile";
    550561        if (!defined ($linkdest)) {
    551562        # system error - file not found?
     
    555566        if ($linkdest =~ m@^[\./\\]+$@ ||
    556567            index($dirname, $linkdest) != -1) {
    557             warn "DirectoryPlugin: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n";
     568            warn "DirectoryPlugin: Ignoring recursive symlink ($dirname/$raw_subfile -> $linkdest)\n";
    558569            next;
    559570            ;
     
    562573    }
    563574
    564     print $outhandle "DirectoryPlugin: preparing metadata for $subfile\n" if ($verbosity > 2);
    565 
    566     # Make a copy of $in_metadata to pass to $subfile
     575    print $outhandle "DirectoryPlugin: preparing metadata for $raw_subfile\n" if ($verbosity > 2);
     576
     577    # Make a copy of $in_metadata to pass to $raw_subfile
    567578    my $out_metadata = {};
    568579    &metadatautil::combine_metadata_structures($out_metadata, $in_metadata);
    569580
    570581    # check the assocfile_info
    571     if (defined $self->{'assocfile_info'}->{$full_filename}) {
    572         &metadatautil::combine_metadata_structures($out_metadata, $self->{'assocfile_info'}->{$full_filename});
    573     }
    574         ## encode the filename as perl5 doesn't handle unicode filenames   
    575        
    576         my $tmpfile = Encode::encode_utf8($subfile);
     582    if (defined $self->{'assocfile_info'}->{$raw_full_filename}) {
     583        &metadatautil::combine_metadata_structures($out_metadata, $self->{'assocfile_info'}->{$raw_full_filename});
     584    }
     585
     586    # $subfile by this point is url-encoded => all ASCII chars => no need to encode as UTF8
     587
    577588    # Next add metadata read in XML files (if it is supplied)
    578589    if ($additionalmetadata == 1) {
    579590        foreach my $filespec (@extrametakeys) {
    580         ## use the utf8 encoded filename to do the filename comparison
    581         if ($tmpfile =~ /^$filespec$/) {
     591        ## use the url-encoded filename to do the filename comparison
     592
     593        if ($subfile =~ /^$filespec$/) {
    582594            print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"
    583595            if ($verbosity > 2);
     
    605617        # Look to see if it's a completely new file
    606618
    607         if (!$block_hash->{'new_files'}->{$full_filename}) {
     619        if (!$block_hash->{'new_files'}->{$raw_full_filename}) {
    608620        # Not a new file, must be an existing file
    609621        # Let' see if it's newer than the last import.pl
    610622
    611623
    612         if (! -d $full_filename) {
    613             if (!$block_hash->{'reindex_files'}->{$full_filename}) {
     624        if (! -d $raw_full_filename) {
     625            if (!$block_hash->{'reindex_files'}->{$raw_full_filename}) {
    614626            # filename has been around for longer than inf_timestamp
    615627            print $outhandle "**** Skipping $subfile\n" if ($verbosity >3);
     
    634646   
    635647    $count += &plugin::read ($pluginfo, $this_file_base_dir,
    636                  $file_subfile, $block_hash,
     648                 $raw_file_subfile, $block_hash,
    637649                 $out_metadata, $processor, $maxdocs, ($total_count + $count), $gli);
    638650    }
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r22951 r23335  
    181181    my @file_blocks;
    182182   
    183     my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
     183    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
     184    $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
    184185
    185186    # read in file ($text will be in utf8)
     
    229230    # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
    230231    my $utf8_link = "";
    231     $self->decode_text($link,$encoding,$language,\$utf8_link);
     232    $self->decode_text($link,$content_encoding,$language,\$utf8_link);
    232233
    233234    $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
     
    272273       
    273274    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
     275
     276    # Lookup content_encoding worked out in file_block pass for this file
     277    # Store it under the local name 'content_encoding' so its nice and
     278    # easy to access
     279    $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
     280
    274281    # get the input file
    275282    my $input_filename = $file;
     
    311318    $self->set_Source_metadata($doc_obj, $filename_no_path);
    312319    }
     320
     321    delete $self->{'store_content_encoding'}->{$filename_full_path};
     322    $self->{'content_encoding'} = undef;
     323
    313324    return ($process_status,$doc_obj);
    314325}
     
    367378    # links, so even if 'file_is_url' is off, still need to store info
    368379
    369     my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
    370     my $utf8_file = $self->filename_to_utf8_metadata($file);
    371     $utf8_file =~ s/&\#095;/_/g;
     380    my ($tailname,$dirname) = &File::Basename::fileparse($file);
     381    print STDERR "***!! file = $file\n";
     382#    my $utf8_file = $self->filename_to_utf8_metadata($file);
     383#    $utf8_file =~ s/&\#095;/_/g;
     384    my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
     385    print STDERR "***!! utf8_file = $utf8_file\n";
     386
    372387    my $web_url = "http://";
    373388    if(defined $dirname) { # local directory
     
    535550    $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
    536551    }
     552
    537553    return 1;
    538554}
     
    737753    return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
    738754
    739 
    740755    if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
    741756    ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
     757
     758
     759    # If web page didn't give encoding, then default to utf8
     760    print "*************** looking up $file\n";
     761
     762    my $content_encoding= $self->{'content_encoding'} || "utf8";
     763    $href = encode($content_encoding,$href);
     764
     765    $href = &unicode::raw_filename_to_url_encoded($href);
     766    $href = &unicode::filename_to_url($href);
     767
    742768    &ghtml::urlsafe ($href);
     769    print STDERR "***!!! href=$href\n";   
     770
    743771    return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back;
    744772    } else {
    745     # link is to some other type of file (eg image) so we'll
     773    # link is to some other type of file (e.g., an image) so we'll
    746774    # need to associate that file
    747775    return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back;
     
    769797    $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/;
    770798    }
    771     # Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some
    772     # situations. If the *original* file's name was in URL encoding, the following method will not decode it.
     799
     800    # Replace %XX's in URL with decoded value if required. Note that the
     801    # filename may include the %XX in some situations. If the *original*
     802    # file's name was in URL encoding, the following method will not decode
     803    # it.
    773804    my $utf8_filename = $filename;
    774     $filename = $self->opt_url_decode($utf8_filename);
     805    my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename);
     806
     807    my $content_encoding= $self->{'content_encoding'} || "utf8";
     808
     809    # The filenames that come through the HTML file have been decoded
     810    # into Unicode aware Perl strings.  Need to convert them back
     811    # to their initial raw-byte encoding to match the file that
     812    # exists on the file system
     813    $filename = encode($content_encoding, $opt_decode_utf8_filename);
     814
    775815
    776816    # some special processing if the intended filename was converted to utf8, but
     
    778818    if (!-e $filename) {
    779819    # try the original filename stored in map
    780     my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
     820    print STDERR "***###!! orig filename did not exist: $filename\n";
     821
     822    my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};
     823
     824    print STDERR "**** Trying for $original_filename\n";
     825
    781826    if (defined $original_filename && -e $original_filename) {
     827        print STDERR "*** found match\n";
    782828        $filename = $original_filename;
    783829    }
     
    11181164    $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
    11191165    $title =~ s/^\s+//s; # in case title_sub introduced any...
    1120     $doc_obj->add_utf8_metadata ($section, 'Title', $title);
     1166    print STDERR "**** adding Title: ", Encode::encode("utf8",$title), "\n";
     1167    $doc_obj->add_utf8_metadata ($section, "Title", $title);
    11211168    print $outhandle " extracted Title metadata \"$title\" from $from\n"
    11221169        if ($self->{'verbosity'} > 2);
  • main/trunk/greenstone2/perllib/plugins/ImageConverter.pm

    r22663 r23335  
    152152sub generate_images {
    153153    my $self = shift(@_);
    154     my ($filename_full_path, $filename_no_path, $doc_obj, $section) = @_;
     154    my ($filename_full_path, $filename_no_path, $doc_obj, $section, $filename_encoding) = @_;
    155155
    156156    # check image magick status
     
    196196
    197197    if ($converttotype ne "" && $filename_full_path !~ m/$converttotype$/) {
    198     #    $doc_obj->add_utf8_metadata($section, "Image", $utf8_filename_meta);
     198#   #    $doc_obj->add_utf8_metadata($section, "Image", $utf8_filename_meta);
    199199
    200200    my ($result, $converted_filename_full_path)
     
    214214
    215215    # here we overwrite the original with the potentially converted one
    216     $doc_obj->set_utf8_metadata_element($section, "Source", &unicode::url_decode($filename_no_path)); # displayname of generated image
    217     $doc_obj->set_utf8_metadata_element($section, "SourceFile", $url_to_filename_no_path); # displayname of generated image
     216#    $doc_obj->set_utf8_metadata_element($section, "Source", &unicode::url_decode($filename_no_path)); # displayname of generated image
     217#    $doc_obj->set_utf8_metadata_element($section, "SourceFile", $url_to_filename_no_path); # displayname of generated image
     218
     219#    $self->set_Source_metadata($doc_obj,$url_to_filename_no_path,undef);
     220
     221    $self->set_Source_metadata($doc_obj,&unicode::url_decode($filename_no_path),
     222                   $filename_encoding);
     223
    218224
    219225    # use identify to get info about the (possibly converted) image
  • main/trunk/greenstone2/perllib/plugins/ImagePlugin.pm

    r22663 r23335  
    110110    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    111111
    112 
    113112    my $outhandle = $self->{'outhandle'};
    114113    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
     
    116115    if ($self->{'image_conversion_available'} == 1)
    117116    {
    118     my $utf8_filename_no_path = $self->filepath_to_utf8($filename_no_path);
    119     my $url_encoded_filename = &util::rename_file($utf8_filename_no_path, $self->{'file_rename_method'});
    120     $self->generate_images($filename_full_path, $url_encoded_filename, $doc_obj, $doc_obj->get_top_section()); # should we check the return value?
     117    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata);
     118
     119#   my $utf8_filename_no_path = $self->filepath_to_utf8($filename_no_path);
     120#   my $url_encoded_filename = &util::rename_file($utf8_filename_no_path, $self->{'file_rename_method'});
     121
     122#   $self->generate_images($filename_full_path, $url_encoded_filename,
     123#                  $doc_obj, $doc_obj->get_top_section()); # should we check the return value?
     124
     125    $filename_no_path = &unicode::raw_filename_to_url_encoded($filename_no_path);
     126
     127    # should we check the return value?
     128    $self->generate_images($filename_full_path, $filename_no_path,
     129                   $doc_obj, $doc_obj->get_top_section(),$filename_encoding);
     130
    121131    }
    122132    else
  • main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm

    r22857 r23335  
    128128
    129129    # Do encoding stuff
    130     my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
     130    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
    131131    if ($self->{'verbosity'} > 2) {
    132     print $outhandle "ReadTextFile: reading $file as ($encoding,$language)\n";
     132    print $outhandle "ReadTextFile: reading $file as ($content_encoding,$language)\n";
    133133    }
    134134
     
    140140    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
    141141    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
    142     $self->set_Source_metadata($doc_obj, $filename_no_path, $encoding);
     142
     143    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata);
     144    $self->set_Source_metadata($doc_obj, $filename_no_path, $filename_encoding);
    143145
    144146    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    145     $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
     147    $doc_obj->add_utf8_metadata($top_section, "Encoding", $content_encoding);
    146148   
    147149    # read in file ($text will be in utf8)
    148150    my $text = "";
    149     $self->read_file ($filename_full_path, $encoding, $language, \$text);
     151    $self->read_file ($filename_full_path, $content_encoding, $language, \$text);
    150152
    151153    if (!length ($text)) {
Note: See TracChangeset for help on using the changeset viewer.