Changeset 23335

Show
Ignore:
Timestamp:
19.11.2010 13:29:29 (9 years ago)
Author:
davidb
Message:

Work done on improving handling of filenames when the actual filename encoding used is not necessarily known. Tested on Linux. Work currently includes some debug statements that will be removed once testing on Windows and Mac is done.

Location:
main/trunk/greenstone2/perllib/plugins
Files:
6 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

    r23279 r23335  
    3131 
    3232use File::Basename; 
     33use Encode; 
    3334 
    3435use encodings; 
     
    785786} 
    786787 
    787 # is there ever only one Source? Sometimes this will be called twice, for images etc that are converted. 
     788 
     789 
     790sub deduce_filename_encoding 
     791{ 
     792    my $self = shift (@_);   
     793    my ($file,$metadata) = @_; 
     794 
     795    my $gs_filename_encoding = $metadata->{"gs.filename_encoding"}; 
     796    my $deduced_filename_encoding = undef; 
     797     
     798    # Start by looking for manually assigned metadata 
     799    if (defined $gs_filename_encoding) { 
     800    if (ref ($gs_filename_encoding) eq "ARRAY") { 
     801        my $outhandle = $self->{'outhandle'}; 
     802         
     803        $deduced_filename_encoding = $gs_filename_encoding->[0]; 
     804         
     805        my $num_vals = scalar(@$gs_filename_encoding); 
     806        if ($num_vals>1) { 
     807        print $outhandle "Warning: gs.filename_encoding multiply defined for $file\n"; 
     808        print $outhandle "         Selecting first value: $deduced_filename_encoding\n"; 
     809        } 
     810    }  
     811    else { 
     812        $deduced_filename_encoding = $gs_filename_encoding; 
     813    } 
     814    } 
     815     
     816#   binmode(STDERR,":utf8"); 
     817     
     818#   print STDERR "**** file = $file\n"; 
     819#   print STDERR "**** debug file = ", &unicode::debug_unicode_string($file),"\n";; 
     820     
     821#   print STDERR "******* dfe = $deduced_filename_encoding\n"; 
     822     
     823    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) { 
     824    # Look to see if plugin specifies this value 
     825    } 
     826     
     827    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) { 
     828    # See if we can determine the file system encoding through locale 
     829    # Unix only ? 
     830     
     831    # if locale shows us filesystem is utf8, check to see filename is consistent 
     832    # => if not, then we have an "alien" filename on our hands 
     833    } 
     834     
     835     
     836    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) { 
     837    # Last chance, apply textcat to deduce filename encoding 
     838    } 
     839     
     840    return $deduced_filename_encoding; 
     841} 
     842 
     843 
     844 
     845 
     846# Notionally written to be called once for each document, it is however safe to 
     847# call multiple times (as in the case of ImagePlugin) which calls this later on 
     848# after the original image has potentially been converted to a *new* source image 
     849# format (e.g. TIFF to PNG) 
     850 
    788851sub set_Source_metadata { 
    789852    my $self = shift (@_);   
    790     my ($doc_obj, $filename_no_path, $file_encoding) = @_; 
     853    my ($doc_obj, $raw_file, $filename_encoding) = @_; 
     854 
     855    # 1. Sets the filename (Source) for display encoded as Unicode if possible, 
     856    #    and (as a fallback) using %xx if not for non-ascii chars 
     857    # 2. Sets the url ref (SourceFile) to the URL encoded version 
     858    #    of filename for generated files 
    791859 
    792860    my $top_section = $doc_obj->get_top_section(); 
    793861     
    794862    # UTF-8 version of filename 
    795     my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding); 
     863    print STDERR "**** setting Source Metadata given: $raw_file\n"; 
     864 
     865##    my $filemeta = $self->filename_to_utf8_metadata($raw_file, $filename_encoding); 
     866 
     867    my $url_encoded_filename; 
     868    if (defined $filename_encoding) { 
     869    # => Generate a pretty print version of filename that is mapped to Unicode 
     870 
     871    # Use filename_encoding to map raw filename to a Perl unicode-aware string  
     872    $url_encoded_filename = decode($filename_encoding,$raw_file); 
     873 
     874    print STDERR "@@@@ pretty print using $filename_encoding: ", encode("utf8",$url_encoded_filename),"\n"; 
     875    } 
     876    else { 
     877    # otherwise generate %xx encoded version of filename for char > 127 
     878    $url_encoded_filename = &unicode::raw_filename_to_url_encoded($raw_file); 
     879    } 
     880 
     881    print STDERR "***** saving Source as:             $url_encoded_filename\n"; 
     882 
    796883     
    797884    # Source is the UTF8 display name - not necessarily the name of the file on the system 
    798     $doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta);  
    799  
    800     $filemeta = &util::rename_file($filemeta, $self->{'file_rename_method'}); 
     885    $doc_obj->set_utf8_metadata_element($top_section, "Source", $url_encoded_filename);  
     886 
     887    my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'}); 
    801888    # If using URL encoding, then SourceFile is the url-reference to url-encoded 
    802     # filemeta: it's a url that refers to the actual file on the system 
    803     $filemeta = &unicode::filename_to_url($filemeta); 
    804  
    805     $doc_obj->set_utf8_metadata_element($top_section, "SourceFile", $filemeta); 
     889    # renamed_raw_url: it's a url that refers to the actual file on the system 
     890    my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file); 
     891 
     892    $doc_obj->set_utf8_metadata_element($top_section, "SourceFile", 
     893                    $renamed_raw_url); 
     894 
     895    print STDERR "***** saving SourceFile as:         $renamed_raw_url\n"; 
    806896} 
    807897    
     
    863953  
    864954 
    865     # sets the UTF8 filename (Source) for display and sets the url ref to URL encoded version 
    866     # of the UTF8 filename (SourceFile) for generated files 
    867     $self->set_Source_metadata($doc_obj, $filename_no_path); 
    868  
     955    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata); 
     956    $self->set_Source_metadata($doc_obj,$filename_no_path,$filename_encoding); 
    869957 
    870958    # plugin specific stuff - what args do we need here?? 
     
    10321120    my $file_derived_title = $file; 
    10331121    $file_derived_title =~ s/_/ /g; 
    1034     $file_derived_title =~ s/\.[^.]$//; 
     1122    $file_derived_title =~ s/\.[^.]+$//; 
    10351123 
    10361124    return $file_derived_title; 
     
    10441132 
    10451133    if (!defined $doc_obj->get_metadata_element ($section, "Title")  
    1046     or $doc_obj->get_metadata_element($section, "Title") eq "") { 
    1047  
    1048     my $file_derived_title = $self->filename_to_utf8_metadata($self->filename_based_title($file)); 
     1134    || $doc_obj->get_metadata_element($section, "Title") eq "") { 
     1135 
     1136    my $source_file = $doc_obj->get_metadata_element($section, "Source"); 
     1137    my $file_derived_title; 
     1138    if (defined $source_file) { 
     1139        $file_derived_title =  $self->filename_based_title($source_file); 
     1140    } 
     1141    else { 
     1142        # pp = pretty print 
     1143        my $pp_file = (defined $source_file) ? $source_file : $file; 
     1144 
     1145        my $raw_title = $self->filename_based_title($file); 
     1146        my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title); 
     1147    } 
     1148 
     1149 
    10491150    if (!defined $doc_obj->get_metadata_element ($section, "Title")) { 
    10501151        $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title); 
  • main/trunk/greenstone2/perllib/plugins/DirectoryPlugin.pm

    r23277 r23335  
    312312     
    313313    for (my $i = 0; $i < scalar(@dir); $i++) { 
    314     my $subfile = $dir[$i]; 
     314    my $raw_subfile = $dir[$i]; 
     315    next if ($raw_subfile =~ m/^\.\.?$/); 
     316 
    315317    my $this_file_base_dir = $base_dir; 
    316     next if ($subfile =~ m/^\.\.?$/); 
    317  
    318     # Recursively read each $subfile 
    319     print $outhandle "DirectoryPlugin block recurring: $subfile\n" if ($verbosity > 2); 
     318    my $raw_file_subfile = &util::filename_cat($file, $raw_subfile); 
     319 
     320    # Recursively read each $raw_subfile 
     321    print $outhandle "DirectoryPlugin block recurring: $raw_file_subfile\n" if ($verbosity > 2); 
    320322     
    321323    #$count += &plugin::file_block_read ($pluginfo, $this_file_base_dir, 
     324 
    322325    &plugin::file_block_read ($pluginfo, $this_file_base_dir, 
    323                   &util::filename_cat($file, $subfile), 
     326                  $raw_file_subfile, 
    324327                  $block_hash, $metadata, $gli); 
    325328     
     
    390393    } 
    391394    @dir = readdir (DIR); 
     395    map { $_ = &unicode::raw_filename_to_url_encoded($_) } @dir; 
    392396    closedir (DIR); 
    393397 
     
    436440    for (my $i = 0; $i < scalar(@dir); $i++) { 
    437441    my $subfile = $dir[$i]; 
     442    next if ($subfile =~ m/^\.\.?$/); 
     443 
    438444    my $this_file_base_dir = $base_dir; 
    439     next if ($subfile =~ m/^\.\.?$/); 
    440     my $file_subfile = &util::filename_cat($file, $subfile); 
    441     my $full_filename = &util::filename_cat($this_file_base_dir, $file_subfile); 
    442     if ($self->file_is_blocked($block_hash,$full_filename)) { 
    443         print STDERR "DirectoryPlugin: file $full_filename was blocked for metadata_read\n" if ($verbosity > 2); 
     445    my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile); 
     446 
     447    my $raw_file_subfile = &util::filename_cat($file, $raw_subfile); 
     448    my $raw_full_filename = &util::filename_cat($this_file_base_dir, $raw_file_subfile); 
     449 
     450    if ($self->file_is_blocked($block_hash,$raw_full_filename)) { 
     451        print STDERR "DirectoryPlugin: file $raw_full_filename was blocked for metadata_read\n" if ($verbosity > 2); 
    444452        next; 
    445453    } 
    446454     
    447     # Recursively read each $subfile 
    448     print $outhandle "DirectoryPlugin metadata recurring: $subfile\n" if ($verbosity > 2); 
     455    # Recursively read each $raw_subfile 
     456    print $outhandle "DirectoryPlugin metadata recurring: $raw_subfile\n" if ($verbosity > 2); 
    449457     
    450458    &plugin::metadata_read ($pluginfo, $this_file_base_dir, 
    451                 $file_subfile,$block_hash, 
     459                $raw_file_subfile,$block_hash, 
    452460                \@extrametakeys, \%extrametadata, 
    453461                \%extrametafile, 
     
    497505        last if (!opendir (DIR, $dirname)); 
    498506        my @dirnow = readdir (DIR); 
     507        map { $_ = &unicode::raw_filename_to_url_encoded($_) } @dirnow; 
    499508        closedir (DIR); 
    500509 
     
    518527 
    519528    my $subfile = $dir[$i]; 
    520     my $this_file_base_dir = $base_dir; 
    521529    last if ($maxdocs != -1 && ($count + $total_count) >= $maxdocs); 
    522530    next if ($subfile =~ /^\.\.?$/); 
    523531 
    524     my $file_subfile = &util::filename_cat($file, $subfile); 
    525     my $full_filename  
    526         = &util::filename_cat($this_file_base_dir,$file_subfile); 
    527  
    528     if ($self->file_is_blocked($block_hash,$full_filename)) { 
    529         print STDERR "DirectoryPlugin: file $full_filename was blocked for read\n" if ($verbosity > 2); 
     532    my $this_file_base_dir = $base_dir; 
     533    my $raw_subfile = &unicode::url_encoded_to_raw_filename($subfile); 
     534 
     535    my $raw_file_subfile = &util::filename_cat($file, $raw_subfile); 
     536    my $raw_full_filename  
     537        = &util::filename_cat($this_file_base_dir,$raw_file_subfile); 
     538 
     539    if ($self->file_is_blocked($block_hash,$raw_full_filename)) { 
     540        print STDERR "DirectoryPlugin: file $raw_full_filename was blocked for read\n" if ($verbosity > 2); 
    530541        next; 
    531542    } 
    532     #print STDERR "processing $full_filename\n"; 
     543    #print STDERR "processing $raw_full_filename\n"; 
    533544    # Follow Windows shortcuts 
    534     if ($subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) { 
     545    if ($raw_subfile =~ /(?i)\.lnk$/ && $ENV{'GSDLOS'} =~ /^windows$/i) { 
    535546        require Win32::Shortcut; 
    536         my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $subfile)); 
     547        my $shortcut = new Win32::Shortcut(&util::filename_cat($dirname, $raw_subfile)); 
    537548        if ($shortcut) { 
    538549        # The file to be processed is now the target of the shortcut 
    539550        $this_file_base_dir = ""; 
    540551        $file = ""; 
    541         $subfile = $shortcut->Path; 
     552        $raw_subfile = $shortcut->Path; 
    542553        } 
    543554    } 
    544555 
    545556    # check for a symlink pointing back to a leading directory 
    546     if (-d "$dirname/$subfile" && -l "$dirname/$subfile") { 
     557    if (-d "$dirname/$raw_subfile" && -l "$dirname/$raw_subfile") { 
    547558        # readlink gives a "fatal error" on systems that don't implement 
    548559        # symlinks. This assumes the the -l test above would fail on those. 
    549         my $linkdest=readlink "$dirname/$subfile"; 
     560        my $linkdest=readlink "$dirname/$raw_subfile"; 
    550561        if (!defined ($linkdest)) { 
    551562        # system error - file not found? 
     
    555566        if ($linkdest =~ m@^[\./\\]+$@ || 
    556567            index($dirname, $linkdest) != -1) { 
    557             warn "DirectoryPlugin: Ignoring recursive symlink ($dirname/$subfile -> $linkdest)\n"; 
     568            warn "DirectoryPlugin: Ignoring recursive symlink ($dirname/$raw_subfile -> $linkdest)\n"; 
    558569            next; 
    559570            ; 
     
    562573    } 
    563574 
    564     print $outhandle "DirectoryPlugin: preparing metadata for $subfile\n" if ($verbosity > 2); 
    565  
    566     # Make a copy of $in_metadata to pass to $subfile 
     575    print $outhandle "DirectoryPlugin: preparing metadata for $raw_subfile\n" if ($verbosity > 2); 
     576 
     577    # Make a copy of $in_metadata to pass to $raw_subfile 
    567578    my $out_metadata = {}; 
    568579    &metadatautil::combine_metadata_structures($out_metadata, $in_metadata); 
    569580 
    570581    # check the assocfile_info 
    571     if (defined $self->{'assocfile_info'}->{$full_filename}) { 
    572         &metadatautil::combine_metadata_structures($out_metadata, $self->{'assocfile_info'}->{$full_filename}); 
    573     } 
    574         ## encode the filename as perl5 doesn't handle unicode filenames     
    575         
    576         my $tmpfile = Encode::encode_utf8($subfile);  
     582    if (defined $self->{'assocfile_info'}->{$raw_full_filename}) { 
     583        &metadatautil::combine_metadata_structures($out_metadata, $self->{'assocfile_info'}->{$raw_full_filename}); 
     584    } 
     585 
     586    # $subfile by this point is url-encoded => all ASCII chars => no need to encode as UTF8 
     587 
    577588    # Next add metadata read in XML files (if it is supplied) 
    578589    if ($additionalmetadata == 1) { 
    579590        foreach my $filespec (@extrametakeys) { 
    580         ## use the utf8 encoded filename to do the filename comparison 
    581         if ($tmpfile =~ /^$filespec$/) { 
     591        ## use the url-encoded filename to do the filename comparison 
     592 
     593        if ($subfile =~ /^$filespec$/) { 
    582594            print $outhandle "File \"$subfile\" matches filespec \"$filespec\"\n"  
    583595            if ($verbosity > 2); 
     
    605617        # Look to see if it's a completely new file 
    606618 
    607         if (!$block_hash->{'new_files'}->{$full_filename}) { 
     619        if (!$block_hash->{'new_files'}->{$raw_full_filename}) { 
    608620        # Not a new file, must be an existing file 
    609621        # Let' see if it's newer than the last import.pl 
    610622 
    611623 
    612         if (! -d $full_filename) { 
    613             if (!$block_hash->{'reindex_files'}->{$full_filename}) { 
     624        if (! -d $raw_full_filename) { 
     625            if (!$block_hash->{'reindex_files'}->{$raw_full_filename}) { 
    614626            # filename has been around for longer than inf_timestamp 
    615627            print $outhandle "**** Skipping $subfile\n" if ($verbosity >3); 
     
    634646     
    635647    $count += &plugin::read ($pluginfo, $this_file_base_dir, 
    636                  $file_subfile, $block_hash, 
     648                 $raw_file_subfile, $block_hash, 
    637649                 $out_metadata, $processor, $maxdocs, ($total_count + $count), $gli); 
    638650    } 
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r22951 r23335  
    181181    my @file_blocks; 
    182182     
    183     my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path); 
     183    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path); 
     184    $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding; 
    184185 
    185186    # read in file ($text will be in utf8) 
     
    229230    # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename 
    230231    my $utf8_link = ""; 
    231     $self->decode_text($link,$encoding,$language,\$utf8_link); 
     232    $self->decode_text($link,$content_encoding,$language,\$utf8_link); 
    232233 
    233234    $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename; 
     
    272273         
    273274    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
     275 
     276    # Lookup content_encoding worked out in file_block pass for this file 
     277    # Store it under the local name 'content_encoding' so its nice and  
     278    # easy to access 
     279    $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path}; 
     280 
    274281    # get the input file 
    275282    my $input_filename = $file; 
     
    311318    $self->set_Source_metadata($doc_obj, $filename_no_path); 
    312319    } 
     320 
     321    delete $self->{'store_content_encoding'}->{$filename_full_path}; 
     322    $self->{'content_encoding'} = undef; 
     323 
    313324    return ($process_status,$doc_obj); 
    314325} 
     
    367378    # links, so even if 'file_is_url' is off, still need to store info 
    368379 
    369     my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$"); 
    370     my $utf8_file = $self->filename_to_utf8_metadata($file); 
    371     $utf8_file =~ s/&\#095;/_/g; 
     380    my ($tailname,$dirname) = &File::Basename::fileparse($file); 
     381    print STDERR "***!! file = $file\n"; 
     382#    my $utf8_file = $self->filename_to_utf8_metadata($file); 
     383#    $utf8_file =~ s/&\#095;/_/g; 
     384    my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname); 
     385    print STDERR "***!! utf8_file = $utf8_file\n"; 
     386 
    372387    my $web_url = "http://"; 
    373388    if(defined $dirname) { # local directory 
     
    535550    $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection); 
    536551    } 
     552 
    537553    return 1; 
    538554} 
     
    737753    return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is; 
    738754 
    739  
    740755    if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||  
    741756    ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) { 
     757 
     758 
     759    # If web page didn't give encoding, then default to utf8 
     760    print "*************** looking up $file\n"; 
     761 
     762    my $content_encoding= $self->{'content_encoding'} || "utf8"; 
     763    $href = encode($content_encoding,$href); 
     764 
     765    $href = &unicode::raw_filename_to_url_encoded($href); 
     766    $href = &unicode::filename_to_url($href); 
     767 
    742768    &ghtml::urlsafe ($href); 
     769    print STDERR "***!!! href=$href\n";     
     770 
    743771    return $front . "_httpextlink_&amp;rl=" . $rl . "&amp;href=" . $href . $hash_part . $back; 
    744772    } else { 
    745     # link is to some other type of file (eg image) so we'll 
     773    # link is to some other type of file (e.g., an image) so we'll 
    746774    # need to associate that file 
    747775    return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back; 
     
    769797    $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/; 
    770798    } 
    771     # Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some 
    772     # situations. If the *original* file's name was in URL encoding, the following method will not decode it. 
     799 
     800    # Replace %XX's in URL with decoded value if required. Note that the 
     801    # filename may include the %XX in some situations. If the *original* 
     802    # file's name was in URL encoding, the following method will not decode 
     803    # it. 
    773804    my $utf8_filename = $filename; 
    774     $filename = $self->opt_url_decode($utf8_filename); 
     805    my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename); 
     806 
     807    my $content_encoding= $self->{'content_encoding'} || "utf8"; 
     808 
     809    # The filenames that come through the HTML file have been decoded 
     810    # into Unicode aware Perl strings.  Need to convert them back 
     811    # to their initial raw-byte encoding to match the file that 
     812    # exists on the file system 
     813    $filename = encode($content_encoding, $opt_decode_utf8_filename); 
     814 
    775815 
    776816    # some special processing if the intended filename was converted to utf8, but 
     
    778818    if (!-e $filename) { 
    779819    # try the original filename stored in map 
    780     my $original_filename = $self->{'utf8_to_original_filename'}->{$filename}; 
     820    print STDERR "***###!! orig filename did not exist: $filename\n"; 
     821 
     822    my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename}; 
     823 
     824    print STDERR "**** Trying for $original_filename\n"; 
     825 
    781826    if (defined $original_filename && -e $original_filename) { 
     827        print STDERR "*** found match\n"; 
    782828        $filename = $original_filename; 
    783829    } 
     
    11181164    $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'}); 
    11191165    $title =~ s/^\s+//s; # in case title_sub introduced any... 
    1120     $doc_obj->add_utf8_metadata ($section, 'Title', $title); 
     1166    print STDERR "**** adding Title: ", Encode::encode("utf8",$title), "\n"; 
     1167    $doc_obj->add_utf8_metadata ($section, "Title", $title); 
    11211168    print $outhandle " extracted Title metadata \"$title\" from $from\n"  
    11221169        if ($self->{'verbosity'} > 2); 
  • main/trunk/greenstone2/perllib/plugins/ImageConverter.pm

    r22663 r23335  
    152152sub generate_images { 
    153153    my $self = shift(@_); 
    154     my ($filename_full_path, $filename_no_path, $doc_obj, $section) = @_; 
     154    my ($filename_full_path, $filename_no_path, $doc_obj, $section, $filename_encoding) = @_; 
    155155 
    156156    # check image magick status 
     
    196196 
    197197    if ($converttotype ne "" && $filename_full_path !~ m/$converttotype$/) { 
    198     #    $doc_obj->add_utf8_metadata($section, "Image", $utf8_filename_meta); 
     198#   #    $doc_obj->add_utf8_metadata($section, "Image", $utf8_filename_meta); 
    199199 
    200200    my ($result, $converted_filename_full_path) 
     
    214214 
    215215    # here we overwrite the original with the potentially converted one 
    216     $doc_obj->set_utf8_metadata_element($section, "Source", &unicode::url_decode($filename_no_path)); # displayname of generated image 
    217     $doc_obj->set_utf8_metadata_element($section, "SourceFile", $url_to_filename_no_path); # displayname of generated image 
     216#    $doc_obj->set_utf8_metadata_element($section, "Source", &unicode::url_decode($filename_no_path)); # displayname of generated image 
     217#    $doc_obj->set_utf8_metadata_element($section, "SourceFile", $url_to_filename_no_path); # displayname of generated image 
     218 
     219#    $self->set_Source_metadata($doc_obj,$url_to_filename_no_path,undef); 
     220 
     221    $self->set_Source_metadata($doc_obj,&unicode::url_decode($filename_no_path), 
     222                   $filename_encoding); 
     223 
    218224 
    219225    # use identify to get info about the (possibly converted) image 
  • main/trunk/greenstone2/perllib/plugins/ImagePlugin.pm

    r22663 r23335  
    110110    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; 
    111111 
    112  
    113112    my $outhandle = $self->{'outhandle'}; 
    114113    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
     
    116115    if ($self->{'image_conversion_available'} == 1) 
    117116    { 
    118     my $utf8_filename_no_path = $self->filepath_to_utf8($filename_no_path); 
    119     my $url_encoded_filename = &util::rename_file($utf8_filename_no_path, $self->{'file_rename_method'}); 
    120     $self->generate_images($filename_full_path, $url_encoded_filename, $doc_obj, $doc_obj->get_top_section()); # should we check the return value? 
     117    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata); 
     118 
     119#   my $utf8_filename_no_path = $self->filepath_to_utf8($filename_no_path); 
     120#   my $url_encoded_filename = &util::rename_file($utf8_filename_no_path, $self->{'file_rename_method'}); 
     121 
     122#   $self->generate_images($filename_full_path, $url_encoded_filename,  
     123#                  $doc_obj, $doc_obj->get_top_section()); # should we check the return value? 
     124 
     125    $filename_no_path = &unicode::raw_filename_to_url_encoded($filename_no_path); 
     126 
     127    # should we check the return value? 
     128    $self->generate_images($filename_full_path, $filename_no_path,  
     129                   $doc_obj, $doc_obj->get_top_section(),$filename_encoding);  
     130 
    121131    } 
    122132    else 
  • main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm

    r22857 r23335  
    128128 
    129129    # Do encoding stuff 
    130     my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path); 
     130    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path); 
    131131    if ($self->{'verbosity'} > 2) { 
    132     print $outhandle "ReadTextFile: reading $file as ($encoding,$language)\n"; 
     132    print $outhandle "ReadTextFile: reading $file as ($content_encoding,$language)\n"; 
    133133    } 
    134134 
     
    140140    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}"); 
    141141    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path)); 
    142     $self->set_Source_metadata($doc_obj, $filename_no_path, $encoding); 
     142 
     143    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata); 
     144    $self->set_Source_metadata($doc_obj, $filename_no_path, $filename_encoding); 
    143145 
    144146    $doc_obj->add_utf8_metadata($top_section, "Language", $language); 
    145     $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding); 
     147    $doc_obj->add_utf8_metadata($top_section, "Encoding", $content_encoding); 
    146148     
    147149    # read in file ($text will be in utf8) 
    148150    my $text = ""; 
    149     $self->read_file ($filename_full_path, $encoding, $language, \$text); 
     151    $self->read_file ($filename_full_path, $content_encoding, $language, \$text); 
    150152 
    151153    if (!length ($text)) {