Ignore:
Timestamp:
2010-11-19T13:29:29+13:00 (13 years ago)
Author:
davidb
Message:

Work done on improving handling of filenames when the actual filename encoding used is not necessarily known. Tested for Linux. Work currently includes some debug statements that will be removed once testing for Windows and Mac is done.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

    r23279 r23335  
    3131
    3232use File::Basename;
     33use Encode;
    3334
    3435use encodings;
     
    785786}
    786787
    787 # is there ever only one Source? Sometimes this will be called twice, for images etc that are converted.
     788
     789
     790sub deduce_filename_encoding
     791{
     792    my $self = shift (@_); 
     793    my ($file,$metadata) = @_;
     794
     795    my $gs_filename_encoding = $metadata->{"gs.filename_encoding"};
     796    my $deduced_filename_encoding = undef;
     797   
     798    # Start by looking for manually assigned metadata
     799    if (defined $gs_filename_encoding) {
     800    if (ref ($gs_filename_encoding) eq "ARRAY") {
     801        my $outhandle = $self->{'outhandle'};
     802       
     803        $deduced_filename_encoding = $gs_filename_encoding->[0];
     804       
     805        my $num_vals = scalar(@$gs_filename_encoding);
     806        if ($num_vals>1) {
     807        print $outhandle "Warning: gs.filename_encoding multiply defined for $file\n";
     808        print $outhandle "         Selecting first value: $deduced_filename_encoding\n";
     809        }
     810    }
     811    else {
     812        $deduced_filename_encoding = $gs_filename_encoding;
     813    }
     814    }
     815   
     816#   binmode(STDERR,":utf8");
     817   
     818#   print STDERR "**** file = $file\n";
     819#   print STDERR "**** debug file = ", &unicode::debug_unicode_string($file),"\n";;
     820   
     821#   print STDERR "******* dfe = $deduced_filename_encoding\n";
     822   
     823    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
     824    # Look to see if plugin specifies this value
     825    }
     826   
     827    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
     828    # See if we can determine the file system encoding through locale
     829    # Unix only ?
     830   
     831    # if locale shows us filesystem is utf8, check to see filename is consistent
     832    # => if not, then we have an "alien" filename on our hands
     833    }
     834   
     835   
     836    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
     837    # Last chance, apply textcat to deduce filename encoding
     838    }
     839   
     840    return $deduced_filename_encoding;
     841}
     842
     843
     844
     845
     846# Notionally written to be called once for each document, it is however safe to
     847# call multiple times (as in the case of ImagePlugin) which calls this later on
     848# after the original image has potentially been converted to a *new* source image
     849# format (e.g. TIFF to PNG)
     850
    788851sub set_Source_metadata {
    789852    my $self = shift (@_); 
    790     my ($doc_obj, $filename_no_path, $file_encoding) = @_;
     853    my ($doc_obj, $raw_file, $filename_encoding) = @_;
     854
     855    # 1. Sets the filename (Source) for display encoded as Unicode if possible,
     856    #    and (as a fallback) using %xx if not for non-ascii chars
     857    # 2. Sets the url ref (SourceFile) to the URL encoded version
     858    #    of filename for generated files
    791859
    792860    my $top_section = $doc_obj->get_top_section();
    793861   
    794862    # UTF-8 version of filename
    795     my $filemeta = $self->filename_to_utf8_metadata($filename_no_path, $file_encoding);
     863    print STDERR "**** setting Source Metadata given: $raw_file\n";
     864
     865##    my $filemeta = $self->filename_to_utf8_metadata($raw_file, $filename_encoding);
     866
     867    my $url_encoded_filename;
     868    if (defined $filename_encoding) {
     869    # => Generate a pretty print version of filename that is mapped to Unicode
     870
     871    # Use filename_encoding to map raw filename to a Perl unicode-aware string
     872    $url_encoded_filename = decode($filename_encoding,$raw_file);
     873
     874    print STDERR "@@@@ pretty print using $filename_encoding: ", encode("utf8",$url_encoded_filename),"\n";
     875    }
     876    else {
     877    # otherwise generate %xx encoded version of filename for char > 127
     878    $url_encoded_filename = &unicode::raw_filename_to_url_encoded($raw_file);
     879    }
     880
     881    print STDERR "***** saving Source as:             $url_encoded_filename\n";
     882
    796883   
    797884    # Source is the UTF8 display name - not necessarily the name of the file on the system
    798     $doc_obj->set_utf8_metadata_element($top_section, "Source", $filemeta);
    799 
    800     $filemeta = &util::rename_file($filemeta, $self->{'file_rename_method'});
     885    $doc_obj->set_utf8_metadata_element($top_section, "Source", $url_encoded_filename);
     886
     887    my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
    801888    # If using URL encoding, then SourceFile is the url-reference to url-encoded
    802     # filemeta: it's a url that refers to the actual file on the system
    803     $filemeta = &unicode::filename_to_url($filemeta);
    804 
    805     $doc_obj->set_utf8_metadata_element($top_section, "SourceFile", $filemeta);
     889    # renamed_raw_url: it's a url that refers to the actual file on the system
     890    my $renamed_raw_url = &unicode::filename_to_url($renamed_raw_file);
     891
     892    $doc_obj->set_utf8_metadata_element($top_section, "SourceFile",
     893                    $renamed_raw_url);
     894
     895    print STDERR "***** saving SourceFile as:         $renamed_raw_url\n";
    806896}
    807897   
     
    863953 
    864954
    865     # sets the UTF8 filename (Source) for display and sets the url ref to URL encoded version
    866     # of the UTF8 filename (SourceFile) for generated files
    867     $self->set_Source_metadata($doc_obj, $filename_no_path);
    868 
     955    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata);
     956    $self->set_Source_metadata($doc_obj,$filename_no_path,$filename_encoding);
    869957
    870958    # plugin specific stuff - what args do we need here??
     
    10321120    my $file_derived_title = $file;
    10331121    $file_derived_title =~ s/_/ /g;
    1034     $file_derived_title =~ s/\.[^.]$//;
     1122    $file_derived_title =~ s/\.[^.]+$//;
    10351123
    10361124    return $file_derived_title;
     
    10441132
    10451133    if (!defined $doc_obj->get_metadata_element ($section, "Title")
    1046     or $doc_obj->get_metadata_element($section, "Title") eq "") {
    1047 
    1048     my $file_derived_title = $self->filename_to_utf8_metadata($self->filename_based_title($file));
     1134    || $doc_obj->get_metadata_element($section, "Title") eq "") {
     1135
     1136    my $source_file = $doc_obj->get_metadata_element($section, "Source");
     1137    my $file_derived_title;
     1138    if (defined $source_file) {
     1139        $file_derived_title =  $self->filename_based_title($source_file);
     1140    }
     1141    else {
     1142        # pp = pretty print
     1143        my $pp_file = (defined $source_file) ? $source_file : $file;
     1144
     1145        my $raw_title = $self->filename_based_title($file);
     1146        my $file_derived_title = &unicode::raw_filename_to_url_encoded($raw_title);
     1147    }
     1148
     1149
    10491150    if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
    10501151        $doc_obj->add_utf8_metadata ($section, "Title", $file_derived_title);
Note: See TracChangeset for help on using the changeset viewer.