Changeset 23352


Ignore:
Timestamp:
2010-11-28T23:24:22+13:00 (13 years ago)
Author:
davidb
Message:

Modifications to code to handle filename encoding issues found when testing under Windows

Location:
main/trunk/greenstone2/perllib/plugins
Files:
15 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

    r23347 r23352  
    565565
    566566    elsif ($filename_encoding eq "auto-filesystem-encoding")
    567     {
     567    {   
    568568    # try locale
    569569    $filename_encoding = $self->locale_encoding();
     
    740740
    741741# uses locale
    742 sub get_filesystem_encoding {
     742sub get_filesystem_encoding
     743{
    743744
    744745    my $self = shift(@_);
     
    748749
    749750    eval {
     751    # Works for Windows as well, returning the DOS code page in use
    750752    use POSIX qw(locale_h);
    751753   
     
    788790   
    789791    }
     792
    790793    return $filesystem_encoding;
    791794}
     
    840843   
    841844    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
    842         # See if we can determine the file system encoding through locale
    843         $deduced_filename_encoding = $self->locale_encoding();
    844 
    845         # if locale shows us filesystem is utf8, check to see filename is consistent
    846         # => if not, then we have an "alien" filename on our hands
    847 
    848         if ($deduced_filename_encoding =~ m/^utf-?8$/i) {
    849             if (!&unicode::check_is_utf8($file)) {
    850                 # "alien" filename, so revert
    851                 $deduced_filename_encoding = undef;
    852             }
     845
     846    # Look to file system to provide a character encoding
     847
     848    # If Windows NTFS, then -- assuming we work with long file names obtained through
     849    # Win32::GetLongPathName() -- the underlying file system is UTF16
     850
     851    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
     852        # Can do better than working with the DOS character encoding returned by locale     
     853        $deduced_filename_encoding = "unicode";
     854    }
     855    else {
     856        # Unix of some form or other
     857
     858        # See if we can determine the file system encoding through locale
     859        $deduced_filename_encoding = $self->locale_encoding();
     860   
     861        # if locale shows us filesystem is utf8, check to see filename is consistent
     862        # => if not, then we have an "alien" filename on our hands
     863       
     864        if ($deduced_filename_encoding =~ m/^utf-?8$/i) {
     865        if (!&unicode::check_is_utf8($file)) {
     866            # "alien" filename, so revert
     867            $deduced_filename_encoding = undef;
    853868        }
    854     }
    855    
     869        }
     870    }
     871    }
    856872   
    857873#    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) {
     
    884900sub set_Source_metadata {
    885901    my $self = shift (@_); 
    886     my ($doc_obj, $raw_file, $filename_encoding) = @_;
     902    my ($doc_obj, $raw_filename, $filename_encoding) = @_;
    887903
    888904    # 1. Sets the filename (Source) for display encoded as Unicode if possible,
     
    890906    # 2. Sets the url ref (SourceFile) to the URL encoded version
    891907    #    of filename for generated files
     908   
     909    my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename);
    892910
    893911    my $top_section = $doc_obj->get_top_section();
    894    
     912
     913    my $octet_file = $raw_file;
     914
    895915    # UTF-8 version of filename
    896     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    897         print STDERR "****** Setting Source Metadata given: $raw_file\n";
    898     }
     916    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     917    print STDERR "****** Setting Source Metadata given: $octet_file\n";
     918    }
     919   
     920    # Deal with (on Windows) raw filenames that are in their
     921    # abbreviated DOS form
     922
     923    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
     924    if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
     925        if (-e $raw_filename) {
     926        require Win32;
     927       
     928##      print STDERR "**** raw filename before LPN: $raw_filename\n";
     929        my $unicode_filename = Win32::GetLongPathName($raw_filename);
     930       
     931        my $unused_full_uf;
     932        ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
     933
     934##      print STDERR "**** raw filename after LPN: $raw_filename\n";       
     935        }
     936    }
     937    }
    899938
    900939    my $url_encoded_filename;
    901940    if (defined $filename_encoding) {
    902         # => Generate a pretty print version of filename that is mapped to Unicode
    903        
    904         # Use filename_encoding to map raw filename to a Perl unicode-aware string
    905         $url_encoded_filename = decode($filename_encoding,$raw_file);       
     941    # => Generate a pretty print version of filename that is mapped to Unicode
     942   
     943    # Use filename_encoding to map raw filename to a Perl unicode-aware string
     944    $url_encoded_filename = decode($filename_encoding,$octet_file);     
    906945    }
    907946    else {
    908         # otherwise generate %xx encoded version of filename for char > 127
    909         $url_encoded_filename = &unicode::raw_filename_to_url_encoded($raw_file);
    910     }
    911    
    912     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    913         print STDERR "***** saving Source as:             $url_encoded_filename\n";
    914     }
    915 
     947    # otherwise generate %xx encoded version of filename for char > 127
     948    $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file);
     949    }
     950   
     951    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     952    print STDERR "***** saving Source as:             $url_encoded_filename\n";
     953    }
     954   
    916955   
    917956    # Source is the UTF8 display name - not necessarily the name of the file on the system
    918957    $doc_obj->set_utf8_metadata_element($top_section, "Source", $url_encoded_filename);
    919 
     958   
    920959    my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'});
    921960    # If using URL encoding, then SourceFile is the url-reference to url-encoded
     
    926965                    $renamed_raw_url);
    927966
    928     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    929         print STDERR "***** saving SourceFile as:         $renamed_raw_url\n";
    930     }
     967    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     968    print STDERR "***** saving SourceFile as:         $renamed_raw_url\n";
     969    }
    931970}
    932971   
     
    9881027 
    9891028
    990     my $plugin_filename_encoding = $self->{'filename_encoding'};
     1029    my $plugin_filename_encoding = $self->{'filename_encoding'};
    9911030    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    992     $self->set_Source_metadata($doc_obj,$filename_no_path,$filename_encoding);
     1031    $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$filename_full_path);
    9931032
    9941033    # plugin specific stuff - what args do we need here??
  • main/trunk/greenstone2/perllib/plugins/CONTENTdmPlugin.pm

    r23349 r23352  
    661661    my $plugin_filename_encoding = $self->{'filename_encoding'};
    662662    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    663     $self->set_Source_metadata($doc_obj, $filemeta, $filename_encoding);
     663    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
    664664    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    665665    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path));
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r23349 r23352  
    399399    my $plugin_filename_encoding = $self->{'filename_encoding'};
    400400    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    401     $self->set_Source_metadata($doc_obj, $filename_no_path, $filename_encoding);
     401    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
    402402       
    403403    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
  • main/trunk/greenstone2/perllib/plugins/ConvertToRogPlugin.pm

    r23349 r23352  
    347347    my $plugin_filename_encoding = $self->{'filename_encoding'};
    348348    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    349     $self->set_Source_metadata($doc_obj, $filemeta, $filename_encoding);
     349    $self->set_Source_metadata($doc_obj, $conv_filename, $filename_encoding);
    350350   
    351351    if ($self->{'cover_image'}) {
  • main/trunk/greenstone2/perllib/plugins/DatabasePlugin.pm

    r23349 r23352  
    272272
    273273    my $plugin_filename_encoding = $self->{'filename_encoding'};
    274     my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    275     $self->set_Source_metadata($doc_obj, $db, $filename_encoding);
     274    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
     275    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
    276276
    277277    if ($self->{'cover_image'}) {
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r23349 r23352  
    318318
    319319    my $plugin_filename_encoding = $self->{'filename_encoding'};
    320     my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    321     $self->set_Source_metadata($doc_obj, $filename_no_path, $filename_encoding);
     320    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
     321    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
    322322    }
    323323
  • main/trunk/greenstone2/perllib/plugins/ImageConverter.pm

    r23349 r23352  
    152152sub generate_images {
    153153    my $self = shift(@_);
    154     my ($filename_full_path, $filename_no_path, $doc_obj, $section, $filename_encoding) = @_;
     154    my ($filename_full_path, $filename_encoded_full_path, $doc_obj, $section, $filename_encoding) = @_;
     155
     156    my ($unused_fefp,$filename_encoded_no_path)
     157    = util::get_full_filenames("",$filename_encoded_full_path);
     158
     159    # The following is potentially very muddled thinking (but currently seems to work)
     160    # generate_images currently called from ImagePlugin and PagedImagePlugin
     161    my $filename_no_path = $filename_encoded_no_path;
    155162
    156163    # check image magick status
     
    219226#    $self->set_Source_metadata($doc_obj,$url_to_filename_no_path,undef);
    220227
    221     $self->set_Source_metadata($doc_obj,&unicode::url_decode($filename_no_path),
    222                                $filename_encoding);
     228    my $raw_filename_full_path = &unicode::url_decode($filename_encoded_full_path);
     229    $self->set_Source_metadata($doc_obj,$raw_filename_full_path,
     230                   $filename_encoding);
    223231
    224232
  • main/trunk/greenstone2/perllib/plugins/MARCXMLPlugin.pm

    r23349 r23352  
    227227    my $plugin_filename_encoding = $self->{'filename_encoding'};
    228228    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    229     $self->set_Source_metadata($doc_obj, $filemeta, $filename_encoding);
     229    $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);
    230230
    231231    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}");
  • main/trunk/greenstone2/perllib/plugins/OAIPlugin.pm

    r23349 r23352  
    297297   
    298298    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    299     my $plugin_filename_encoding = $self->{'filename_encoding'};
     299    my $plugin_filename_encoding = $self->{'filename_encoding'};
    300300    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    301     $self->set_Source_metadata($doc_obj, $filemeta, $filename_encoding);
     301    $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);
    302302
    303303    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
  • main/trunk/greenstone2/perllib/plugins/OpenDocumentPlugin.pm

    r23349 r23352  
    268268    my $filename_encoding = $self->deduce_filename_encoding($file_only,$metadata,$plugin_filename_encoding);
    269269
    270     $self->set_Source_metadata($doc_obj, $file_only, $filename_encoding);
    271      $doc_obj->set_utf8_metadata_element("", "FileSize", (-s $filename));
     270    $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);
     271    $doc_obj->set_utf8_metadata_element("", "FileSize", (-s $filename));
    272272     
    273273    # include any metadata passed in from previous plugins
  • main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm

    r23349 r23352  
    426426    my $result = 0;
    427427    if ($self->{'image_conversion_available'} == 1) {
    428     # do we need to convert $filename_no_path to utf8? We are already reading in from a file, what encoding is it in???
    429     $result = $self->generate_images($filename_full_path, $filename_no_path, $doc_obj, $section);
     428    # do we need to convert $filename_no_path to utf8/url encoded?
     429    # We are already reading in from a file, what encoding is it in???
     430    my $url_encoded_full_filename
     431        = &unicode::raw_filename_to_url_encoded($filename_full_path);
     432    $result = $self->generate_images($filename_full_path, $url_encoded_full_filename, $doc_obj, $section);
    430433    }
    431434    #overwrite one set in ImageConverter
     
    513516    $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
    514517    # TODO is file filename_no_path??
    515     $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'file'}, $self->{'processor'}, $self->{'metadata'});
     518    $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'});
    516519
    517520    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/;
     
    540543sub set_initial_doc_fields {
    541544    my $self = shift(@_);
    542     my ($doc_obj, $filename_no_path, $processor, $metadata) = @_;
     545    my ($doc_obj, $filename_full_path, $processor, $metadata) = @_;
    543546
    544547    my $topsection = $doc_obj->get_top_section();
     
    552555    }
    553556
    554     my $plugin_filename_encoding = $self->{'filename_encoding'};
    555     my $filename_encoding = $self->deduce_filename_encoding($filename_no_path,$metadata,$plugin_filename_encoding);
    556     $self->set_Source_metadata($doc_obj, $filename_no_path, $filename_encoding);
     557    my $plugin_filename_encoding = $self->{'filename_encoding'};
     558    my $filename_encoding = $self->deduce_filename_encoding($filename_full_path,$metadata,$plugin_filename_encoding);
     559    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
    557560   
    558561    # if we want a header page, we need to add some text into the top section, otherwise this section will become invisible
     
    620623
    621624    my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    622     $self->set_initial_doc_fields($doc_obj, $filename_no_path, $processor, $metadata);
     625    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata);
    623626    my $topsection = $doc_obj->get_top_section();
    624627    open (ITEMFILE, $filename_full_path) || die "couldn't open $filename_full_path\n";
  • main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm

    r23349 r23352  
    346346    my $plugin_filename_encoding = $self->{'filename_encoding'};
    347347    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    348     $self->set_Source_metadata($doc_obj, $filename_no_path,$filename_encoding);
     348    $self->set_Source_metadata($doc_obj, $filename_full_path,$filename_encoding);
    349349       
    350350    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
  • main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm

    r23348 r23352  
    141141    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
    142142
    143     my $plugin_filename_encoding = $self->{'filename_encoding'};
     143    my $plugin_filename_encoding = $self->{'filename_encoding'};
    144144    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    145     $self->set_Source_metadata($doc_obj, $filename_no_path, $filename_encoding);
     145    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
    146146
    147147    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
  • main/trunk/greenstone2/perllib/plugins/ReadXMLFile.pm

    r23349 r23352  
    369369    my $self = shift(@_);
    370370
    371     my $metadata = $self->{'metadata'};
     371    my $metadata = $self->{'metadata'};
     372    my $filename_full_path = $self->{'filename'};
    372373
    373374    # create a new document
    374     my $doc_obj = $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});
     375    my $doc_obj = $self->{'doc_obj'} = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
    375376
    376377    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    377378
    378     my $filename_no_path = $self->{'filename_no_path'};
    379     my $plugin_filename_encoding = $self->{'filename_encoding'};
     379    my $filename_no_path = $self->{'filename_no_path'};
     380    my $plugin_filename_encoding = $self->{'filename_encoding'};
    380381    my $filename_encoding = $self->deduce_filename_encoding($filename_no_path,$metadata,$plugin_filename_encoding);
    381382
    382     $self->set_Source_metadata($doc_obj, $filename_no_path, $filename_encoding);
     383    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
    383384   
    384385    # do we want other auto metadata here (see BasePlugin.read_into_doc_obj)
  • main/trunk/greenstone2/perllib/plugins/SplitTextFile.pm

    r23349 r23352  
    243243    my $plugin_filename_encoding = $self->{'filename_encoding'};
    244244    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    245     $self->set_Source_metadata($doc_obj, $filemeta, $filename_encoding);
     245    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
    246246
    247247    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
Note: See TracChangeset for help on using the changeset viewer.