Changeset 23352

Show
Ignore:
Timestamp:
28.11.2010 23:24:22 (9 years ago)
Author:
davidb
Message:

Modifications to code to support filename encoding issues when tested under Windows

Location:
main/trunk/greenstone2/perllib/plugins
Files:
15 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

    r23347 r23352  
    565565 
    566566    elsif ($filename_encoding eq "auto-filesystem-encoding")  
    567     { 
     567    {    
    568568    # try locale 
    569569    $filename_encoding = $self->locale_encoding(); 
     
    740740 
    741741# uses locale 
    742 sub get_filesystem_encoding { 
     742sub get_filesystem_encoding  
     743{ 
    743744 
    744745    my $self = shift(@_); 
     
    748749 
    749750    eval { 
     751    # Works for Windows as well, returning the DOS code page in use  
    750752    use POSIX qw(locale_h); 
    751753     
     
    788790     
    789791    } 
     792 
    790793    return $filesystem_encoding; 
    791794} 
     
    840843     
    841844    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) { 
    842         # See if we can determine the file system encoding through locale 
    843         $deduced_filename_encoding = $self->locale_encoding(); 
    844  
    845         # if locale shows us filesystem is utf8, check to see filename is consistent 
    846         # => if not, then we have an "alien" filename on our hands 
    847  
    848         if ($deduced_filename_encoding =~ m/^utf-?8$/i) { 
    849             if (!&unicode::check_is_utf8($file)) { 
    850                 # "alien" filename, so revert 
    851                 $deduced_filename_encoding = undef; 
    852             } 
     845 
     846    # Look to file system to provide a character encoding 
     847 
     849    # If Windows NTFS, then -- assuming we work with long file names obtained through
     850    # Win32::GetLongPathName() -- the underlying file system is UTF-16
     850 
     851    if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 
     852        # Can do better than working with the DOS character encoding returned by locale      
     853        $deduced_filename_encoding = "unicode"; 
     854    } 
     855    else { 
     856        # Unix of some form or other 
     857 
     858        # See if we can determine the file system encoding through locale 
     859        $deduced_filename_encoding = $self->locale_encoding(); 
     860     
     861        # if locale shows us filesystem is utf8, check to see filename is consistent 
     862        # => if not, then we have an "alien" filename on our hands 
     863         
     864        if ($deduced_filename_encoding =~ m/^utf-?8$/i) { 
     865        if (!&unicode::check_is_utf8($file)) { 
     866            # "alien" filename, so revert 
     867            $deduced_filename_encoding = undef; 
    853868        } 
    854     } 
    855      
     869        } 
     870    } 
     871    } 
    856872     
    857873#    if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) { 
     
    884900sub set_Source_metadata { 
    885901    my $self = shift (@_);   
    886     my ($doc_obj, $raw_file, $filename_encoding) = @_; 
     902    my ($doc_obj, $raw_filename, $filename_encoding) = @_; 
    887903 
    888904    # 1. Sets the filename (Source) for display encoded as Unicode if possible, 
     
    890906    # 2. Sets the url ref (SourceFile) to the URL encoded version 
    891907    #    of filename for generated files 
     908     
     909    my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename); 
    892910 
    893911    my $top_section = $doc_obj->get_top_section(); 
    894      
     912 
     913    my $octet_file = $raw_file; 
     914 
    895915    # UTF-8 version of filename 
    896     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    897         print STDERR "****** Setting Source Metadata given: $raw_file\n"; 
    898     } 
     916    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
     917    print STDERR "****** Setting Source Metadata given: $octet_file\n"; 
     918    } 
     919     
     920    # Deal with (on Windows) raw filenames that are in their 
     921    # abbreviated DOS form 
     922 
     923    if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 
     924    if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) { 
     925        if (-e $raw_filename) { 
     926        require Win32; 
     927         
     928##      print STDERR "**** raw filename before LPN: $raw_filename\n"; 
     929        my $unicode_filename = Win32::GetLongPathName($raw_filename); 
     930         
     931        my $unused_full_uf; 
     932        ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename); 
     933 
     934##      print STDERR "**** raw filename after LPN: $raw_filename\n";         
     935        } 
     936    } 
     937    } 
    899938 
    900939    my $url_encoded_filename; 
    901940    if (defined $filename_encoding) { 
    902         # => Generate a pretty print version of filename that is mapped to Unicode 
    903          
    904         # Use filename_encoding to map raw filename to a Perl unicode-aware string  
    905         $url_encoded_filename = decode($filename_encoding,$raw_file);        
     941    # => Generate a pretty print version of filename that is mapped to Unicode 
     942     
     943    # Use filename_encoding to map raw filename to a Perl unicode-aware string  
     944    $url_encoded_filename = decode($filename_encoding,$octet_file);      
    906945    } 
    907946    else { 
    908         # otherwise generate %xx encoded version of filename for char > 127 
    909         $url_encoded_filename = &unicode::raw_filename_to_url_encoded($raw_file); 
    910     } 
    911      
    912     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    913         print STDERR "***** saving Source as:             $url_encoded_filename\n"; 
    914     } 
    915  
     947    # otherwise generate %xx encoded version of filename for char > 127 
     948    $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file); 
     949    } 
     950     
     951    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
     952    print STDERR "***** saving Source as:             $url_encoded_filename\n"; 
     953    } 
     954     
    916955     
    917956    # Source is the UTF8 display name - not necessarily the name of the file on the system 
    918957    $doc_obj->set_utf8_metadata_element($top_section, "Source", $url_encoded_filename);  
    919  
     958     
    920959    my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'}); 
    921960    # If using URL encoding, then SourceFile is the url-reference to url-encoded 
     
    926965                    $renamed_raw_url); 
    927966 
    928     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    929         print STDERR "***** saving SourceFile as:         $renamed_raw_url\n"; 
    930     } 
     967    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
     968    print STDERR "***** saving SourceFile as:         $renamed_raw_url\n"; 
     969    } 
    931970} 
    932971    
     
    9881027  
    9891028 
    990     my $plugin_filename_encoding = $self->{'filename_encoding'}; 
     1029    my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    9911030    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
    992     $self->set_Source_metadata($doc_obj,$filename_no_path,$filename_encoding); 
     1031    $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$filename_full_path); 
    9931032 
    9941033    # plugin specific stuff - what args do we need here?? 
  • main/trunk/greenstone2/perllib/plugins/CONTENTdmPlugin.pm

    r23349 r23352  
    661661    my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    662662    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
    663     $self->set_Source_metadata($doc_obj, $filemeta, $filename_encoding); 
     663    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 
    664664    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 
    665665    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path)); 
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r23349 r23352  
    399399    my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    400400    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
    401     $self->set_Source_metadata($doc_obj, $filename_no_path, $filename_encoding); 
     401    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 
    402402         
    403403    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 
  • main/trunk/greenstone2/perllib/plugins/ConvertToRogPlugin.pm

    r23349 r23352  
    347347    my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    348348    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
    349     $self->set_Source_metadata($doc_obj, $filemeta, $filename_encoding); 
     349    $self->set_Source_metadata($doc_obj, $conv_filename, $filename_encoding); 
    350350     
    351351    if ($self->{'cover_image'}) { 
  • main/trunk/greenstone2/perllib/plugins/DatabasePlugin.pm

    r23349 r23352  
    272272 
    273273    my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    274     my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
    275     $self->set_Source_metadata($doc_obj, $db, $filename_encoding); 
     274    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
     275    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 
    276276 
    277277    if ($self->{'cover_image'}) { 
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r23349 r23352  
    318318 
    319319    my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    320     my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
    321     $self->set_Source_metadata($doc_obj, $filename_no_path, $filename_encoding); 
     320    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
     321    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 
    322322    } 
    323323 
  • main/trunk/greenstone2/perllib/plugins/ImageConverter.pm

    r23349 r23352  
    152152sub generate_images { 
    153153    my $self = shift(@_); 
    154     my ($filename_full_path, $filename_no_path, $doc_obj, $section, $filename_encoding) = @_; 
     154    my ($filename_full_path, $filename_encoded_full_path, $doc_obj, $section, $filename_encoding) = @_; 
     155 
     156    my ($unused_fefp,$filename_encoded_no_path) 
     157    = util::get_full_filenames("",$filename_encoded_full_path); 
     158 
     159    # The following is potentially very muddled thinking (but currently seems to work) 
     160    # generate_images currently called from ImagePlugin and PagedImagePlugin 
     161    my $filename_no_path = $filename_encoded_no_path;  
    155162 
    156163    # check image magick status 
     
    219226#    $self->set_Source_metadata($doc_obj,$url_to_filename_no_path,undef); 
    220227 
    221     $self->set_Source_metadata($doc_obj,&unicode::url_decode($filename_no_path), 
    222                                $filename_encoding); 
     228    my $raw_filename_full_path = &unicode::url_decode($filename_encoded_full_path); 
     229    $self->set_Source_metadata($doc_obj,$raw_filename_full_path, 
     230                   $filename_encoding); 
    223231 
    224232 
  • main/trunk/greenstone2/perllib/plugins/MARCXMLPlugin.pm

    r23349 r23352  
    227227    my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    228228    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
    229     $self->set_Source_metadata($doc_obj, $filemeta, $filename_encoding); 
     229    $self->set_Source_metadata($doc_obj, $filename, $filename_encoding); 
    230230 
    231231    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}"); 
  • main/trunk/greenstone2/perllib/plugins/OAIPlugin.pm

    r23349 r23352  
    297297     
    298298    my ($filemeta) = $file =~ /([^\\\/]+)$/; 
    299     my $plugin_filename_encoding = $self->{'filename_encoding'}; 
     299    my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    300300    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
    301     $self->set_Source_metadata($doc_obj, $filemeta, $filename_encoding); 
     301    $self->set_Source_metadata($doc_obj, $filename, $filename_encoding); 
    302302 
    303303    $doc_obj->add_utf8_metadata($top_section, "Language", $language); 
  • main/trunk/greenstone2/perllib/plugins/OpenDocumentPlugin.pm

    r23349 r23352  
    268268    my $filename_encoding = $self->deduce_filename_encoding($file_only,$metadata,$plugin_filename_encoding); 
    269269 
    270     $self->set_Source_metadata($doc_obj, $file_only, $filename_encoding); 
    271      $doc_obj->set_utf8_metadata_element("", "FileSize", (-s $filename)); 
     270    $self->set_Source_metadata($doc_obj, $filename, $filename_encoding); 
     271    $doc_obj->set_utf8_metadata_element("", "FileSize", (-s $filename)); 
    272272      
    273273    # include any metadata passed in from previous plugins  
  • main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm

    r23349 r23352  
    426426    my $result = 0; 
    427427    if ($self->{'image_conversion_available'} == 1) { 
    428     # do we need to convert $filename_no_path to utf8? We are already reading in from a file, what encoding is it in??? 
    429     $result = $self->generate_images($filename_full_path, $filename_no_path, $doc_obj, $section); 
     428    # do we need to convert $filename_no_path to utf8/url encoded?  
     429    # We are already reading in from a file, what encoding is it in??? 
     430    my $url_encoded_full_filename  
     431        = &unicode::raw_filename_to_url_encoded($filename_full_path); 
     432    $result = $self->generate_images($filename_full_path, $url_encoded_full_filename, $doc_obj, $section); 
    430433    } 
    431434    #overwrite one set in ImageConverter 
     
    513516    $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'}); 
    514517    # TODO is file filename_no_path??
    515     $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'file'}, $self->{'processor'}, $self->{'metadata'}); 
     518    $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'}); 
    516519 
    517520    my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/; 
     
    540543sub set_initial_doc_fields { 
    541544    my $self = shift(@_); 
    542     my ($doc_obj, $filename_no_path, $processor, $metadata) = @_; 
     545    my ($doc_obj, $filename_full_path, $processor, $metadata) = @_; 
    543546 
    544547    my $topsection = $doc_obj->get_top_section(); 
     
    552555    } 
    553556 
    554     my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    555     my $filename_encoding = $self->deduce_filename_encoding($filename_no_path,$metadata,$plugin_filename_encoding); 
    556     $self->set_Source_metadata($doc_obj, $filename_no_path, $filename_encoding); 
     557    my $plugin_filename_encoding = $self->{'filename_encoding'}; 
     558    my $filename_encoding = $self->deduce_filename_encoding($filename_full_path,$metadata,$plugin_filename_encoding); 
     559    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 
    557560    
    558561    # if we want a header page, we need to add some text into the top section, otherwise this section will become invisible 
     
    620623 
    621624    my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'}); 
    622     $self->set_initial_doc_fields($doc_obj, $filename_no_path, $processor, $metadata); 
     625    $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata); 
    623626    my $topsection = $doc_obj->get_top_section(); 
    624627    open (ITEMFILE, $filename_full_path) || die "couldn't open $filename_full_path\n"; 
  • main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm

    r23349 r23352  
    346346    my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    347347    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
    348     $self->set_Source_metadata($doc_obj, $filename_no_path,$filename_encoding); 
     348    $self->set_Source_metadata($doc_obj, $filename_full_path,$filename_encoding); 
    349349         
    350350    $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 
  • main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm

    r23348 r23352  
    141141    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path)); 
    142142 
    143     my $plugin_filename_encoding = $self->{'filename_encoding'}; 
     143    my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    144144    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
    145     $self->set_Source_metadata($doc_obj, $filename_no_path, $filename_encoding); 
     145    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 
    146146 
    147147    $doc_obj->add_utf8_metadata($top_section, "Language", $language); 
  • main/trunk/greenstone2/perllib/plugins/ReadXMLFile.pm

    r23349 r23352  
    369369    my $self = shift(@_); 
    370370 
    371     my $metadata = $self->{'metadata'}; 
     371    my $metadata = $self->{'metadata'}; 
     372    my $filename_full_path = $self->{'filename'}; 
    372373 
    373374    # create a new document 
    374     my $doc_obj = $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'}); 
     375    my $doc_obj = $self->{'doc_obj'} = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'}); 
    375376 
    376377    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 
    377378 
    378     my $filename_no_path = $self->{'filename_no_path'}; 
    379     my $plugin_filename_encoding = $self->{'filename_encoding'}; 
     379    my $filename_no_path = $self->{'filename_no_path'}; 
     380    my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    380381    my $filename_encoding = $self->deduce_filename_encoding($filename_no_path,$metadata,$plugin_filename_encoding); 
    381382 
    382     $self->set_Source_metadata($doc_obj, $filename_no_path, $filename_encoding); 
     383    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 
    383384     
    384385    # do we want other auto metadata here (see BasePlugin.read_into_doc_obj) 
  • main/trunk/greenstone2/perllib/plugins/SplitTextFile.pm

    r23349 r23352  
    243243    my $plugin_filename_encoding = $self->{'filename_encoding'}; 
    244244    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 
    245     $self->set_Source_metadata($doc_obj, $filemeta, $filename_encoding); 
     245    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 
    246246 
    247247    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");