Changeset 14961

Show
Ignore:
Timestamp:
12.02.2008 15:17:47 (11 years ago)
Author:
davidb
Message:

Setting filename metadata (Source) in BasPlug.pm now looks to the user's locale, in the first instance, to work out which character encoding the file system uses.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/BasPlug.pm

    r13968 r14961  
    797797 
    798798 
     799sub filename_to_metadata 
     800{ 
     801    my $self = shift (@_);   
     802    my ($file, $encoding) = @_; 
     803 
     804    my $outhandle = $self->{'outhandle'}; 
     805 
     806    my $filesystem_encoding = undef; 
     807 
     808    eval { 
     809    use POSIX qw(locale_h); 
     810 
     811    # With only one parameter, setlocale retrieves the current value 
     812    my $current_locale = setlocale(LC_CTYPE); 
     813 
     814    if ($current_locale =~ m/^.*\.(.*?)$/) { 
     815        my $char_encoding = lc($1); 
     816        $char_encoding =~ s/-/_/g; 
     817        $char_encoding =~ s/^utf_8$/utf8/; 
     818 
     819        if ($char_encoding =~ m/^\d+$/) { 
     820        if (defined $encodings::encoding->{"windows_$char_encoding"}) { 
     821            $char_encoding = "windows_$char_encoding"; 
     822        } 
     823        elsif (defined $encodings::encoding->{"dos_$char_encoding"}) { 
     824            $char_encoding = "dos_$char_encoding"; 
     825        } 
     826        } 
     827 
     828        if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)  
     829        || (defined $encodings::encoding->{$char_encoding})) { 
     830        $filesystem_encoding = $char_encoding; 
     831        } 
     832        else { 
     833        print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n"; 
     834        } 
     835    } 
     836 
     837 
     838    }; 
     839    if ($@) { 
     840    print $outhandle "$@\n"; 
     841    print $outhandle "Warning: Unable to establish locale.  Will assume filesytem is UTF-8\n"; 
     842     
     843    } 
     844     
     845    my ($filemeta) = $file =~ /([^\\\/]+)$/; 
     846     
     847    # how do we know what encoding the filename is in? 
     848    # => one answer is to check the locale 
     849 
     850    if (defined $filesystem_encoding) { 
     851    if ($filesystem_encoding !~ /(?:ascii|utf8|unicode)/) { 
     852        $filemeta = unicode::unicode2utf8( 
     853            unicode::convert2unicode($filesystem_encoding, \$filemeta) 
     854              ); 
     855    } 
     856    } 
     857    # assume it is in the same encoding as its contents 
     858    elsif ((defined $encoding) && ($encoding !~ /(?:ascii|utf8|unicode)/)) { 
     859    $filemeta = unicode::unicode2utf8( 
     860        unicode::convert2unicode($encoding, \$filemeta) 
     861    ); 
     862    } 
     863     
     864    my $dmsafe_filemeta = &ghtml::dmsafe($filemeta); 
     865 
     866    return $dmsafe_filemeta; 
     867} 
     868 
    799869 
    800870# The BasPlug read_into_doc_obj() function. This function does all the 
     
    835905    # create a new document 
    836906    my $doc_obj = new doc ($filename, "indexed_doc"); 
     907    my $top_section = $doc_obj->get_top_section(); 
     908 
    837909    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'}); 
    838     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); 
    839     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); 
    840     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 
    841     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename)); 
    842  
    843     my ($filemeta) = $file =~ /([^\\\/]+)$/; 
    844     # how do we know what encoding the filename is in? 
    845     # assume it is in the same encoding as its contents 
    846     if ($encoding !~ /(?:ascii|utf8|unicode)/) { 
    847     $filemeta = unicode::unicode2utf8( 
    848         unicode::convert2unicode($encoding, \$filemeta) 
    849     ); 
    850     } 
    851     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta)); 
     910    $doc_obj->add_utf8_metadata($top_section, "Language", $language); 
     911    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding); 
     912    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}"); 
     913    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename)); 
     914 
     915    my $filemeta = $self->filename_to_metadata($file,$encoding); 
     916    $doc_obj->add_utf8_metadata($top_section, "Source", $filemeta); 
    852917    if ($self->{'cover_image'}) { 
    853918    $self->associate_cover_image($doc_obj, $filename);