Changeset 14961


Ignore:
Timestamp:
2008-02-12T15:17:47+13:00 (14 years ago)
Author:
davidb
Message:

Setting the filename metadata (Source) in BasPlug.pm now looks to the user's locale, in the first instance, to determine which character encoding the file system uses.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/BasPlug.pm

    r13968 r14961  
    797797
    798798
     799sub filename_to_metadata
     800{
     801    my $self = shift (@_); 
     802    my ($file, $encoding) = @_;
     803
     804    my $outhandle = $self->{'outhandle'};
     805
     806    my $filesystem_encoding = undef;
     807
     808    eval {
     809    use POSIX qw(locale_h);
     810
     811    # With only one parameter, setlocale retrieves the current value
     812    my $current_locale = setlocale(LC_CTYPE);
     813
     814    if ($current_locale =~ m/^.*\.(.*?)$/) {
     815        my $char_encoding = lc($1);
     816        $char_encoding =~ s/-/_/g;
     817        $char_encoding =~ s/^utf_8$/utf8/;
     818
     819        if ($char_encoding =~ m/^\d+$/) {
     820        if (defined $encodings::encoding->{"windows_$char_encoding"}) {
     821            $char_encoding = "windows_$char_encoding";
     822        }
     823        elsif (defined $encodings::encoding->{"dos_$char_encoding"}) {
     824            $char_encoding = "dos_$char_encoding";
     825        }
     826        }
     827
     828        if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
     829        || (defined $encodings::encoding->{$char_encoding})) {
     830        $filesystem_encoding = $char_encoding;
     831        }
     832        else {
     833        print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
     834        }
     835    }
     836
     837
     838    };
     839    if ($@) {
     840    print $outhandle "$@\n";
     841    print $outhandle "Warning: Unable to establish locale.  Will assume filesytem is UTF-8\n";
     842   
     843    }
     844   
     845    my ($filemeta) = $file =~ /([^\\\/]+)$/;
     846   
     847    # how do we know what encoding the filename is in?
     848    # => one answer is to check the locale
     849
     850    if (defined $filesystem_encoding) {
     851    if ($filesystem_encoding !~ /(?:ascii|utf8|unicode)/) {
     852        $filemeta = unicode::unicode2utf8(
     853            unicode::convert2unicode($filesystem_encoding, \$filemeta)
     854              );
     855    }
     856    }
     857    # assume it is in the same encoding as its contents
     858    elsif ((defined $encoding) && ($encoding !~ /(?:ascii|utf8|unicode)/)) {
     859    $filemeta = unicode::unicode2utf8(
     860        unicode::convert2unicode($encoding, \$filemeta)
     861    );
     862    }
     863   
     864    my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
     865
     866    return $dmsafe_filemeta;
     867}
     868
    799869
    800870# The BasPlug read_into_doc_obj() function. This function does all the
     
    835905    # create a new document
    836906    my $doc_obj = new doc ($filename, "indexed_doc");
     907    my $top_section = $doc_obj->get_top_section();
     908
    837909    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
    838     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    839     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
    840     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    841     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
    842 
    843     my ($filemeta) = $file =~ /([^\\\/]+)$/;
    844     # how do we know what encoding the filename is in?
    845     # assume it is in the same encoding as its contents
    846     if ($encoding !~ /(?:ascii|utf8|unicode)/) {
    847     $filemeta = unicode::unicode2utf8(
    848         unicode::convert2unicode($encoding, \$filemeta)
    849     );
    850     }
    851     $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
     910    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
     911    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
     912    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
     913    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename));
     914
     915    my $filemeta = $self->filename_to_metadata($file,$encoding);
     916    $doc_obj->add_utf8_metadata($top_section, "Source", $filemeta);
    852917    if ($self->{'cover_image'}) {
    853918    $self->associate_cover_image($doc_obj, $filename);
Note: See TracChangeset for help on using the changeset viewer.