Changeset 1894


Ignore:
Timestamp:
2001-02-01T17:43:27+13:00 (23 years ago)
Author:
jrm21
Message:

Updated read() by copying over BasPlug's new language/encoding
handling.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/SplitPlug.pm

    r1676 r1894  
    109109    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    110110   
     111    my ($language, $encoding);
     112    if ($self->{'input_encoding'} eq "auto") {
     113    # use textcat to automatically work out the input encoding and language
     114    ($language, $encoding) = $self->get_language_encoding ($filename);
     115   
     116    } elsif ($self->{'extract_language'}) {
     117    # use textcat to get language metadata
     118    ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
     119    $encoding = $self->{'input_encoding'};
     120   
     121    if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
     122        print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
     123        print $outhandle "appears to be encoded as $extracted_encoding.\n";
     124    }
     125   
     126    } else {
     127    $language = $self->{'default_language'};
     128    $encoding = $self->{'input_encoding'};
     129    }
     130
    111131    # Read in file ($text will be in utf8)
    112132    my $text = "";
    113     $self->read_file ($filename, \$text);
     133    $self->read_file ($filename, $encoding, \$text);
    114134
    115135    if ($text !~ /\w/) {
     
    119139    return 0;
    120140    }
    121 
     141   
     142   
    122143    # Split the text into several smaller segments
    123144    my $split_exp = $self->{'split_exp'};
     
    135156    # create a new document
    136157    my $doc_obj = new doc ($filename, "indexed_doc");
     158    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
     159    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
    137160
    138161    # Calculate a "base" document ID.
Note: See TracChangeset for help on using the changeset viewer.