Changeset 1894

Show
Ignore:
Timestamp:
01.02.2001 17:43:27 (19 years ago)
Author:
jrm21
Message:

Updated the read() code by copying over BasPlug's new language/encoding
handling.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/SplitPlug.pm

    r1676 r1894  
    109109    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 
    110110     
     111    my ($language, $encoding); 
     112    if ($self->{'input_encoding'} eq "auto") { 
     113    # use textcat to automatically work out the input encoding and language 
     114    ($language, $encoding) = $self->get_language_encoding ($filename); 
     115     
     116    } elsif ($self->{'extract_language'}) { 
     117    # use textcat to get language metadata 
     118    ($language, $extracted_encoding) = $self->get_language_encoding ($filename); 
     119    $encoding = $self->{'input_encoding'}; 
     120     
     121    if ($extracted_encoding ne $encoding && $self->{'verbosity'}) { 
     122        print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but "; 
     123        print $outhandle "appears to be encoded as $extracted_encoding.\n"; 
     124    } 
     125     
     126    } else { 
     127    $language = $self->{'default_language'}; 
     128    $encoding = $self->{'input_encoding'}; 
     129    } 
     130 
    111131    # Read in file ($text will be in utf8) 
    112132    my $text = ""; 
    113     $self->read_file ($filename, \$text); 
     133    $self->read_file ($filename, $encoding, \$text); 
    114134 
    115135    if ($text !~ /\w/) { 
     
    119139    return 0; 
    120140    } 
    121  
     141     
     142     
    122143    # Split the text into several smaller segments 
    123144    my $split_exp = $self->{'split_exp'}; 
     
    135156    # create a new document 
    136157    my $doc_obj = new doc ($filename, "indexed_doc"); 
     158    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); 
     159    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); 
    137160 
    138161    # Calculate a "base" document ID.