Changeset 1894 for trunk/gsdl/perllib/plugins
- Timestamp:
- 2001-02-01T17:43:27+13:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/SplitPlug.pm
r1676 r1894
109 109   $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
110 110
    111 + my ($language, $encoding);
    112 + if ($self->{'input_encoding'} eq "auto") {
    113 +     # use textcat to automatically work out the input encoding and language
    114 +     ($language, $encoding) = $self->get_language_encoding ($filename);
    115 +
    116 + } elsif ($self->{'extract_language'}) {
    117 +     # use textcat to get language metadata
    118 +     ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
    119 +     $encoding = $self->{'input_encoding'};
    120 +
    121 +     if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
    122 +         print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
    123 +         print $outhandle "appears to be encoded as $extracted_encoding.\n";
    124 +     }
    125 +
    126 + } else {
    127 +     $language = $self->{'default_language'};
    128 +     $encoding = $self->{'input_encoding'};
    129 + }
    130 +
111 131   # Read in file ($text will be in utf8)
112 132   my $text = "";
113     - $self->read_file ($filename, \$text);
    133 + $self->read_file ($filename, $encoding, \$text);
114 134
115 135   if ($text !~ /\w/) {
  …   …
119 139       return 0;
120 140   }
121 141
    142 +
122 143   # Split the text into several smaller segments
123 144   my $split_exp = $self->{'split_exp'};
  …   …
135 156   # create a new document
136 157   my $doc_obj = new doc ($filename, "indexed_doc");
    158 + $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    159 + $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
137 160
138 161   # Calculate a "base" document ID.
Note:
See TracChangeset
for help on using the changeset viewer.