source: main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm@ 22844

Last change on this file since 22844 was 22844, checked in by davidb, 14 years ago

More explicit use of utf8 for input and output file handling. Relies on strings in Perl being Unicode aware (and not merely binary bytes) otherwise binary bytes will then be incorrectly re-incoded as UTF-8 (which is not what you want as they already are in UTF-8 form). Decoding the text read in at this point to UTF-8 means that (subsequent) regular expression processing of the data can make us Unicode aware operations, such as what Unicode classes as punctuation

  • Property svn:executable set to *
File size: 18.9 KB
RevLine 
[15868]1###########################################################################
2#
3# ReadTxtFile.pm -- base class for import plugins that have plain text files
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package ReadTextFile;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
[22844]32use Encode;
[15868]33
34use multiread;
35use encodings;
36use unicode;
37use textcat;
38use doc;
39use ghtml;
40use gsprintf 'gsprintf';
41
42use AutoExtractMetadata;
43
44sub BEGIN {
45 @ReadTextFile::ISA = ( 'AutoExtractMetadata' );
46}
47
[16014]48my $encoding_plus_auto_list =
49 [ { 'name' => "auto",
50 'desc' => "{ReadTextFile.input_encoding.auto}" } ];
51push(@{$encoding_plus_auto_list},@{$BasePlugin::encoding_list});
[15868]52
53my $arguments =
54 [ { 'name' => "input_encoding",
55 'desc' => "{ReadTextFile.input_encoding}",
56 'type' => "enum",
[16014]57 'list' => $encoding_plus_auto_list,
[15868]58 'reqd' => "no" ,
59 'deft' => "auto" } ,
60 { 'name' => "default_encoding",
61 'desc' => "{ReadTextFile.default_encoding}",
62 'type' => "enum",
63 'list' => $BasePlugin::encoding_list,
64 'reqd' => "no",
65 'deft' => "utf8" },
66 { 'name' => "extract_language",
67 'desc' => "{ReadTextFile.extract_language}",
68 'type' => "flag",
69 'reqd' => "no" },
70 { 'name' => "default_language",
71 'desc' => "{ReadTextFile.default_language}",
72 'type' => "string",
73 'deft' => "en",
[16642]74 'reqd' => "no" }
75 ];
[15868]76
77
78my $options = { 'name' => "ReadTextFile",
[16014]79 'desc' => "{ReadTextFile.desc}",
[15868]80 'abstract' => "yes",
81 'inherits' => "no",
82 'args' => $arguments };
83
84
85
86sub new {
87 my $class = shift (@_);
[16699]88 my ($pluginlist,$inputargs,$hashArgOptLists, $auxiliary) = @_;
[15868]89 push(@$pluginlist, $class);
90
91 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92 push(@{$hashArgOptLists->{"OptList"}},$options);
93
[16699]94 my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists, $auxiliary);
[15868]95
96 return bless $self, $class;
97
98}
99
100
101
102# The ReadTextFile read_into_doc_obj() function. This function does all the
103# right things to make general options work for a given plugin. It reads in
104# a file and sets up a slew of metadata all saved in doc_obj, which
105# it then returns as part of a tuple (process_status,doc_obj)
106#
107# Much of this functionality used to reside in read, but it was broken
108# down into a supporting routine to make the code more flexible.
109#
110# recursive plugins (e.g. RecPlug) and specialized plugins like those
111# capable of processing many documents within a single file (e.g.
112# GMLPlug) will normally want to implement their own version of
113# read_into_doc_obj()
114#
115# Note that $base_dir might be "" and that $file might
116# include directories
117sub read_into_doc_obj {
118 my $self = shift (@_);
[16392]119 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[15868]120
121 my $outhandle = $self->{'outhandle'};
122 # should we move this to read? What about secondary plugins?
123 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
124 print $outhandle "$self->{'plugin_type'} processing $file\n"
125 if $self->{'verbosity'} > 1;
126
[16392]127 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
128
[15868]129 # Do encoding stuff
130 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
131 if ($self->{'verbosity'} > 2) {
132 print $outhandle "ReadTextFile: reading $file as ($encoding,$language)\n";
133 }
134
135 # create a new document
[18320]136 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
[15868]137 my $top_section = $doc_obj->get_top_section();
138
139 # this should look at the plugin option too...
140 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
[16724]141 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
[15868]142 $self->set_Source_metadata($doc_obj, $filename_no_path, $encoding);
143
144 $doc_obj->add_utf8_metadata($top_section, "Language", $language);
145 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
146
147 # read in file ($text will be in utf8)
148 my $text = "";
149 $self->read_file ($filename_full_path, $encoding, $language, \$text);
150
151 if (!length ($text)) {
152 if ($gli) {
153 print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
154 }
155 gsprintf($outhandle, "$self->{'plugin_type'}: {ReadTextFile.file_has_no_text}\n", $filename_full_path) if $self->{'verbosity'};
156
157 my $failhandle = $self->{'failhandle'};
158 gsprintf($failhandle, "$file: " . ref($self) . ": {ReadTextFile.empty_file}\n");
159 # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
160 $self->{'num_not_processed'} ++;
161
162 return (0,undef); # what should we return here?? error but don't want to pass it on
163 }
164
165 # do plugin specific processing of doc_obj
166 unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
167 $text = '';
168 undef $text;
169 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
170 return (-1,undef);
171 }
172 $text='';
173 undef $text;
174
175 # include any metadata passed in from previous plugins
176 # note that this metadata is associated with the top level section
177 $self->add_associated_files($doc_obj, $filename_full_path);
178 $self->extra_metadata ($doc_obj, $top_section, $metadata);
179
180 # do any automatic metadata extraction
181 $self->auto_extract_metadata ($doc_obj);
182
183
184 # if we haven't found any Title so far, assign one
185 $self->title_fallback($doc_obj,$top_section,$filename_no_path);
186
187 $self->add_OID($doc_obj);
188
189 return (1,$doc_obj);
190}
191
192# uses the multiread package to read in the entire file pointed to
193# by filename and loads the resulting text into $$textref. Input text
194# may be in any of the encodings handled by multiread, output text
195# will be in utf8
196sub read_file {
197 my $self = shift (@_);
198 my ($filename, $encoding, $language, $textref) = @_;
199
200 if (!-r $filename)
201 {
202 my $outhandle = $self->{'outhandle'};
203 gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
204 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
205 return;
206 }
207 $$textref = "";
208 if (!open (FILE, $filename)) {
209 gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
210 die "\n";
211 }
212
213 if ($encoding eq "ascii") {
214 undef $/;
215 $$textref = <FILE>;
216 $/ = "\n";
217 } else {
218 my $reader = new multiread();
219 $reader->set_handle ('ReadTextFile::FILE');
220 $reader->set_encoding ($encoding);
221 $reader->read_file ($textref);
222 }
[22844]223
224 # At this point $$testref is a binary byte string
225 # => turn it into a Unicode aware string, so full
226 # Unicode aware pattern matching can be used.
227 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
228 #
229
230 $$textref = decode("utf8",$$textref);
231
[15868]232 close FILE;
233}
234
235
[22844]236# Not currently used
237sub read_file_usingPerlsEncodeModule {
238 my $self = shift (@_);
239 my ($filename, $encoding, $language, $textref) = @_;
240
241 if (!-r $filename)
242 {
243 my $outhandle = $self->{'outhandle'};
244 gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
245 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
246 return;
247 }
248 $$textref = "";
249 if (!open (FILE, $filename)) {
250 gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_f
251or_reading} ($!)\n", $filename);
252 die "\n";
253 }
254
255 my $store_slash = $/;
256 undef $/;
257 my $text = <FILE>;
258 $/ = $store_slash;
259
260 $$textref = decode($encoding,$text);
261
262 close FILE;
263}
264
265
[16724]266sub read_file_no_decoding {
267 my $self = shift (@_);
268 my ($filename, $textref) = @_;
269
270 if (!-r $filename)
271 {
272 my $outhandle = $self->{'outhandle'};
273 gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
274 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
275 return;
276 }
277 $$textref = "";
278 if (!open (FILE, $filename)) {
279 gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
280 die "\n";
281 }
282
283 my $reader = new multiread();
284 $reader->set_handle ('ReadTextFile::FILE');
285 $reader->read_file_no_decoding ($textref);
286
287 $self->{'reader'} = $reader;
288
289 close FILE;
290}
291
292
293sub decode_text {
294 my $self = shift (@_);
295 my ($raw_text, $encoding, $language, $textref) = @_;
296
297 my $reader = $self->{'reader'};
298 if (!defined $reader) {
299 gsprintf(STDERR, "ReadTextFile::decode_text needs to call ReadTextFile::read_file_no_decoding first\n");
300 }
301 else {
302 $reader->set_encoding($encoding);
303 $reader->decode_text($raw_text,$textref);
304 }
305}
306
307
[15868]308sub textcat_get_language_encoding {
309 my $self = shift (@_);
310 my ($filename) = @_;
311
312 my ($language, $encoding, $extracted_encoding);
313 if ($self->{'input_encoding'} eq "auto") {
314 # use textcat to automatically work out the input encoding and language
315 ($language, $encoding) = $self->get_language_encoding ($filename);
316 } elsif ($self->{'extract_language'}) {
[16724]317 # use textcat to get language metadata
[15868]318 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
319 $encoding = $self->{'input_encoding'};
320 # don't print this message for english... english in utf8 is identical
321 # to english in iso-8859-1 (except for some punctuation). We don't have
322 # a language model for en_utf8, so textcat always says iso-8859-1!
[16724]323 if ($extracted_encoding ne $encoding && $language ne "en" && $self->{'verbosity'}) {
[15868]324 my $plugin_name = ref ($self);
325 my $outhandle = $self->{'outhandle'};
326 gsprintf($outhandle, "$plugin_name: {ReadTextFile.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
327 }
328 } else {
329 $language = $self->{'default_language'};
[16724]330 $encoding = $self->{'input_encoding'};
[15868]331 }
[16724]332
333# print STDERR "**** language encoding of contents of file $filename:\n\t****$language $encoding\n";
[15868]334
335 return ($language, $encoding);
336}
337
[17213]338
[15868]339# Uses textcat to work out the encoding and language of the text in
340# $filename. All html tags are removed before processing.
341# returns an array containing "language" and "encoding"
342sub get_language_encoding {
343 my $self = shift (@_);
344 my ($filename) = @_;
345 my $outhandle = $self->{'outhandle'};
346 my $unicode_format = "";
347 my $best_language = "";
348 my $best_encoding = "";
349
[17213]350
[15868]351 # read in file
352 if (!open (FILE, $filename)) {
353 gsprintf(STDERR, "ReadTextFile::get_language_encoding {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
354 # this is a pretty bad error, but try to continue anyway
355 return ($self->{'default_language'}, $self->{'input_encoding'});
356 }
357 undef $/;
358 my $text = <FILE>;
359 $/ = "\n";
360 close FILE;
361
362 # check if first few bytes have a Byte Order Marker
363 my $bom=substr($text,0,2); # check 16bit unicode
364 if ($bom eq "\xff\xfe") { # little endian 16bit unicode
365 $unicode_format="unicode";
366 } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
367 $unicode_format="unicode";
368 } else {
369 $bom=substr($text,0,3); # check utf-8
370 if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
371 $unicode_format="utf8";
372# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
373# $unicode_format="utf8";
374 }
375 }
376
[16667]377 my $found_html_encoding = 0;
[15868]378 # handle html files specially
379 # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
[15970]380 if (ref($self) eq 'HTMLPlugin' ||
[15868]381 (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
382
[16765]383 # remove comments in head, including multiline ones, so that we don't match on
[16753]384 # inactive tags (those that are nested inside comments)
[16765]385 my ($head) = ($text =~ m/<head>(.*)<\/head>/si);
[17099]386 $head = "" unless defined $head; # some files are not proper HTML eg php files
[16765]387 $head =~ s/<!--.*?-->//sg;
[16753]388
[15868]389 # remove <title>stuff</title> -- as titles tend often to be in English
390 # for foreign language documents
391 $text =~ s!<title>.*?</title>!!si;
392
393 # see if this html file specifies its encoding
394 if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
395 $best_encoding = $1;
[16724]396 }
[16765]397 # check the meta http-equiv charset tag
398 elsif ($head =~ m/<meta http-equiv.*content-type.*charset=(.+?)\"/si) {
[15868]399 $best_encoding = $1;
400 }
401 if ($best_encoding) { # we extracted an encoding
402 $best_encoding =~ s/-+/_/g;
403 $best_encoding = lc($best_encoding); # lowercase
404 if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
[16667]405 $found_html_encoding = 1;
406 # We shouldn't be modifying this here!!
407 #$self->{'input_encoding'} = $best_encoding;
[15868]408 }
409
410 # remove all HTML tags
411 $text =~ s/<[^>]*>//sg;
412 }
413
[17213]414 # don't need to do textcat if we know the encoding now AND don't need to extract language
415 if($found_html_encoding && !$self->{'extract_language'}) { # encoding specified in html file
416 $best_language = $self->{'default_language'};
417 }
[15868]418
[17213]419 else { # need to use textcat to get either the language, or get both language and encoding
420 $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
421
422 if($found_html_encoding) { # know encoding, find language by limiting search to known encoding
423 my $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
424
425 my $language;
426 ($language) = $results->[0] =~ m/^([^-]*)(?:-(?:.*))?$/ if (scalar @$results > 0);
[15868]427
[17213]428 if (!defined $language || scalar @$results > 3) {
429 # if there were too many results even when restricting results by encoding,
430 # or if there were no results, use default language with the known encoding
431 $best_language = $self->use_default_language($filename);
432 }
433 else { # fewer than 3 results means textcat is more certain, use the first result
434 $best_language = $language;
435 }
[15868]436 }
[17213]437 else { # don't know encoding or language yet, therefore we use textcat
438 my $results = $self->{'textcat'}->classify_contents(\$text, $filename);
439
440 # if textcat returns 3 or less possibilities we'll use the first one in the list
441 if (scalar @$results <= 3) { # results will be > 0 when we don't constrain textcat by an encoding
442 my ($language, $encoding) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/;
[15868]443
[17213]444 $language = $self->use_default_language($filename) unless defined $language;
445 $encoding = $self->use_default_encoding($filename) unless defined $encoding;
[15868]446
[17213]447 $best_language = $language;
448 $best_encoding = $encoding;
[15868]449 }
[17213]450 else { # if (scalar @$results > 3) {
451 if ($unicode_format) { # in case the first had a BOM
452 $best_encoding=$unicode_format;
453 }
454 else {
455 # Find the most frequent encoding in the textcat results returned
456 # Returns "" if there's no encoding more frequent than another
457 $best_encoding = $self->{'textcat'}->most_frequent_encoding($results);
458 }
459
460 if ($best_encoding eq "") { # encoding still not set, use defaults
461 $best_language = $self->use_default_language($filename);
462 $best_encoding = $self->use_default_encoding($filename);
463 }
464 elsif (!$self->{'extract_language'}) { # know encoding but don't need to discover language
465 $best_language = $self->use_default_language($filename);
466 }
467 else { # textcat again using the most frequent encoding or the $unicode_format set above
468 $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
469 my $language;
470 ($language) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/ if (scalar @$results > 0);
471 if (!defined $language || scalar @$results > 3) {
472 # if no result or too many results, use default language for the encoding previously found
473 $best_language = $self->use_default_language($filename);
474 }
475 else { # fewer than 3 results, use the language of the first result
476 $best_language = $language;
477 }
478 }
[15868]479 }
480 }
481 }
482
[17213]483 if($best_encoding eq "" || $best_language eq "") {
484 print STDERR "****Shouldn't happen: encoding and/or language still not set. Using defaults.\n";
485 $best_encoding = $self->use_default_encoding($filename) if $best_encoding eq "";
486 $best_language = $self->use_default_language($filename) if $best_language eq "";
487 }
488# print STDERR "****Content language: $best_language; Encoding: $best_encoding.\n";
489
490
[16555]491 if ($best_encoding =~ /^iso_8859/ && &unicode::check_is_utf8($text)) {
[15868]492 # the text is valid utf8, so assume that's the real encoding
493 # (since textcat is based on probabilities)
494 $best_encoding = 'utf8';
495 }
496
497 # check for equivalents where textcat doesn't have some encodings...
498 # eg MS versions of standard encodings
499 if ($best_encoding =~ /^iso_8859_(\d+)/) {
500 my $iso = $1; # which variant of the iso standard?
501 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
502 if ($text =~ /[\x80-\x9f]/) {
503 # Western Europe
504 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
505 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
506 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
507 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
508 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
509 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
510 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
511 }
512 }
513
514 if ($best_encoding !~ /^(ascii|utf8|unicode)$/ &&
515 !defined $encodings::encodings->{$best_encoding}) {
516 if ($self->{'verbosity'}) {
517 gsprintf($outhandle, "ReadTextFile: {ReadTextFile.unsupported_encoding}\n",
518 $filename, $best_encoding, $self->{'default_encoding'});
519 }
520 $best_encoding = $self->{'default_encoding'};
521 }
522
523 return ($best_language, $best_encoding);
524}
525
526
[17213]527sub use_default_language {
528 my $self = shift (@_);
529 my ($filename) = @_;
[15868]530
[17213]531 if ($self->{'verbosity'}>2) {
532 gsprintf($self->{'outhandle'},
533 "ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
534 $filename, $self->{'default_language'});
535 }
536 return $self->{'default_language'};
537}
538
539sub use_default_encoding {
540 my $self = shift (@_);
541 my ($filename) = @_;
542
543 if ($self->{'verbosity'}>2) {
544 gsprintf($self->{'outhandle'},
545 "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n",
546 $filename, $self->{'default_encoding'});
547 }
548 return $self->{'default_encoding'};
549}
550
[15868]551# Overridden by exploding plugins (eg. ISISPlug)
552sub clean_up_after_exploding
553{
554 my $self = shift(@_);
555}
556
557
5581;
Note: See TracBrowser for help on using the repository browser.