source: main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:executable set to *
File size: 19.6 KB
Line 
1###########################################################################
2#
3# ReadTxtFile.pm -- base class for import plugins that have plain text files
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package ReadTextFile;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use Encode;
33
34use multiread;
35use encodings;
36use unicode;
37use textcat;
38use doc;
39use ghtml;
40use gsprintf 'gsprintf';
41
42use AutoExtractMetadata;
43
44sub BEGIN {
45 @ReadTextFile::ISA = ( 'AutoExtractMetadata' );
46}
47
48my $encoding_plus_auto_list =
49 [ { 'name' => "auto",
50 'desc' => "{ReadTextFile.input_encoding.auto}" } ];
51push(@{$encoding_plus_auto_list},@{$CommonUtil::encoding_list});
52
53my $arguments =
54 [ { 'name' => "input_encoding",
55 'desc' => "{ReadTextFile.input_encoding}",
56 'type' => "enum",
57 'list' => $encoding_plus_auto_list,
58 'reqd' => "no" ,
59 'deft' => "auto" } ,
60 { 'name' => "default_encoding",
61 'desc' => "{ReadTextFile.default_encoding}",
62 'type' => "enum",
63 'list' => $CommonUtil::encoding_list,
64 'reqd' => "no",
65 'deft' => "utf8" },
66 { 'name' => "extract_language",
67 'desc' => "{ReadTextFile.extract_language}",
68 'type' => "flag",
69 'reqd' => "no" },
70 { 'name' => "default_language",
71 'desc' => "{ReadTextFile.default_language}",
72 'type' => "string",
73 'deft' => "en",
74 'reqd' => "no" }
75 ];
76
77
78my $options = { 'name' => "ReadTextFile",
79 'desc' => "{ReadTextFile.desc}",
80 'abstract' => "yes",
81 'inherits' => "no",
82 'args' => $arguments };
83
84
85
86sub new {
87 my $class = shift (@_);
88 my ($pluginlist,$inputargs,$hashArgOptLists, $auxiliary) = @_;
89 push(@$pluginlist, $class);
90
91 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92 push(@{$hashArgOptLists->{"OptList"}},$options);
93
94 my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists, $auxiliary);
95
96 return bless $self, $class;
97
98}
99
100
101
102# The ReadTextFile read_into_doc_obj() function. This function does all the
103# right things to make general options work for a given plugin. It reads in
104# a file and sets up a slew of metadata all saved in doc_obj, which
105# it then returns as part of a tuple (process_status,doc_obj)
106#
107# Much of this functionality used to reside in read, but it was broken
108# down into a supporting routine to make the code more flexible.
109#
110# recursive plugins (e.g. RecPlug) and specialized plugins like those
111# capable of processing many documents within a single file (e.g.
112# GMLPlug) will normally want to implement their own version of
113# read_into_doc_obj()
114#
115# Note that $base_dir might be "" and that $file might
116# include directories
117sub read_into_doc_obj {
118 my $self = shift (@_);
119 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
120
121 my $outhandle = $self->{'outhandle'};
122 # should we move this to read? What about secondary plugins?
123 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
124 my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
125 print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
126 if $self->{'verbosity'} > 1;
127
128 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
129
130 # Do encoding stuff
131 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
132 if ($self->{'verbosity'} > 2) {
133 print $outhandle "ReadTextFile: reading $file as ($content_encoding,$language)\n";
134 }
135
136 # create a new document
137 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
138 my $top_section = $doc_obj->get_top_section();
139
140 # this should look at the plugin option too...
141 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
142 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
143
144 my $plugin_filename_encoding = $self->{'filename_encoding'};
145 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
146 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
147
148 $doc_obj->add_utf8_metadata($top_section, "Language", $language);
149 $doc_obj->add_utf8_metadata($top_section, "Encoding", $content_encoding);
150
151 # read in file ($text will be in perl internal unicode aware format)
152 my $text = "";
153 $self->read_file ($filename_full_path, $content_encoding, $language, \$text);
154
155 if (!length ($text)) {
156 if ($gli) {
157 print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
158 }
159 gsprintf($outhandle, "$self->{'plugin_type'}: {ReadTextFile.file_has_no_text}\n", $filename_full_path) if $self->{'verbosity'};
160
161 my $failhandle = $self->{'failhandle'};
162 gsprintf($failhandle, "$file: " . ref($self) . ": {ReadTextFile.empty_file}\n");
163 # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
164 $self->{'num_not_processed'} ++;
165
166 return (0,undef); # what should we return here?? error but don't want to pass it on
167 }
168
169 # do plugin specific processing of doc_obj
170 unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
171 $text = '';
172 undef $text;
173 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
174 return (-1,undef);
175 }
176 $text='';
177 undef $text;
178
179 # include any metadata passed in from previous plugins
180 # note that this metadata is associated with the top level section
181 $self->add_associated_files($doc_obj, $filename_full_path);
182 $self->extra_metadata ($doc_obj, $top_section, $metadata);
183
184 # do any automatic metadata extraction
185 $self->auto_extract_metadata ($doc_obj);
186
187
188 # if we haven't found any Title so far, assign one
189 $self->title_fallback($doc_obj,$top_section,$filename_no_path);
190
191 $self->add_OID($doc_obj);
192
193 return (1,$doc_obj);
194}
195
196# uses the multiread package to read in the entire file pointed to
197# by filename and loads the resulting text into $$textref. Input text
198# may be in any of the encodings handled by multiread, output text
199# will be in utf8
200sub read_file {
201 my $self = shift (@_);
202 my ($filename, $encoding, $language, $textref) = @_;
203
204 if (!-r $filename)
205 {
206 my $outhandle = $self->{'outhandle'};
207 gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
208 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
209 return;
210 }
211 $$textref = "";
212 if (!open (FILE, $filename)) {
213 gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
214 die "\n";
215 }
216
217 if ($encoding eq "ascii") {
218 # Replace file 'slurp' with faster implementation
219 sysread(FILE, $$textref, -s FILE);
220
221 # The old slow way of reading in a file
222 #undef $/;
223 #$$textref = <FILE>;
224 #$/ = "\n";
225 } else {
226 my $reader = new multiread();
227 $reader->set_handle ('ReadTextFile::FILE');
228 $reader->set_encoding ($encoding);
229 $reader->read_file ($textref);
230 }
231
232 # At this point $$textref is a binary byte string
233 # => turn it into a Unicode aware string, so full
234 # Unicode aware pattern matching can be used.
235 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
236 #
237
238 $$textref = decode("utf8",$$textref);
239
240 close FILE;
241}
242
243
244# Not currently used
245sub UNUSED_read_file_usingPerlsEncodeModule {
246##sub read_file {
247 my $self = shift (@_);
248 my ($filename, $encoding, $language, $textref) = @_;
249
250 if (!-r $filename)
251 {
252 my $outhandle = $self->{'outhandle'};
253 gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
254 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
255 return;
256 }
257 $$textref = "";
258 if (!open (FILE, $filename)) {
259 gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_f
260or_reading} ($!)\n", $filename);
261 die "\n";
262 }
263
264 my $store_slash = $/;
265 undef $/;
266 my $text = <FILE>;
267 $/ = $store_slash;
268
269 $$textref = decode($encoding,$text);
270
271 close FILE;
272}
273
274
275sub read_file_no_decoding {
276 my $self = shift (@_);
277 my ($filename, $textref) = @_;
278
279 if (!-r $filename)
280 {
281 my $outhandle = $self->{'outhandle'};
282 gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
283 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
284 return;
285 }
286 $$textref = "";
287 if (!open (FILE, $filename)) {
288 gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
289 die "\n";
290 }
291
292 my $reader = new multiread();
293 $reader->set_handle ('ReadTextFile::FILE');
294 $reader->read_file_no_decoding ($textref);
295
296 $self->{'reader'} = $reader;
297
298 close FILE;
299}
300
301
302sub decode_text {
303 my $self = shift (@_);
304 my ($raw_text, $encoding, $language, $textref) = @_;
305
306 my $reader = $self->{'reader'};
307 if (!defined $reader) {
308 gsprintf(STDERR, "ReadTextFile::decode_text needs to call ReadTextFile::read_file_no_decoding first\n");
309 }
310 else {
311 $reader->set_encoding($encoding);
312 $reader->decode_text($raw_text,$textref);
313
314 # At this point $$textref is a binary byte string
315 # => turn it into a Unicode aware string, so full
316 # Unicode aware pattern matching can be used.
317 # For instance: 's/\x{0101}//g' or '[[:upper:]]'
318
319 $$textref = decode("utf8",$$textref);
320 }
321}
322
323
324sub textcat_get_language_encoding {
325 my $self = shift (@_);
326 my ($filename) = @_;
327
328 my ($language, $encoding, $extracted_encoding);
329 if ($self->{'input_encoding'} eq "auto") {
330 # use textcat to automatically work out the input encoding and language
331 ($language, $encoding) = $self->get_language_encoding ($filename);
332 } elsif ($self->{'extract_language'}) {
333 # use textcat to get language metadata
334 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
335 $encoding = $self->{'input_encoding'};
336 # don't print this message for english... english in utf8 is identical
337 # to english in iso-8859-1 (except for some punctuation). We don't have
338 # a language model for en_utf8, so textcat always says iso-8859-1!
339 if ($extracted_encoding ne $encoding && $language ne "en" && $self->{'verbosity'}) {
340 my $plugin_name = ref ($self);
341 my $outhandle = $self->{'outhandle'};
342 gsprintf($outhandle, "$plugin_name: {ReadTextFile.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
343 }
344 } else {
345 $language = $self->{'default_language'};
346 $encoding = $self->{'input_encoding'};
347 }
348
349# print STDERR "**** language encoding of contents of file $filename:\n\t****$language $encoding\n";
350
351 return ($language, $encoding);
352}
353
354
355# Uses textcat to work out the encoding and language of the text in
356# $filename. All html tags are removed before processing.
357# returns an array containing "language" and "encoding"
358sub get_language_encoding {
359 my $self = shift (@_);
360 my ($filename) = @_;
361 my $outhandle = $self->{'outhandle'};
362 my $unicode_format = "";
363 my $best_language = "";
364 my $best_encoding = "";
365
366
367 # read in file
368 if (!open (FILE, $filename)) {
369 gsprintf(STDERR, "ReadTextFile::get_language_encoding {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
370 # this is a pretty bad error, but try to continue anyway
371 return ($self->{'default_language'}, $self->{'input_encoding'});
372 }
373 undef $/;
374 my $text = <FILE>;
375 $/ = "\n";
376 close FILE;
377
378 # check if first few bytes have a Byte Order Marker
379 my $bom=substr($text,0,2); # check 16bit unicode
380 if ($bom eq "\xff\xfe") { # little endian 16bit unicode
381 $unicode_format="unicode";
382 } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
383 $unicode_format="unicode";
384 } else {
385 $bom=substr($text,0,3); # check utf-8
386 if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
387 $unicode_format="utf8";
388# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
389# $unicode_format="utf8";
390 }
391 }
392
393 my $found_html_encoding = 0;
394 # handle html files specially
395 # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
396 if (ref($self) eq 'HTMLPlugin' ||
397 (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
398
399 # remove comments in head, including multiline ones, so that we don't match on
400 # inactive tags (those that are nested inside comments)
401 my ($head) = ($text =~ m/<head>(.*)<\/head>/si);
402 $head = "" unless defined $head; # some files are not proper HTML eg php files
403 $head =~ s/<!--.*?-->//sg;
404
405 # remove <title>stuff</title> -- as titles tend often to be in English
406 # for foreign language documents
407 $text =~ s!<title>.*?</title>!!si;
408
409 # see if this html file specifies its encoding
410 if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
411 $best_encoding = $1;
412 }
413 # check the meta http-equiv charset tag
414 elsif ($head =~ m/<meta http-equiv.*content-type.*charset=(.+?)\"/si) {
415 $best_encoding = $1;
416 }
417 if ($best_encoding) { # we extracted an encoding
418 $best_encoding =~ s/-+/_/g;
419 $best_encoding = lc($best_encoding); # lowercase
420 if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
421 $found_html_encoding = 1;
422 # We shouldn't be modifying this here!!
423 #$self->{'input_encoding'} = $best_encoding;
424 }
425
426 # remove all HTML tags
427 $text =~ s/<[^>]*>//sg;
428 }
429
430 # don't need to do textcat if we know the encoding now AND don't need to extract language
431 if($found_html_encoding && !$self->{'extract_language'}) { # encoding specified in html file
432 $best_language = $self->{'default_language'};
433 }
434
435 else { # need to use textcat to get either the language, or get both language and encoding
436 $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
437
438 if($found_html_encoding) { # know encoding, find language by limiting search to known encoding
439 my $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
440
441 my $language;
442 ($language) = $results->[0] =~ m/^([^-]*)(?:-(?:.*))?$/ if (scalar @$results > 0);
443
444 if (!defined $language || scalar @$results > 3) {
445 # if there were too many results even when restricting results by encoding,
446 # or if there were no results, use default language with the known encoding
447 $best_language = $self->use_default_language($filename);
448 }
449 else { # fewer than 3 results means textcat is more certain, use the first result
450 $best_language = $language;
451 }
452 }
453 else { # don't know encoding or language yet, therefore we use textcat
454 my $results = $self->{'textcat'}->classify_contents(\$text, $filename);
455
456 # if textcat returns 3 or less possibilities we'll use the first one in the list
457 if (scalar @$results <= 3) { # results will be > 0 when we don't constrain textcat by an encoding
458 my ($language, $encoding) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/;
459
460 $language = $self->use_default_language($filename) unless defined $language;
461 $encoding = $self->use_default_encoding($filename) unless defined $encoding;
462
463 $best_language = $language;
464 $best_encoding = $encoding;
465 }
466 else { # if (scalar @$results > 3) {
467 if ($unicode_format) { # in case the first had a BOM
468 $best_encoding=$unicode_format;
469 }
470 else {
471 # Find the most frequent encoding in the textcat results returned
472 # Returns "" if there's no encoding more frequent than another
473 $best_encoding = $self->{'textcat'}->most_frequent_encoding($results);
474 }
475
476 if ($best_encoding eq "") { # encoding still not set, use defaults
477 $best_language = $self->use_default_language($filename);
478 $best_encoding = $self->use_default_encoding($filename);
479 }
480 elsif (!$self->{'extract_language'}) { # know encoding but don't need to discover language
481 $best_language = $self->use_default_language($filename);
482 }
483 else { # textcat again using the most frequent encoding or the $unicode_format set above
484 $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
485 my $language;
486 ($language) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/ if (scalar @$results > 0);
487 if (!defined $language || scalar @$results > 3) {
488 # if no result or too many results, use default language for the encoding previously found
489 $best_language = $self->use_default_language($filename);
490 }
491 else { # fewer than 3 results, use the language of the first result
492 $best_language = $language;
493 }
494 }
495 }
496 }
497 }
498
499 if($best_encoding eq "" || $best_language eq "") {
500 print STDERR "****Shouldn't happen: encoding and/or language still not set. Using defaults.\n";
501 $best_encoding = $self->use_default_encoding($filename) if $best_encoding eq "";
502 $best_language = $self->use_default_language($filename) if $best_language eq "";
503 }
504# print STDERR "****Content language: $best_language; Encoding: $best_encoding.\n";
505
506
507 if ($best_encoding =~ /^iso_8859/ && &unicode::check_is_utf8($text)) {
508 # the text is valid utf8, so assume that's the real encoding
509 # (since textcat is based on probabilities)
510 $best_encoding = 'utf8';
511 }
512
513 # check for equivalents where textcat doesn't have some encodings...
514 # eg MS versions of standard encodings
515 if ($best_encoding =~ /^iso_8859_(\d+)/) {
516 my $iso = $1; # which variant of the iso standard?
517 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
518 if ($text =~ /[\x80-\x9f]/) {
519 # Western Europe
520 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
521 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
522 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
523 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
524 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
525 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
526 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
527 }
528 }
529
530 if ($best_encoding !~ /^(ascii|utf8|unicode)$/ &&
531 !defined $encodings::encodings->{$best_encoding}) {
532 if ($self->{'verbosity'}) {
533 gsprintf($outhandle, "ReadTextFile: {ReadTextFile.unsupported_encoding}\n",
534 $filename, $best_encoding, $self->{'default_encoding'});
535 }
536 $best_encoding = $self->{'default_encoding'};
537 }
538
539 return ($best_language, $best_encoding);
540}
541
542
543sub use_default_language {
544 my $self = shift (@_);
545 my ($filename) = @_;
546
547 if ($self->{'verbosity'}>2) {
548 gsprintf($self->{'outhandle'},
549 "ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
550 $filename, $self->{'default_language'});
551 }
552 return $self->{'default_language'};
553}
554
555sub use_default_encoding {
556 my $self = shift (@_);
557 my ($filename) = @_;
558
559 if ($self->{'verbosity'}>2) {
560 gsprintf($self->{'outhandle'},
561 "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n",
562 $filename, $self->{'default_encoding'});
563 }
564 return $self->{'default_encoding'};
565}
566
567# Overridden by exploding plugins (eg. ISISPlug)
568sub clean_up_after_exploding
569{
570 my $self = shift(@_);
571}
572
573
5741;
Note: See TracBrowser for help on using the repository browser.