root/main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm @ 23387

Revision 23387, 19.5 KB (checked in by davidb, 9 years ago)

Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document look up. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF8 encoded). Storing of filenames seen by HTMLPlug when scanning for files to block on is now done in Unicode aware strings rather than utf8 but unware strings.

  • Property svn:executable set to *
Line 
1###########################################################################
2#
3# ReadTxtFile.pm -- base class for import plugins that have plain text files
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package ReadTextFile;
27
28use strict;
29no strict 'subs';
30no strict 'refs'; # allow filehandles to be variables and viceversa
31
32use Encode;
33
34use multiread;
35use encodings;
36use unicode;
37use textcat;
38use doc;
39use ghtml;
40use gsprintf 'gsprintf';
41
42use AutoExtractMetadata;
43
44sub BEGIN {
45    @ReadTextFile::ISA = ( 'AutoExtractMetadata' );
46}
47
48my $encoding_plus_auto_list =
49    [ { 'name' => "auto",
50    'desc' => "{ReadTextFile.input_encoding.auto}" } ];
51push(@{$encoding_plus_auto_list},@{$BasePlugin::encoding_list});
52
53my $arguments =
54    [ { 'name' => "input_encoding",
55    'desc' => "{ReadTextFile.input_encoding}",
56    'type' => "enum",
57    'list' => $encoding_plus_auto_list,
58    'reqd' => "no" ,
59    'deft' => "auto" } ,
60      { 'name' => "default_encoding",
61    'desc' => "{ReadTextFile.default_encoding}",
62    'type' => "enum",
63    'list' => $BasePlugin::encoding_list,
64    'reqd' => "no",
65        'deft' => "utf8" },
66      { 'name' => "extract_language",
67    'desc' => "{ReadTextFile.extract_language}",
68    'type' => "flag",
69    'reqd' => "no" },
70      { 'name' => "default_language",
71    'desc' => "{ReadTextFile.default_language}",
72    'type' => "string",
73    'deft' => "en",
74    'reqd' => "no" }
75      ];
76
77
78my $options = { 'name'     => "ReadTextFile",
79        'desc'     => "{ReadTextFile.desc}",
80        'abstract' => "yes",
81        'inherits' => "no",
82        'args'     => $arguments };
83
84
85
86sub new {
87    my $class = shift (@_);
88    my ($pluginlist,$inputargs,$hashArgOptLists, $auxiliary) = @_;
89    push(@$pluginlist, $class);
90
91    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92    push(@{$hashArgOptLists->{"OptList"}},$options);
93
94    my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists, $auxiliary);
95
96    return bless $self, $class;
97   
98}
99
100
101
102# The ReadTextFile read_into_doc_obj() function. This function does all the
103# right things to make general options work for a given plugin.  It reads in
104# a file and sets up a slew of metadata all saved in doc_obj, which
105# it then returns as part of a tuple (process_status,doc_obj)
106#
107# Much of this functionality used to reside in read, but it was broken
108# down into a supporting routine to make the code more flexible. 
109#
110# recursive plugins (e.g. RecPlug) and specialized plugins like those
111# capable of processing many documents within a single file (e.g.
112# GMLPlug) will normally want to implement their own version of
113# read_into_doc_obj()
114#
115# Note that $base_dir might be "" and that $file might
116# include directories
117sub read_into_doc_obj {
118    my $self = shift (@_); 
119    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
120
121    my $outhandle = $self->{'outhandle'};
122    # should we move this to read? What about secondary plugins?
123    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
124    my $pp_file = &util::prettyprint_file($base_dir,$file);
125    print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
126        if $self->{'verbosity'} > 1;
127
128    my ($filename_full_path, $filename_no_path) =  &util::get_full_filenames($base_dir, $file);
129
130    # Do encoding stuff
131    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
132    if ($self->{'verbosity'} > 2) {
133    print $outhandle "ReadTextFile: reading $file as ($content_encoding,$language)\n";
134    }
135
136    # create a new document
137    my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
138    my $top_section = $doc_obj->get_top_section();
139
140    # this should look at the plugin option too...
141    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
142    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
143
144    my $plugin_filename_encoding = $self->{'filename_encoding'};
145    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
146    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
147
148    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
149    $doc_obj->add_utf8_metadata($top_section, "Encoding", $content_encoding);
150   
151    # read in file ($text will be in utf8)
152    my $text = "";
153    $self->read_file ($filename_full_path, $content_encoding, $language, \$text);
154
155    if (!length ($text)) {
156    if ($gli) {
157        print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
158    }
159    gsprintf($outhandle, "$self->{'plugin_type'}: {ReadTextFile.file_has_no_text}\n", $filename_full_path) if $self->{'verbosity'};
160
161    my $failhandle = $self->{'failhandle'};
162    gsprintf($failhandle, "$file: " . ref($self) . ": {ReadTextFile.empty_file}\n");
163    # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
164    $self->{'num_not_processed'} ++;
165
166    return (0,undef); # what should we return here?? error but don't want to pass it on
167    }
168   
169    # do plugin specific processing of doc_obj
170    unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
171    $text = '';
172    undef $text;
173    print STDERR "<ProcessingError n='$file'>\n" if ($gli);
174    return (-1,undef);
175    }
176    $text='';
177    undef $text;
178   
179    # include any metadata passed in from previous plugins
180    # note that this metadata is associated with the top level section
181    $self->add_associated_files($doc_obj, $filename_full_path);
182    $self->extra_metadata ($doc_obj, $top_section, $metadata);
183
184     # do any automatic metadata extraction
185    $self->auto_extract_metadata ($doc_obj);
186
187
188    # if we haven't found any Title so far, assign one
189    $self->title_fallback($doc_obj,$top_section,$filename_no_path);
190
191    $self->add_OID($doc_obj);
192   
193    return (1,$doc_obj);
194}
195
196# uses the multiread package to read in the entire file pointed to
197# by filename and loads the resulting text into $$textref. Input text
198# may be in any of the encodings handled by multiread, output text
199# will be in utf8
200sub read_file {
201    my $self = shift (@_);
202    my ($filename, $encoding, $language, $textref) = @_;
203
204    if (!-r $filename)
205    {
206    my $outhandle = $self->{'outhandle'};
207    gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
208    # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
209    return;
210    }
211    $$textref = "";
212    if (!open (FILE, $filename)) {
213    gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
214    die "\n";
215    }
216     
217    if ($encoding eq "ascii") {
218    undef $/;
219    $$textref = <FILE>;
220    $/ = "\n";
221    } else {
222    my $reader = new multiread();
223    $reader->set_handle ('ReadTextFile::FILE');
224    $reader->set_encoding ($encoding);
225    $reader->read_file ($textref);
226    }
227
228    # At this point $$textref is a binary byte string
229    # => turn it into a Unicode aware string, so full
230    # Unicode aware pattern matching can be used.
231    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
232    #
233
234    $$textref = decode("utf8",$$textref);
235
236    close FILE;
237}
238
239
240# Not currently used
241sub UNUSED_read_file_usingPerlsEncodeModule {
242##sub read_file {
243    my $self = shift (@_);
244    my ($filename, $encoding, $language, $textref) = @_;
245
246    if (!-r $filename)
247    {
248        my $outhandle = $self->{'outhandle'};
249        gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
250        # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
251        return;
252    }
253    $$textref = "";
254    if (!open (FILE, $filename)) {
255        gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_f
256or_reading} ($!)\n", $filename);
257        die "\n";
258    }
259
260    my $store_slash = $/;
261    undef $/;
262    my $text = <FILE>;
263    $/ = $store_slash;
264
265    $$textref = decode($encoding,$text);
266
267    close FILE;
268}
269
270
271sub read_file_no_decoding {
272    my $self = shift (@_);
273    my ($filename, $textref) = @_;
274
275    if (!-r $filename)
276    {
277    my $outhandle = $self->{'outhandle'};
278    gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
279    # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
280    return;
281    }
282    $$textref = "";
283    if (!open (FILE, $filename)) {
284    gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
285    die "\n";
286    }
287     
288    my $reader = new multiread();
289    $reader->set_handle ('ReadTextFile::FILE');
290    $reader->read_file_no_decoding ($textref);
291   
292    $self->{'reader'} = $reader;
293
294    close FILE;
295}
296
297
298sub decode_text {
299    my $self = shift (@_);
300    my ($raw_text, $encoding, $language, $textref) = @_;
301
302    my $reader = $self->{'reader'};
303    if (!defined $reader) {
304    gsprintf(STDERR, "ReadTextFile::decode_text needs to call ReadTextFile::read_file_no_decoding first\n");
305    }
306    else {
307    $reader->set_encoding($encoding);
308    $reader->decode_text($raw_text,$textref);
309
310    # At this point $$textref is a binary byte string
311    # => turn it into a Unicode aware string, so full
312    # Unicode aware pattern matching can be used.
313    # For instance: 's/\x{0101}//g' or '[[:upper:]]'   
314   
315    $$textref = decode("utf8",$$textref);
316    }
317}
318
319
320sub textcat_get_language_encoding {
321    my $self = shift (@_);
322    my ($filename) = @_;
323
324    my ($language, $encoding, $extracted_encoding);
325    if ($self->{'input_encoding'} eq "auto") {
326        # use textcat to automatically work out the input encoding and language
327        ($language, $encoding) = $self->get_language_encoding ($filename);
328    } elsif ($self->{'extract_language'}) {
329    # use textcat to get language metadata
330        ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
331        $encoding = $self->{'input_encoding'};
332    # don't print this message for english... english in utf8 is identical
333    # to english in iso-8859-1 (except for some punctuation). We don't have
334    # a language model for en_utf8, so textcat always says iso-8859-1!
335        if ($extracted_encoding ne $encoding && $language ne "en" && $self->{'verbosity'}) {
336        my $plugin_name = ref ($self);
337        my $outhandle = $self->{'outhandle'};
338        gsprintf($outhandle, "$plugin_name: {ReadTextFile.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
339        }
340    } else {
341        $language = $self->{'default_language'};
342        $encoding = $self->{'input_encoding'};
343    }
344   
345#    print STDERR "**** language encoding of contents of file $filename:\n\t****$language $encoding\n";
346
347    return ($language, $encoding);
348}
349
350
351# Uses textcat to work out the encoding and language of the text in
352# $filename. All html tags are removed before processing.
353# returns an array containing "language" and "encoding"
354sub get_language_encoding {
355    my $self = shift (@_);
356    my ($filename) = @_;
357    my $outhandle = $self->{'outhandle'};
358    my $unicode_format = "";
359    my $best_language = "";
360    my $best_encoding = "";
361   
362
363    # read in file
364    if (!open (FILE, $filename)) {
365    gsprintf(STDERR, "ReadTextFile::get_language_encoding {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
366    # this is a pretty bad error, but try to continue anyway
367    return ($self->{'default_language'}, $self->{'input_encoding'});
368    }
369    undef $/;
370    my $text = <FILE>;
371    $/ = "\n";
372    close FILE;
373
374    # check if first few bytes have a Byte Order Marker
375    my $bom=substr($text,0,2); # check 16bit unicode
376    if ($bom eq "\xff\xfe") { # little endian 16bit unicode
377    $unicode_format="unicode";
378    } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
379    $unicode_format="unicode";
380    } else {
381    $bom=substr($text,0,3); # check utf-8
382    if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
383        $unicode_format="utf8";
384#   } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
385#       $unicode_format="utf8";
386    }
387    }
388   
389    my $found_html_encoding = 0;
390    # handle html files specially
391    # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
392    if (ref($self) eq 'HTMLPlugin' ||
393    (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
394
395    # remove comments in head, including multiline ones, so that we don't match on
396    # inactive tags (those that are nested inside comments)
397    my ($head) = ($text =~ m/<head>(.*)<\/head>/si);
398    $head = "" unless defined $head; # some files are not proper HTML eg php files
399    $head =~ s/<!--.*?-->//sg;
400
401    # remove <title>stuff</title> -- as titles tend often to be in English
402    # for foreign language documents
403    $text =~ s!<title>.*?</title>!!si;
404
405    # see if this html file specifies its encoding
406    if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
407        $best_encoding = $1;
408    }
409    # check the meta http-equiv charset tag
410    elsif ($head =~ m/<meta http-equiv.*content-type.*charset=(.+?)\"/si) {           
411        $best_encoding = $1;
412    }
413    if ($best_encoding) { # we extracted an encoding
414        $best_encoding =~ s/-+/_/g;
415        $best_encoding = lc($best_encoding); # lowercase
416        if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
417        $found_html_encoding = 1;
418        # We shouldn't be modifying this here!!
419        #$self->{'input_encoding'} = $best_encoding;
420    }
421       
422    # remove all HTML tags
423    $text =~ s/<[^>]*>//sg;
424    }
425
426    # don't need to do textcat if we know the encoding now AND don't need to extract language
427    if($found_html_encoding && !$self->{'extract_language'}) { # encoding specified in html file
428    $best_language = $self->{'default_language'};
429    }
430
431    else { # need to use textcat to get either the language, or get both language and encoding
432    $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
433   
434    if($found_html_encoding) { # know encoding, find language by limiting search to known encoding
435        my $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
436       
437        my $language;
438        ($language) = $results->[0] =~ m/^([^-]*)(?:-(?:.*))?$/ if (scalar @$results > 0);
439
440        if (!defined $language || scalar @$results > 3) {
441        # if there were too many results even when restricting results by encoding,
442        # or if there were no results, use default language with the known encoding
443        $best_language = $self->use_default_language($filename);
444        }
445        else { # fewer than 3 results means textcat is more certain, use the first result
446        $best_language = $language;     
447        }
448    }
449    else { # don't know encoding or language yet, therefore we use textcat
450        my $results = $self->{'textcat'}->classify_contents(\$text, $filename);
451       
452        # if textcat returns 3 or less possibilities we'll use the first one in the list
453        if (scalar @$results <= 3) { # results will be > 0 when we don't constrain textcat by an encoding
454        my ($language, $encoding) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/;
455
456        $language = $self->use_default_language($filename) unless defined $language;
457        $encoding = $self->use_default_encoding($filename) unless defined $encoding;
458
459        $best_language = $language;
460        $best_encoding = $encoding;
461        }
462        else { # if (scalar @$results > 3) {
463        if ($unicode_format) { # in case the first had a BOM
464            $best_encoding=$unicode_format;
465        }
466        else {
467            # Find the most frequent encoding in the textcat results returned
468            # Returns "" if there's no encoding more frequent than another
469            $best_encoding = $self->{'textcat'}->most_frequent_encoding($results);
470        }
471       
472        if ($best_encoding eq "") { # encoding still not set, use defaults
473            $best_language = $self->use_default_language($filename);
474            $best_encoding = $self->use_default_encoding($filename);
475        }
476        elsif (!$self->{'extract_language'}) { # know encoding but don't need to discover language
477            $best_language = $self->use_default_language($filename);
478        }
479        else { # textcat again using the most frequent encoding or the $unicode_format set above
480            $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
481            my $language;
482            ($language) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/ if (scalar @$results > 0);
483            if (!defined $language || scalar @$results > 3) {
484            # if no result or too many results, use default language for the encoding previously found
485            $best_language = $self->use_default_language($filename);
486            }
487            else { # fewer than 3 results, use the language of the first result
488            $best_language = $language;
489            }
490        }
491        }
492    }
493    }
494
495    if($best_encoding eq "" || $best_language eq "") {
496    print STDERR "****Shouldn't happen: encoding and/or language still not set. Using defaults.\n";
497    $best_encoding = $self->use_default_encoding($filename) if $best_encoding eq "";
498    $best_language = $self->use_default_language($filename) if $best_language eq "";
499    }
500#    print STDERR "****Content language: $best_language; Encoding: $best_encoding.\n";
501
502
503    if ($best_encoding =~ /^iso_8859/ && &unicode::check_is_utf8($text)) {
504    # the text is valid utf8, so assume that's the real encoding
505    # (since textcat is based on probabilities)
506    $best_encoding = 'utf8';
507    }
508
509    # check for equivalents where textcat doesn't have some encodings...
510    # eg MS versions of standard encodings
511    if ($best_encoding =~ /^iso_8859_(\d+)/) {
512    my $iso = $1; # which variant of the iso standard?
513    # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
514    if ($text =~ /[\x80-\x9f]/) {
515        # Western Europe
516        if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
517        elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
518        elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
519        elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
520        elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
521        elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
522        elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
523    }
524    }
525
526    if ($best_encoding !~ /^(ascii|utf8|unicode)$/ &&
527    !defined $encodings::encodings->{$best_encoding}) {
528    if ($self->{'verbosity'}) {
529        gsprintf($outhandle, "ReadTextFile: {ReadTextFile.unsupported_encoding}\n",
530             $filename, $best_encoding, $self->{'default_encoding'});
531    }
532    $best_encoding = $self->{'default_encoding'};
533    }
534
535    return ($best_language, $best_encoding);
536}
537
538
539sub use_default_language {
540    my $self = shift (@_);
541    my ($filename) = @_;
542
543    if ($self->{'verbosity'}>2) {
544    gsprintf($self->{'outhandle'},
545         "ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
546         $filename, $self->{'default_language'});
547    }
548    return $self->{'default_language'};
549}
550
551sub use_default_encoding {
552    my $self = shift (@_);
553    my ($filename) = @_;
554
555    if ($self->{'verbosity'}>2) {
556    gsprintf($self->{'outhandle'},
557         "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n",
558         $filename, $self->{'default_encoding'});
559    }
560    return $self->{'default_encoding'};
561}
562
563# Overridden by exploding plugins (eg. ISISPlug)
564sub clean_up_after_exploding
565{
566    my $self = shift(@_);
567}
568
569
5701;
Note: See TracBrowser for help on using the browser.