Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm@ 23363

Last change on this file since 23363 was 23363, checked in by davidb, 13 years ago
Plugin code upgrade to support Greenstone working with filenames under Windows when they go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)
Property svn:executable set to ``*
File size: 19.2 KB

Line
1	###########################################################################
2	#
3	# ReadTextFile.pm -- base class for import plugins that have plain text files
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package ReadTextFile;
27
28	use strict;
29	no strict 'subs';
30	no strict 'refs'; # allow filehandles to be variables and viceversa
31
32	use Encode;
33
34	use multiread;
35	use encodings;
36	use unicode;
37	use textcat;
38	use doc;
39	use ghtml;
40	use gsprintf 'gsprintf';
41
42	use AutoExtractMetadata;
43
# Establish the inheritance chain at compile time: ReadTextFile is an
# AutoExtractMetadata plugin.
BEGIN {
    @ReadTextFile::ISA = qw(AutoExtractMetadata);
}
47
# Choices offered for the input_encoding option: the special value "auto"
# followed by every entry of $BasePlugin::encoding_list.
my $encoding_plus_auto_list = [
    {
        'name' => "auto",
        'desc' => "{ReadTextFile.input_encoding.auto}",
    },
    @{$BasePlugin::encoding_list},
];

# Command-line/configuration arguments understood by this plugin.
my $arguments = [
    {
        'name' => "input_encoding",
        'desc' => "{ReadTextFile.input_encoding}",
        'type' => "enum",
        'list' => $encoding_plus_auto_list,
        'reqd' => "no",
        'deft' => "auto",
    },
    {
        'name' => "default_encoding",
        'desc' => "{ReadTextFile.default_encoding}",
        'type' => "enum",
        'list' => $BasePlugin::encoding_list,
        'reqd' => "no",
        'deft' => "utf8",
    },
    {
        'name' => "extract_language",
        'desc' => "{ReadTextFile.extract_language}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "default_language",
        'desc' => "{ReadTextFile.default_language}",
        'type' => "string",
        'deft' => "en",
        'reqd' => "no",
    },
];

# Plugin description record that new() appends to the caller's "OptList".
my $options = {
    'name'     => "ReadTextFile",
    'desc'     => "{ReadTextFile.desc}",
    'abstract' => "yes",
    'inherits' => "no",
    'args'     => $arguments,
};
83
84
85
# Constructor.
#
# Records $class in @$pluginlist, appends this plugin's argument and option
# descriptions to the "ArgList" and "OptList" entries of $hashArgOptLists,
# builds the object through the AutoExtractMetadata constructor and
# re-blesses the result into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists, $auxiliary) = @_;

    push(@{$pluginlist}, $class);

    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    my $self = AutoExtractMetadata->new($pluginlist, $inputargs,
                                        $hashArgOptLists, $auxiliary);

    return bless $self, $class;
}
99
100
101
# read_into_doc_obj() -- read one text file into a new doc object.
#
# Works out the language and encoding of the file, creates the doc object,
# attaches the Plugin, FileSize, Source, Language and Encoding metadata,
# reads the file contents and hands them to the plugin's process() method,
# then adds associated files, inherited metadata, automatically extracted
# metadata, a fallback title and an OID.
#
# Returns a (process_status, doc_obj) pair:
#   (1, $doc_obj)  the file was processed
#   (0, undef)     the file contained no text
#   (-1, undef)    process() returned undef
#
# Recursive plugins (e.g. RecPlug) and plugins that produce many documents
# from one file (e.g. GMLPlug) will normally provide their own version.
#
# Note that $base_dir might be "" and that $file might include directories.
sub read_into_doc_obj {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    # should we move this to read? What about secondary plugins?
    if ($gli) {
        print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n";
    }
    my $pp_file = &util::prettyprint_file($base_dir, $file);
    if ($self->{'verbosity'} > 1) {
        print $outhandle "$self->{'plugin_type'} processing $pp_file\n";
    }

    my ($filename_full_path, $filename_no_path)
        = &util::get_full_filenames($base_dir, $file);

    # Work out the language and encoding of the file contents
    my ($language, $content_encoding)
        = $self->textcat_get_language_encoding($filename_full_path);
    if ($self->{'verbosity'} > 2) {
        print $outhandle "ReadTextFile: reading $file as ($content_encoding,$language)\n";
    }

    # create a new document
    my $doc_obj = doc->new($filename_full_path, "indexed_doc",
                           $self->{'file_rename_method'});
    my $top_section = $doc_obj->get_top_section();

    # this should look at the plugin option too...
    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding
        = $self->deduce_filename_encoding($file, $metadata, $plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $content_encoding);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($filename_full_path, $content_encoding, $language, \$text);

    # An empty file is reported and counted, but not passed any further
    if (!length($text)) {
        print STDERR "<ProcessingError n='$file' r='File contains no text'>\n"
            if ($gli);

        if ($self->{'verbosity'}) {
            gsprintf($outhandle, "$self->{'plugin_type'}: {ReadTextFile.file_has_no_text}\n", $filename_full_path);
        }

        my $failhandle = $self->{'failhandle'};
        gsprintf($failhandle, "$file: " . ref($self) . ": {ReadTextFile.empty_file}\n");
        $self->{'num_not_processed'}++;

        return (0, undef); # what should we return here?? error but don't want to pass it on
    }

    # do plugin specific processing of doc_obj
    my $process_status = $self->process(\$text, $pluginfo, $base_dir, $file,
                                        $metadata, $doc_obj, $gli);

    # The text is no longer needed whatever the outcome, so release it now
    $text = '';
    undef $text;

    if (!defined $process_status) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return (-1, undef);
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata($doc_obj, $top_section, $metadata);

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # if we haven't found any Title so far, assign one
    $self->title_fallback($doc_obj, $top_section, $filename_no_path);

    $self->add_OID($doc_obj);

    return (1, $doc_obj);
}
195
# Reads the entire file named by $filename and stores the text in $$textref.
#
# Parameters:
#   $filename  path of the file to read
#   $encoding  encoding of the file; "ascii" is slurped directly, anything
#              else is read through the multiread package
#   $language  language of the file (not used here)
#   $textref   reference to the scalar that receives the text
#
# The bytes read are then passed through decode("utf8", ...) so that
# $$textref ends up as a Unicode-aware string.
#
# If the file is not readable a message is printed (when verbosity is set)
# and the sub returns without touching $$textref. If the file cannot be
# opened an error is printed and the sub dies with "\n".
sub read_file {
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    if (!-r $filename) {
        my $outhandle = $self->{'outhandle'};
        gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
        # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
        return;
    }
    $$textref = "";

    # Three-argument open: the filename is taken literally, so leading or
    # trailing whitespace and characters such as '>' or '|' in the name are
    # not interpreted as an open mode.
    if (!open (FILE, '<', $filename)) {
        gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
        die "\n";
    }

    if ($encoding eq "ascii") {
        # Slurp the whole file. 'local' restores whatever input record
        # separator the caller had, instead of forcing it back to "\n".
        local $/ = undef;
        $$textref = <FILE>;
    } else {
        my $reader = multiread->new();
        $reader->set_handle ('ReadTextFile::FILE');
        $reader->set_encoding ($encoding);
        $reader->read_file ($textref);
    }

    # At this point $$textref is a binary byte string
    # => turn it into a Unicode aware string, so full
    # Unicode aware pattern matching can be used.
    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
    $$textref = decode("utf8",$$textref);

    close FILE;
}
238
239
# Not currently used.
#
# Alternative to read_file() that slurps the raw bytes of $filename and
# converts them with Encode's decode($encoding, ...) instead of going
# through multiread. The decoded text is stored in $$textref; $language is
# not used.
#
# If the file is not readable a message is printed (when verbosity is set)
# and the sub returns without touching $$textref. If the file cannot be
# opened an error is printed and the sub dies with "\n".
sub UNUSED_read_file_usingPerlsEncodeModule {
##sub read_file {
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    if (!-r $filename) {
        my $outhandle = $self->{'outhandle'};
        gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
        # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
        return;
    }
    $$textref = "";

    # Three-argument open so the filename is never parsed for a mode
    if (!open (FILE, '<', $filename)) {
        # The resource key must be on one line: a line break inside the
        # braces would stop it matching the message dictionary entry.
        gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
        die "\n";
    }

    # Slurp the whole file; 'local' puts the caller's $/ back afterwards
    my $text = do { local $/ = undef; <FILE> };

    $$textref = decode($encoding,$text);

    close FILE;
}
269
270
# Reads $filename through multiread's read_file_no_decoding(), storing the
# result in $$textref, and keeps the multiread object in $self->{'reader'}
# so that decode_text() can use it afterwards.
#
# If the file is not readable a message is printed (when verbosity is set)
# and the sub returns without touching $$textref. If the file cannot be
# opened an error is printed and the sub dies with "\n".
sub read_file_no_decoding {
    my $self = shift (@_);
    my ($filename, $textref) = @_;

    if (!-r $filename) {
        my $outhandle = $self->{'outhandle'};
        gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
        # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
        return;
    }
    $$textref = "";

    # Three-argument open: the filename is used exactly as given and is
    # never interpreted as containing an open mode or a pipe.
    if (!open (FILE, '<', $filename)) {
        gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
        die "\n";
    }

    my $reader = multiread->new();
    $reader->set_handle ('ReadTextFile::FILE');
    $reader->read_file_no_decoding ($textref);

    # remembered for a later call to decode_text()
    $self->{'reader'} = $reader;

    close FILE;
}
296
297
# Decodes $raw_text from $encoding into $$textref, using the multiread
# object that read_file_no_decoding() left in $self->{'reader'}.
# $language is accepted but not used. If no reader has been stored, an
# error message is printed and nothing is decoded.
sub decode_text {
    my ($self, $raw_text, $encoding, $language, $textref) = @_;

    my $stored_reader = $self->{'reader'};

    return gsprintf(STDERR, "ReadTextFile::decode_text needs to call ReadTextFile::read_file_no_decoding first\n")
        unless defined $stored_reader;

    $stored_reader->set_encoding($encoding);
    return $stored_reader->decode_text($raw_text, $textref);
}
311
312
# Decides which language and encoding to use for $filename, depending on
# the plugin options:
#   - input_encoding "auto": both values come from get_language_encoding()
#   - extract_language set:  the language comes from
#     get_language_encoding(), the encoding is the configured
#     input_encoding (a message is printed if the two encodings disagree)
#   - otherwise: default_language and input_encoding are used as they are
# Returns the list ($language, $encoding).
sub textcat_get_language_encoding {
    my ($self, $filename) = @_;

    if ($self->{'input_encoding'} eq "auto") {
        # use textcat to automatically work out the input encoding and language
        my ($auto_language, $auto_encoding) = $self->get_language_encoding($filename);
        return ($auto_language, $auto_encoding);
    }

    if (!$self->{'extract_language'}) {
        # nothing to detect: take both values from the configuration
        return ($self->{'default_language'}, $self->{'input_encoding'});
    }

    # use textcat to get language metadata
    my ($language, $extracted_encoding) = $self->get_language_encoding($filename);
    my $encoding = $self->{'input_encoding'};

    # don't print this message for english... english in utf8 is identical
    # to english in iso-8859-1 (except for some punctuation). We don't have
    # a language model for en_utf8, so textcat always says iso-8859-1!
    my $encodings_disagree = ($extracted_encoding ne $encoding);
    if ($encodings_disagree && $language ne "en" && $self->{'verbosity'}) {
        my $plugin_name = ref($self);
        my $outhandle = $self->{'outhandle'};
        gsprintf($outhandle, "$plugin_name: {ReadTextFile.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
    }

    # print STDERR "** language encoding of contents of file $filename:\n\t**$language $encoding\n";

    return ($language, $encoding);
}
342
343
# Uses textcat to work out the encoding and language of the text in
# $filename.
#
# For HTMLPlugin objects, or objects whose 'converted_to' is 'HTML', an
# encoding declared in the file itself (XML declaration or meta
# http-equiv charset tag) is used when present, and all HTML tags are
# removed before the text is handed to textcat.
#
# Returns the list ($language, $encoding). If the file cannot be opened,
# returns (default_language, input_encoding) after printing an error.
# An encoding that is not ascii, utf8, unicode or a key of
# $encodings::encodings is replaced by default_encoding.
sub get_language_encoding {
    my $self = shift (@_);
    my ($filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my $unicode_format = "";
    my $best_language = "";
    my $best_encoding = "";

    # read in file (three-argument open: the filename is taken literally)
    if (!open (FILE, '<', $filename)) {
        gsprintf(STDERR, "ReadTextFile::get_language_encoding {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
        # this is a pretty bad error, but try to continue anyway
        return ($self->{'default_language'}, $self->{'input_encoding'});
    }
    # Slurp the file; 'local' restores the caller's input record separator
    my $text = do { local $/ = undef; <FILE> };
    close FILE;
    $text = "" unless defined $text;

    # check if first few bytes have a Byte Order Marker
    my $bom = substr($text,0,2); # check 16bit unicode
    if ($bom eq "\xff\xfe") { # little endian 16bit unicode
        $unicode_format = "unicode";
    } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
        $unicode_format = "unicode";
    } else {
        $bom = substr($text,0,3); # check utf-8
        if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
            $unicode_format = "utf8";
        # } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
        #     $unicode_format = "utf8";
        }
    }

    my $found_html_encoding = 0;
    # handle html files specially
    # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
    if (ref($self) eq 'HTMLPlugin' ||
        (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')) {

        # remove comments in head, including multiline ones, so that we don't match on
        # inactive tags (those that are nested inside comments)
        my ($head) = ($text =~ m/<head>(.*)<\/head>/si);
        $head = "" unless defined $head; # some files are not proper HTML eg php files
        $head =~ s/<!--.*?-->//sg;

        # remove <title>stuff</title> -- as titles tend often to be in English
        # for foreign language documents
        $text =~ s!<title>.*?</title>!!si;

        # see if this html file specifies its encoding
        if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
            $best_encoding = $1;
        }
        # check the meta http-equiv charset tag; any run of characters inside
        # the tag may separate http-equiv, content-type and charset=
        elsif ($head =~ m/<meta http-equiv[^>]*?content-type[^>]*?charset=(.+?)\"/si) {
            $best_encoding = $1;
        }
        if ($best_encoding) { # we extracted an encoding
            $best_encoding =~ s/-+/_/g;
            $best_encoding = lc($best_encoding); # lowercase
            if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
            $found_html_encoding = 1;
            # We shouldn't be modifying this here!!
            #$self->{'input_encoding'} = $best_encoding;
        }

        # remove all HTML tags
        $text =~ s/<[^>]*>//sg;
    }

    # don't need to do textcat if we know the encoding now AND don't need to extract language
    if ($found_html_encoding && !$self->{'extract_language'}) { # encoding specified in html file
        $best_language = $self->{'default_language'};
    }

    else { # need to use textcat to get either the language, or get both language and encoding
        $self->{'textcat'} = textcat->new() if (!defined($self->{'textcat'}));

        if ($found_html_encoding) { # know encoding, find language by limiting search to known encoding
            my $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);

            # each result has the form "language" or "language-encoding"
            my $language;
            ($language) = $results->[0] =~ m/^([^-]*)(?:-(?:.*))?$/ if (scalar @$results > 0);

            if (!defined $language || scalar @$results > 3) {
                # if there were too many results even when restricting results by encoding,
                # or if there were no results, use default language with the known encoding
                $best_language = $self->use_default_language($filename);
            }
            else { # fewer than 3 results means textcat is more certain, use the first result
                $best_language = $language;
            }
        }
        else { # don't know encoding or language yet, therefore we use textcat
            my $results = $self->{'textcat'}->classify_contents(\$text, $filename);

            # if textcat returns 3 or less possibilities we'll use the first one in the list
            if (scalar @$results <= 3) { # results will be > 0 when we don't constrain textcat by an encoding
                my ($language, $encoding);
                ($language, $encoding) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/
                    if (scalar @$results > 0);

                $language = $self->use_default_language($filename) unless defined $language;
                $encoding = $self->use_default_encoding($filename) unless defined $encoding;

                $best_language = $language;
                $best_encoding = $encoding;
            }
            else { # if (scalar @$results > 3) {
                if ($unicode_format) { # in case the first had a BOM
                    $best_encoding = $unicode_format;
                }
                else {
                    # Find the most frequent encoding in the textcat results returned
                    # Returns "" if there's no encoding more frequent than another
                    $best_encoding = $self->{'textcat'}->most_frequent_encoding($results);
                }

                if ($best_encoding eq "") { # encoding still not set, use defaults
                    $best_language = $self->use_default_language($filename);
                    $best_encoding = $self->use_default_encoding($filename);
                }
                elsif (!$self->{'extract_language'}) { # know encoding but don't need to discover language
                    $best_language = $self->use_default_language($filename);
                }
                else { # textcat again using the most frequent encoding or the $unicode_format set above
                    $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
                    my $language;
                    ($language) = $results->[0] =~ m/^([^-]*)(?:-(.*))?$/ if (scalar @$results > 0);
                    if (!defined $language || scalar @$results > 3) {
                        # if no result or too many results, use default language for the encoding previously found
                        $best_language = $self->use_default_language($filename);
                    }
                    else { # fewer than 3 results, use the language of the first result
                        $best_language = $language;
                    }
                }
            }
        }
    }

    if ($best_encoding eq "" || $best_language eq "") {
        print STDERR "****Shouldn't happen: encoding and/or language still not set. Using defaults.\n";
        $best_encoding = $self->use_default_encoding($filename) if $best_encoding eq "";
        $best_language = $self->use_default_language($filename) if $best_language eq "";
    }
    # print STDERR "****Content language: $best_language; Encoding: $best_encoding.\n";

    if ($best_encoding =~ /^iso_8859/ && &unicode::check_is_utf8($text)) {
        # the text is valid utf8, so assume that's the real encoding
        # (since textcat is based on probabilities)
        $best_encoding = 'utf8';
    }

    # check for equivalents where textcat doesn't have some encodings...
    # eg MS versions of standard encodings
    if ($best_encoding =~ /^iso_8859_(\d+)/) {
        my $iso = $1; # which variant of the iso standard?
        # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
        if ($text =~ /[\x80-\x9f]/) {
            # Western Europe
            if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
            elsif ($iso == 2) { $best_encoding = 'windows_1250' } # Central Europe
            elsif ($iso == 5) { $best_encoding = 'windows_1251' } # Cyrillic
            elsif ($iso == 6) { $best_encoding = 'windows_1256' } # Arabic
            elsif ($iso == 7) { $best_encoding = 'windows_1253' } # Greek
            elsif ($iso == 8) { $best_encoding = 'windows_1255' } # Hebrew
            elsif ($iso == 9) { $best_encoding = 'windows_1254' } # Turkish
        }
    }

    # ascii, utf8 and unicode are always accepted; anything else has to be
    # a key of the $encodings::encodings table
    if ($best_encoding !~ /^(ascii|utf8|unicode)$/ &&
        !defined $encodings::encodings->{$best_encoding}) {
        if ($self->{'verbosity'}) {
            gsprintf($outhandle, "ReadTextFile: {ReadTextFile.unsupported_encoding}\n",
                     $filename, $best_encoding, $self->{'default_encoding'});
        }
        $best_encoding = $self->{'default_encoding'};
    }

    return ($best_language, $best_encoding);
}
530
531
# Returns the plugin's configured default_language. When verbosity is
# above 2, first reports for $filename that the default is being used.
sub use_default_language {
    my ($self, $filename) = @_;

    gsprintf($self->{'outhandle'},
             "ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
             $filename, $self->{'default_language'})
        if ($self->{'verbosity'} > 2);

    return $self->{'default_language'};
}
543
# Returns the plugin's configured default_encoding. When verbosity is
# above 2, first reports for $filename that the default is being used.
sub use_default_encoding {
    my ($self, $filename) = @_;

    gsprintf($self->{'outhandle'},
             "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n",
             $filename, $self->{'default_encoding'})
        if ($self->{'verbosity'} > 2);

    return $self->{'default_encoding'};
}
555
# Hook that does nothing in this class beyond unpacking the invocant.
# Overridden by exploding plugins (eg. ISISPlug).
sub clean_up_after_exploding {
    my $self = shift;
}
561
562
563	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: