Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm@ 23335

Last change on this file since 23335 was 23335, checked in by davidb, 13 years ago
Work done on improving handing of filenames when the actualy filename encoding used is not necesarrily known. Tested for Linux. Work currently includes some debug statements that will be removed once testing for Windows and Mac is done.
Property svn:executable set to ``*
File size: 19.1 KB

Line
1	###########################################################################
2	#
3	# ReadTxtFile.pm -- base class for import plugins that have plain text files
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package ReadTextFile;
27
28	use strict;
29	no strict 'subs';
30	no strict 'refs'; # allow filehandles to be variables and viceversa
31
32	use Encode;
33
34	use multiread;
35	use encodings;
36	use unicode;
37	use textcat;
38	use doc;
39	use ghtml;
40	use gsprintf 'gsprintf';
41
42	use AutoExtractMetadata;
43
44	sub BEGIN {
45	@ReadTextFile::ISA = ( 'AutoExtractMetadata' );
46	}
47
48	my $encoding_plus_auto_list =
49	[ { 'name' => "auto",
50	'desc' => "{ReadTextFile.input_encoding.auto}" } ];
51	push(@{$encoding_plus_auto_list},@{$BasePlugin::encoding_list});
52
53	my $arguments =
54	[ { 'name' => "input_encoding",
55	'desc' => "{ReadTextFile.input_encoding}",
56	'type' => "enum",
57	'list' => $encoding_plus_auto_list,
58	'reqd' => "no" ,
59	'deft' => "auto" } ,
60	{ 'name' => "default_encoding",
61	'desc' => "{ReadTextFile.default_encoding}",
62	'type' => "enum",
63	'list' => $BasePlugin::encoding_list,
64	'reqd' => "no",
65	'deft' => "utf8" },
66	{ 'name' => "extract_language",
67	'desc' => "{ReadTextFile.extract_language}",
68	'type' => "flag",
69	'reqd' => "no" },
70	{ 'name' => "default_language",
71	'desc' => "{ReadTextFile.default_language}",
72	'type' => "string",
73	'deft' => "en",
74	'reqd' => "no" }
75	];
76
77
78	my $options = { 'name' => "ReadTextFile",
79	'desc' => "{ReadTextFile.desc}",
80	'abstract' => "yes",
81	'inherits' => "no",
82	'args' => $arguments };
83
84
85
86	sub new {
87	my $class = shift (@_);
88	my ($pluginlist,$inputargs,$hashArgOptLists, $auxiliary) = @_;
89	push(@$pluginlist, $class);
90
91	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92	push(@{$hashArgOptLists->{"OptList"}},$options);
93
94	my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists, $auxiliary);
95
96	return bless $self, $class;
97
98	}
99
100
101
102	# The ReadTextFile read_into_doc_obj() function. This function does all the
103	# right things to make general options work for a given plugin. It reads in
104	# a file and sets up a slew of metadata all saved in doc_obj, which
105	# it then returns as part of a tuple (process_status,doc_obj)
106	#
107	# Much of this functionality used to reside in read, but it was broken
108	# down into a supporting routine to make the code more flexible.
109	#
110	# recursive plugins (e.g. RecPlug) and specialized plugins like those
111	# capable of processing many documents within a single file (e.g.
112	# GMLPlug) will normally want to implement their own version of
113	# read_into_doc_obj()
114	#
115	# Note that $base_dir might be "" and that $file might
116	# include directories
117	sub read_into_doc_obj {
118	my $self = shift (@_);
119	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
120
121	my $outhandle = $self->{'outhandle'};
122	# should we move this to read? What about secondary plugins?
123	print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
124	print $outhandle "$self->{'plugin_type'} processing $file\n"
125	if $self->{'verbosity'} > 1;
126
127	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
128
129	# Do encoding stuff
130	my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
131	if ($self->{'verbosity'} > 2) {
132	print $outhandle "ReadTextFile: reading $file as ($content_encoding,$language)\n";
133	}
134
135	# create a new document
136	my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
137	my $top_section = $doc_obj->get_top_section();
138
139	# this should look at the plugin option too...
140	$doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
141	$doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
142
143	my $filename_encoding = $self->deduce_filename_encoding($file,$metadata);
144	$self->set_Source_metadata($doc_obj, $filename_no_path, $filename_encoding);
145
146	$doc_obj->add_utf8_metadata($top_section, "Language", $language);
147	$doc_obj->add_utf8_metadata($top_section, "Encoding", $content_encoding);
148
149	# read in file ($text will be in utf8)
150	my $text = "";
151	$self->read_file ($filename_full_path, $content_encoding, $language, \$text);
152
153	if (!length ($text)) {
154	if ($gli) {
155	print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
156	}
157	gsprintf($outhandle, "$self->{'plugin_type'}: {ReadTextFile.file_has_no_text}\n", $filename_full_path) if $self->{'verbosity'};
158
159	my $failhandle = $self->{'failhandle'};
160	gsprintf($failhandle, "$file: " . ref($self) . ": {ReadTextFile.empty_file}\n");
161	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
162	$self->{'num_not_processed'} ++;
163
164	return (0,undef); # what should we return here?? error but don't want to pass it on
165	}
166
167	# do plugin specific processing of doc_obj
168	unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
169	$text = '';
170	undef $text;
171	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
172	return (-1,undef);
173	}
174	$text='';
175	undef $text;
176
177	# include any metadata passed in from previous plugins
178	# note that this metadata is associated with the top level section
179	$self->add_associated_files($doc_obj, $filename_full_path);
180	$self->extra_metadata ($doc_obj, $top_section, $metadata);
181
182	# do any automatic metadata extraction
183	$self->auto_extract_metadata ($doc_obj);
184
185
186	# if we haven't found any Title so far, assign one
187	$self->title_fallback($doc_obj,$top_section,$filename_no_path);
188
189	$self->add_OID($doc_obj);
190
191	return (1,$doc_obj);
192	}
193
194	# uses the multiread package to read in the entire file pointed to
195	# by filename and loads the resulting text into $$textref. Input text
196	# may be in any of the encodings handled by multiread, output text
197	# will be in utf8
198	sub read_file {
199	my $self = shift (@_);
200	my ($filename, $encoding, $language, $textref) = @_;
201
202	if (!-r $filename)
203	{
204	my $outhandle = $self->{'outhandle'};
205	gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
206	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
207	return;
208	}
209	$$textref = "";
210	if (!open (FILE, $filename)) {
211	gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
212	die "\n";
213	}
214
215	if ($encoding eq "ascii") {
216	undef $/;
217	$$textref = <FILE>;
218	$/ = "\n";
219	} else {
220	my $reader = new multiread();
221	$reader->set_handle ('ReadTextFile::FILE');
222	$reader->set_encoding ($encoding);
223	$reader->read_file ($textref);
224	}
225
226	# At this point $$textref is a binary byte string
227	# => turn it into a Unicode aware string, so full
228	# Unicode aware pattern matching can be used.
229	# For instance: 's/\x{0101}//g' or '[[:upper:]]'
230	#
231
232	$$textref = decode("utf8",$$textref);
233
234	close FILE;
235	}
236
237
238	# Not currently used
239	sub UNUSED_read_file_usingPerlsEncodeModule {
240	##sub read_file {
241	my $self = shift (@_);
242	my ($filename, $encoding, $language, $textref) = @_;
243
244	if (!-r $filename)
245	{
246	my $outhandle = $self->{'outhandle'};
247	gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
248	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
249	return;
250	}
251	$$textref = "";
252	if (!open (FILE, $filename)) {
253	gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_f
254	or_reading} ($!)\n", $filename);
255	die "\n";
256	}
257
258	my $store_slash = $/;
259	undef $/;
260	my $text = <FILE>;
261	$/ = $store_slash;
262
263	$$textref = decode($encoding,$text);
264
265	close FILE;
266	}
267
268
269	sub read_file_no_decoding {
270	my $self = shift (@_);
271	my ($filename, $textref) = @_;
272
273	if (!-r $filename)
274	{
275	my $outhandle = $self->{'outhandle'};
276	gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
277	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
278	return;
279	}
280	$$textref = "";
281	if (!open (FILE, $filename)) {
282	gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
283	die "\n";
284	}
285
286	my $reader = new multiread();
287	$reader->set_handle ('ReadTextFile::FILE');
288	$reader->read_file_no_decoding ($textref);
289
290	$self->{'reader'} = $reader;
291
292	close FILE;
293	}
294
295
296	sub decode_text {
297	my $self = shift (@_);
298	my ($raw_text, $encoding, $language, $textref) = @_;
299
300	my $reader = $self->{'reader'};
301	if (!defined $reader) {
302	gsprintf(STDERR, "ReadTextFile::decode_text needs to call ReadTextFile::read_file_no_decoding first\n");
303	}
304	else {
305	$reader->set_encoding($encoding);
306	$reader->decode_text($raw_text,$textref);
307	}
308	}
309
310
311	sub textcat_get_language_encoding {
312	my $self = shift (@_);
313	my ($filename) = @_;
314
315	my ($language, $encoding, $extracted_encoding);
316	if ($self->{'input_encoding'} eq "auto") {
317	# use textcat to automatically work out the input encoding and language
318	($language, $encoding) = $self->get_language_encoding ($filename);
319	} elsif ($self->{'extract_language'}) {
320	# use textcat to get language metadata
321	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
322	$encoding = $self->{'input_encoding'};
323	# don't print this message for english... english in utf8 is identical
324	# to english in iso-8859-1 (except for some punctuation). We don't have
325	# a language model for en_utf8, so textcat always says iso-8859-1!
326	if ($extracted_encoding ne $encoding && $language ne "en" && $self->{'verbosity'}) {
327	my $plugin_name = ref ($self);
328	my $outhandle = $self->{'outhandle'};
329	gsprintf($outhandle, "$plugin_name: {ReadTextFile.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
330	}
331	} else {
332	$language = $self->{'default_language'};
333	$encoding = $self->{'input_encoding'};
334	}
335
336	# print STDERR "** language encoding of contents of file $filename:\n\t**$language $encoding\n";
337
338	return ($language, $encoding);
339	}
340
341
342	# Uses textcat to work out the encoding and language of the text in
343	# $filename. All html tags are removed before processing.
344	# returns an array containing "language" and "encoding"
345	sub get_language_encoding {
346	my $self = shift (@_);
347	my ($filename) = @_;
348	my $outhandle = $self->{'outhandle'};
349	my $unicode_format = "";
350	my $best_language = "";
351	my $best_encoding = "";
352
353
354	# read in file
355	if (!open (FILE, $filename)) {
356	gsprintf(STDERR, "ReadTextFile::get_language_encoding {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
357	# this is a pretty bad error, but try to continue anyway
358	return ($self->{'default_language'}, $self->{'input_encoding'});
359	}
360	undef $/;
361	my $text = <FILE>;
362	$/ = "\n";
363	close FILE;
364
365	# check if first few bytes have a Byte Order Marker
366	my $bom=substr($text,0,2); # check 16bit unicode
367	if ($bom eq "\xff\xfe") { # little endian 16bit unicode
368	$unicode_format="unicode";
369	} elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
370	$unicode_format="unicode";
371	} else {
372	$bom=substr($text,0,3); # check utf-8
373	if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
374	$unicode_format="utf8";
375	# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
376	# $unicode_format="utf8";
377	}
378	}
379
380	my $found_html_encoding = 0;
381	# handle html files specially
382	# XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
383	if (ref($self) eq 'HTMLPlugin' \|\|
384	(exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
385
386	# remove comments in head, including multiline ones, so that we don't match on
387	# inactive tags (those that are nested inside comments)
388	my ($head) = ($text =~ m/<head>(.*)<\/head>/si);
389	$head = "" unless defined $head; # some files are not proper HTML eg php files
390	$head =~ s/<!--.*?-->//sg;
391
392	# remove <title>stuff</title> -- as titles tend often to be in English
393	# for foreign language documents
394	$text =~ s!<title>.*?</title>!!si;
395
396	# see if this html file specifies its encoding
397	if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
398	$best_encoding = $1;
399	}
400	# check the meta http-equiv charset tag
401	elsif ($head =~ m/<meta http-equiv.content-type.charset=(.+?)\"/si) {
402	$best_encoding = $1;
403	}
404	if ($best_encoding) { # we extracted an encoding
405	$best_encoding =~ s/-+/_/g;
406	$best_encoding = lc($best_encoding); # lowercase
407	if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
408	$found_html_encoding = 1;
409	# We shouldn't be modifying this here!!
410	#$self->{'input_encoding'} = $best_encoding;
411	}
412
413	# remove all HTML tags
414	$text =~ s/<[^>]*>//sg;
415	}
416
417	# don't need to do textcat if we know the encoding now AND don't need to extract language
418	if($found_html_encoding && !$self->{'extract_language'}) { # encoding specified in html file
419	$best_language = $self->{'default_language'};
420	}
421
422	else { # need to use textcat to get either the language, or get both language and encoding
423	$self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
424
425	if($found_html_encoding) { # know encoding, find language by limiting search to known encoding
426	my $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
427
428	my $language;
429	($language) = $results->[0] =~ m/^([^-])(?:-(?:.))?$/ if (scalar @$results > 0);
430
431	if (!defined $language \|\| scalar @$results > 3) {
432	# if there were too many results even when restricting results by encoding,
433	# or if there were no results, use default language with the known encoding
434	$best_language = $self->use_default_language($filename);
435	}
436	else { # fewer than 3 results means textcat is more certain, use the first result
437	$best_language = $language;
438	}
439	}
440	else { # don't know encoding or language yet, therefore we use textcat
441	my $results = $self->{'textcat'}->classify_contents(\$text, $filename);
442
443	# if textcat returns 3 or less possibilities we'll use the first one in the list
444	if (scalar @$results <= 3) { # results will be > 0 when we don't constrain textcat by an encoding
445	my ($language, $encoding) = $results->[0] =~ m/^([^-])(?:-(.))?$/;
446
447	$language = $self->use_default_language($filename) unless defined $language;
448	$encoding = $self->use_default_encoding($filename) unless defined $encoding;
449
450	$best_language = $language;
451	$best_encoding = $encoding;
452	}
453	else { # if (scalar @$results > 3) {
454	if ($unicode_format) { # in case the first had a BOM
455	$best_encoding=$unicode_format;
456	}
457	else {
458	# Find the most frequent encoding in the textcat results returned
459	# Returns "" if there's no encoding more frequent than another
460	$best_encoding = $self->{'textcat'}->most_frequent_encoding($results);
461	}
462
463	if ($best_encoding eq "") { # encoding still not set, use defaults
464	$best_language = $self->use_default_language($filename);
465	$best_encoding = $self->use_default_encoding($filename);
466	}
467	elsif (!$self->{'extract_language'}) { # know encoding but don't need to discover language
468	$best_language = $self->use_default_language($filename);
469	}
470	else { # textcat again using the most frequent encoding or the $unicode_format set above
471	$results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
472	my $language;
473	($language) = $results->[0] =~ m/^([^-])(?:-(.))?$/ if (scalar @$results > 0);
474	if (!defined $language \|\| scalar @$results > 3) {
475	# if no result or too many results, use default language for the encoding previously found
476	$best_language = $self->use_default_language($filename);
477	}
478	else { # fewer than 3 results, use the language of the first result
479	$best_language = $language;
480	}
481	}
482	}
483	}
484	}
485
486	if($best_encoding eq "" \|\| $best_language eq "") {
487	print STDERR "****Shouldn't happen: encoding and/or language still not set. Using defaults.\n";
488	$best_encoding = $self->use_default_encoding($filename) if $best_encoding eq "";
489	$best_language = $self->use_default_language($filename) if $best_language eq "";
490	}
491	# print STDERR "****Content language: $best_language; Encoding: $best_encoding.\n";
492
493
494	if ($best_encoding =~ /^iso_8859/ && &unicode::check_is_utf8($text)) {
495	# the text is valid utf8, so assume that's the real encoding
496	# (since textcat is based on probabilities)
497	$best_encoding = 'utf8';
498	}
499
500	# check for equivalents where textcat doesn't have some encodings...
501	# eg MS versions of standard encodings
502	if ($best_encoding =~ /^iso_8859_(\d+)/) {
503	my $iso = $1; # which variant of the iso standard?
504	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
505	if ($text =~ /[\x80-\x9f]/) {
506	# Western Europe
507	if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
508	elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
509	elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
510	elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
511	elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
512	elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
513	elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
514	}
515	}
516
517	if ($best_encoding !~ /^(ascii\|utf8\|unicode)$/ &&
518	!defined $encodings::encodings->{$best_encoding}) {
519	if ($self->{'verbosity'}) {
520	gsprintf($outhandle, "ReadTextFile: {ReadTextFile.unsupported_encoding}\n",
521	$filename, $best_encoding, $self->{'default_encoding'});
522	}
523	$best_encoding = $self->{'default_encoding'};
524	}
525
526	return ($best_language, $best_encoding);
527	}
528
529
530	sub use_default_language {
531	my $self = shift (@_);
532	my ($filename) = @_;
533
534	if ($self->{'verbosity'}>2) {
535	gsprintf($self->{'outhandle'},
536	"ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
537	$filename, $self->{'default_language'});
538	}
539	return $self->{'default_language'};
540	}
541
542	sub use_default_encoding {
543	my $self = shift (@_);
544	my ($filename) = @_;
545
546	if ($self->{'verbosity'}>2) {
547	gsprintf($self->{'outhandle'},
548	"ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n",
549	$filename, $self->{'default_encoding'});
550	}
551	return $self->{'default_encoding'};
552	}
553
554	# Overridden by exploding plugins (eg. ISISPlug)
555	sub clean_up_after_exploding
556	{
557	my $self = shift(@_);
558	}
559
560
561	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: