Context Navigation

source: gsdl/trunk/perllib/plugins/ReadTextFile.pm@ 16822

Last change on this file since 16822 was 16765, checked in by ak19, 16 years ago
Only removes comments in head tag now when working out the encoding
Property svn:executable set to *
File size: 16.2 KB

Line
1	###########################################################################
2	#
3	# ReadTextFile.pm -- base class for import plugins that have plain text files
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package ReadTextFile;
27
28	use strict;
29	no strict 'subs';
30	no strict 'refs'; # allow filehandles to be variables and viceversa
31
32
33	use multiread;
34	use encodings;
35	use unicode;
36	use textcat;
37	use doc;
38	use ghtml;
39	use gsprintf 'gsprintf';
40
41	use AutoExtractMetadata;
42
# Establish the inheritance chain at compile time: ReadTextFile is a
# subclass of AutoExtractMetadata.
BEGIN {
    @ReadTextFile::ISA = qw(AutoExtractMetadata);
}
46
# Choices offered for the input_encoding option: the special value "auto"
# first, followed by every entry of $BasePlugin::encoding_list.
my $encoding_plus_auto_list = [
    {
        'name' => "auto",
        'desc' => "{ReadTextFile.input_encoding.auto}",
    },
    @{$BasePlugin::encoding_list},
];
51
# Plugin options contributed by ReadTextFile. Each entry describes one
# option: its name, the key of its description string, its type, an
# optional list of allowed values, whether it is required and its default.
my $arguments = [
    {
        'name' => "input_encoding",
        'desc' => "{ReadTextFile.input_encoding}",
        'type' => "enum",
        'list' => $encoding_plus_auto_list,
        'reqd' => "no",
        'deft' => "auto",
    },
    {
        'name' => "default_encoding",
        'desc' => "{ReadTextFile.default_encoding}",
        'type' => "enum",
        'list' => $BasePlugin::encoding_list,
        'reqd' => "no",
        'deft' => "utf8",
    },
    {
        'name' => "extract_language",
        'desc' => "{ReadTextFile.extract_language}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name' => "default_language",
        'desc' => "{ReadTextFile.default_language}",
        'type' => "string",
        'deft' => "en",
        'reqd' => "no",
    },
];
75
76
# Description record for this plugin class; new() appends it to the
# caller's "OptList". 'args' points at the option table defined above.
my $options = {
    'name'     => "ReadTextFile",
    'desc'     => "{ReadTextFile.desc}",
    'abstract' => "yes",
    'inherits' => "no",
    'args'     => $arguments,
};
82
83
84
# Constructor.
#
# Records this class in $pluginlist, appends this class's option table and
# description record to $hashArgOptLists ("ArgList" / "OptList"), lets
# AutoExtractMetadata build the object, and re-blesses the result into
# $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists, $auxiliary) = @_;

    push(@{$pluginlist}, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = AutoExtractMetadata->new($pluginlist, $inputargs,
                                        $hashArgOptLists, $auxiliary);

    return bless($self, $class);
}
98
99
100
101	# The ReadTextFile read_into_doc_obj() function. This function does all the
102	# right things to make general options work for a given plugin. It reads in
103	# a file and sets up a slew of metadata all saved in doc_obj, which
104	# it then returns as part of a tuple (process_status,doc_obj)
105	#
106	# Much of this functionality used to reside in read, but it was broken
107	# down into a supporting routine to make the code more flexible.
108	#
109	# recursive plugins (e.g. RecPlug) and specialized plugins like those
110	# capable of processing many documents within a single file (e.g.
111	# GMLPlug) will normally want to implement their own version of
112	# read_into_doc_obj()
113	#
114	# Note that $base_dir might be "" and that $file might
115	# include directories
# Reads $file (relative to $base_dir) into a freshly created doc object.
#
# Steps: work out language/encoding, create the document and attach the
# basic metadata (Plugin, FileSize, Source, Language, Encoding), read the
# text, hand it to the plugin's process() method, then add associated
# files, inherited metadata, automatically extracted metadata, a fallback
# title and finally the OID.
#
# Returns a two element list (status, doc_obj):
#   (1, $doc_obj)  on success
#   (0, undef)     if the file contained no text
#   (-1, undef)    if process() returned undef
sub read_into_doc_obj {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    # should we move this to read? What about secondary plugins?
    if ($gli) {
        print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print $outhandle "$self->{'plugin_type'} processing $file\n";
    }

    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);

    # Work out which language and encoding to read the file with
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename_full_path);
    print $outhandle "ReadTextFile: reading $file as ($encoding,$language)\n"
        if ($self->{'verbosity'} > 2);

    # Create the document and fill in the metadata we already know
    my $doc_obj = doc->new($filename_full_path, "indexed_doc");
    my $top_section = $doc_obj->get_top_section();

    # this should look at the plugin option too...
    $doc_obj->set_OIDtype($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
    $self->set_Source_metadata($doc_obj, $filename_no_path, $encoding);

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);

    # Read in the file ($text will be in utf8)
    my $text = "";
    $self->read_file($filename_full_path, $encoding, $language, \$text);

    if (!length ($text)) {
        # Nothing to index: report it, count it, and stop here
        print STDERR "<ProcessingError n='$file' r='File contains no text'>\n" if ($gli);
        if ($self->{'verbosity'}) {
            gsprintf($outhandle, "$self->{'plugin_type'}: {ReadTextFile.file_has_no_text}\n", $filename_full_path);
        }

        my $failhandle = $self->{'failhandle'};
        gsprintf($failhandle, "$file: " . ref($self) . ": {ReadTextFile.empty_file}\n");
        $self->{'num_not_processed'} ++;

        return (0,undef); # what should we return here?? error but don't want to pass it on
    }

    # Plugin specific processing of doc_obj; the text buffer is released
    # straight afterwards whether or not processing succeeded
    my $process_status = $self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
    $text = '';
    undef $text;

    if (!defined ($process_status)) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return (-1,undef);
    }

    # Include any metadata passed in from previous plugins; note that
    # this metadata is associated with the top level section
    $self->add_associated_files($doc_obj, $filename_full_path);
    $self->extra_metadata($doc_obj, $top_section, $metadata);

    # Automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # If no Title has been found so far, assign one
    $self->title_fallback($doc_obj, $top_section, $filename_no_path);

    $self->add_OID($doc_obj);

    return (1,$doc_obj);
}
191
192	# uses the multiread package to read in the entire file pointed to
193	# by filename and loads the resulting text into $$textref. Input text
194	# may be in any of the encodings handled by multiread, output text
195	# will be in utf8
# Reads the whole of $filename into $$textref.
#
#   $filename  path of the file to read
#   $encoding  "ascii" reads the bytes as they are; any other value is
#              passed to a multiread object, which does the reading
#   $language  accepted for interface compatibility, not used here
#   $textref   reference to the scalar that receives the text
#
# If the file is not readable a message is printed (when verbosity is set)
# and the routine returns without touching $$textref. If the file cannot
# be opened the routine reports the error on STDERR and dies.
sub read_file {
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    if (!-r $filename) {
        my $outhandle = $self->{'outhandle'};
        gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
        return;
    }

    $$textref = "";

    # Three-argument open: the filename is taken literally, so names with
    # leading/trailing whitespace or characters such as '>' or '|' can
    # neither change the open mode nor be silently altered.
    # The bareword handle is kept because multiread is given the handle
    # by name below.
    if (!open (FILE, '<', $filename)) {
        gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
        die "\n";
    }

    if ($encoding eq "ascii") {
        # Slurp mode for this block only; the caller's record separator
        # is restored automatically on leaving it.
        local $/;
        $$textref = <FILE>;
    } else {
        my $reader = multiread->new();
        $reader->set_handle ('ReadTextFile::FILE');
        $reader->set_encoding ($encoding);
        $reader->read_file ($textref);
    }
    close FILE;
}
225
226
# Reads the raw contents of $filename into $$textref through a multiread
# object, without decoding them. The multiread object is kept in
# $self->{'reader'} so that decode_text() can use it afterwards.
#
# If the file is not readable a message is printed (when verbosity is set)
# and the routine returns without touching $$textref. If the file cannot
# be opened the routine reports the error on STDERR and dies.
sub read_file_no_decoding {
    my $self = shift (@_);
    my ($filename, $textref) = @_;

    if (!-r $filename) {
        my $outhandle = $self->{'outhandle'};
        gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
        return;
    }

    $$textref = "";

    # Three-argument open so the filename is always taken literally.
    # The bareword handle is kept because multiread is given the handle
    # by name below.
    if (!open (FILE, '<', $filename)) {
        # Name this routine (not read_file) so the failure can be traced
        gsprintf(STDERR, "ReadTextFile::read_file_no_decoding {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
        die "\n";
    }

    my $reader = multiread->new();
    $reader->set_handle ('ReadTextFile::FILE');
    $reader->read_file_no_decoding ($textref);

    # remembered for a later call to decode_text()
    $self->{'reader'} = $reader;

    close FILE;
}
252
253
# Decodes $raw_text from $encoding into $$textref using the reader object
# that read_file_no_decoding() stored in $self->{'reader'}. $language is
# accepted but not used. If no reader has been stored, a message is
# written to STDERR and nothing is decoded.
sub decode_text {
    my ($self, $raw_text, $encoding, $language, $textref) = @_;

    my $reader = $self->{'reader'};

    unless (defined $reader) {
        return gsprintf(STDERR, "ReadTextFile::decode_text needs to call ReadTextFile::read_file_no_decoding first\n");
    }

    $reader->set_encoding($encoding);
    return $reader->decode_text($raw_text,$textref);
}
267
268
# Decides which language and encoding to use for $filename, according to
# the plugin options:
#
#   input_encoding "auto"  -> both values come from get_language_encoding()
#   extract_language set   -> the language comes from
#                             get_language_encoding(); the encoding is the
#                             configured input_encoding (a message is
#                             printed when the detected encoding disagrees)
#   otherwise              -> default_language and input_encoding
#
# Returns the list (language, encoding).
sub textcat_get_language_encoding {
    my ($self, $filename) = @_;

    if ($self->{'input_encoding'} eq "auto") {
        # use textcat to automatically work out the input encoding and language
        my ($auto_language, $auto_encoding) = $self->get_language_encoding ($filename);
        return ($auto_language, $auto_encoding);
    }

    if (!$self->{'extract_language'}) {
        # nothing to detect: use the configured values
        return ($self->{'default_language'}, $self->{'input_encoding'});
    }

    # use textcat to get language metadata only
    my ($detected_language, $detected_encoding) = $self->get_language_encoding ($filename);
    my $wanted_encoding = $self->{'input_encoding'};

    # don't print this message for english... english in utf8 is identical
    # to english in iso-8859-1 (except for some punctuation). We don't have
    # a language model for en_utf8, so textcat always says iso-8859-1!
    my $disagrees = ($detected_encoding ne $wanted_encoding) && ($detected_language ne "en");
    if ($disagrees && $self->{'verbosity'}) {
        my $plugin_name = ref ($self);
        my $outhandle = $self->{'outhandle'};
        gsprintf($outhandle, "$plugin_name: {ReadTextFile.wrong_encoding}\n", $filename, $wanted_encoding, $detected_encoding);
    }

    return ($detected_language, $wanted_encoding);
}
298
299	# Uses textcat to work out the encoding and language of the text in
300	# $filename. All html tags are removed before processing.
301	# returns an array containing "language" and "encoding"
# Uses textcat to work out the language and encoding of the text in
# $filename.
#
# The file is read raw and checked for a byte order mark. For HTML input
# (an HTMLPlugin object, or a plugin whose 'converted_to' is 'HTML') an
# encoding declared in an XML prolog or a meta http-equiv tag is picked
# up, and the title and all tags are removed before classification.
# The text is then classified by textcat:
#   - three or fewer candidates: the first one is parsed as
#     "language-encoding";
#   - more than three candidates: the language falls back to
#     default_language and the encoding is taken from the BOM, from a
#     vote over the candidates, or from input_encoding.
# Finally ISO-8859 guesses are corrected to utf8 / Windows code pages
# where the bytes show it, and unsupported encodings are replaced by
# default_encoding.
#
# Returns the list (language, encoding). If the file cannot be opened,
# returns (default_language, input_encoding).
sub get_language_encoding {
    my $self = shift (@_);
    my ($filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my $unicode_format = "";
    my $best_language = "";
    my $best_encoding = "";

    # read in file (lexical handle + three-argument open: the filename is
    # taken literally and no shared bareword handle is disturbed)
    my $fh;
    if (!open ($fh, '<', $filename)) {
        gsprintf(STDERR, "ReadTextFile::get_language_encoding {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
        # this is a pretty bad error, but try to continue anyway
        return ($self->{'default_language'}, $self->{'input_encoding'});
    }
    # slurp with a localised record separator so the caller's $/ survives
    my $text = do { local $/; <$fh> };
    close $fh;
    $text = "" if (!defined $text);

    # check if first few bytes have a Byte Order Marker
    my $bom = substr($text,0,2); # check 16bit unicode
    if ($bom eq "\xff\xfe") { # little endian 16bit unicode
        $unicode_format = "unicode";
    } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
        $unicode_format = "unicode";
    } else {
        $bom = substr($text,0,3); # check utf-8
        if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
            $unicode_format = "utf8";
        }
    }

    my $found_html_encoding = 0;
    # handle html files specially
    # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
    if (ref($self) eq 'HTMLPlugin' ||
        (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')) {

        # remove comments in head, including multiline ones, so that we don't match on
        # inactive tags (those that are nested inside comments)
        my ($head) = ($text =~ m/<head>(.*)<\/head>/si);
        $head = "" if (!defined $head); # page without a <head> element
        $head =~ s/<!--.*?-->//sg;

        # remove <title>stuff</title> -- as titles tend often to be in English
        # for foreign language documents
        $text =~ s!<title>.*?</title>!!si;

        # see if this html file specifies its encoding
        if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
            $best_encoding = $1;
        }
        # check the meta http-equiv charset tag; [^>]* lets any attribute
        # text sit between the keywords while staying inside one tag
        elsif ($head =~ m/<meta http-equiv[^>]*content-type[^>]*charset=(.+?)\"/si) {
            $best_encoding = $1;
        }
        if ($best_encoding) { # we extracted an encoding
            $best_encoding =~ s/-+/_/g;
            $best_encoding = lc($best_encoding); # lowercase
            if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
            $found_html_encoding = 1;
            # We shouldn't be modifying this here!!
            #$self->{'input_encoding'} = $best_encoding;
        }

        # remove all HTML tags
        $text =~ s/<[^>]*>//sg;
    }

    # get the language/encoding
    $self->{'textcat'} = textcat->new() if (!defined($self->{'textcat'}));
    my $results = $self->{'textcat'}->classify_cached_filecontents(\$text, $filename);

    # if textcat returns 3 or less possibilities we'll use the
    # first one in the list - otherwise use the defaults
    if (scalar @$results > 3) {
        if ($unicode_format) { # in case the first had a BOM
            $best_encoding = $unicode_format;
        } else {
            # count how often each encoding appears among the candidates
            my %guessed_encodings = ();
            foreach my $result (@$results) {
                # the encoding is whatever follows the last '-'; skip a
                # candidate that has none rather than reuse a stale $1
                next unless ($result =~ /([^\-]+)$/);
                my $enc = $1;
                if (!defined($guessed_encodings{$enc})) {
                    $guessed_encodings{$enc} = 0;
                }
                $guessed_encodings{$enc}++;
            }

            # NOTE(review): an encoding found in the HTML above is not a
            # key of this hash, so any guessed encoding outvotes it here
            # -- confirm whether that is intended.
            $guessed_encodings{""} = -1; # for default best_encoding of ""
            foreach my $enc (keys %guessed_encodings) {
                if ($guessed_encodings{$enc} >
                    $guessed_encodings{$best_encoding}) {
                    $best_encoding = $enc;
                }
            }
        }

        if ($self->{'input_encoding'} ne 'auto') {
            if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
                gsprintf($outhandle,
                         "ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
                         $filename, $self->{'default_language'});
            }
            $best_language = $self->{'default_language'};
            if (!$found_html_encoding) {
                $best_encoding = $self->{'input_encoding'};
            }

        } else {
            if ($self->{'verbosity'}>2) {
                gsprintf($outhandle,
                         "ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
                         $filename, $self->{'default_language'});
            }
            $best_language = $self->{'default_language'};
        }
    } else { # <= 3 suggestions
        # candidates look like "language" or "language-encoding"
        my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
        if (!defined $language || !length($language)) {
            if ($self->{'verbosity'}>2) {
                gsprintf($outhandle,
                         "ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
                         $filename, $self->{'default_language'});
            }
            $language = $self->{'default_language'};
        }
        if (!defined $encoding) {
            if ($self->{'verbosity'}>2) {
                gsprintf($outhandle,
                         "ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n",
                         $filename, $self->{'default_encoding'});
            }
            $encoding = $self->{'default_encoding'};
        }
        $best_language = $language;
        if (! $best_encoding ) { # may already be set... eg from html meta tag
            $best_encoding = $encoding;
        }
    }

    if ($best_encoding =~ /^iso_8859/ && &unicode::check_is_utf8($text)) {
        # the text is valid utf8, so assume that's the real encoding
        # (since textcat is based on probabilities)
        $best_encoding = 'utf8';
    }

    # check for equivalents where textcat doesn't have some encodings...
    # eg MS versions of standard encodings
    if ($best_encoding =~ /^iso_8859_(\d+)/) {
        my $iso = $1; # which variant of the iso standard?
        # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
        if ($text =~ /[\x80-\x9f]/) {
            # Western Europe
            if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
            elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
            elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
            elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
            elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
            elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
            elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
        }
    }

    # anything that is neither handled directly nor listed in the
    # encodings table falls back to the configured default
    if ($best_encoding !~ /^(ascii|utf8|unicode)$/ &&
        !defined $encodings::encodings->{$best_encoding}) {
        if ($self->{'verbosity'}) {
            gsprintf($outhandle, "ReadTextFile: {ReadTextFile.unsupported_encoding}\n",
                     $filename, $best_encoding, $self->{'default_encoding'});
        }
        $best_encoding = $self->{'default_encoding'};
    }

    return ($best_language, $best_encoding);
}
480
481
482
483	# Overridden by exploding plugins (eg. ISISPlug)
# Hook with no work to do in this class; overridden by exploding plugins
# (eg. ISISPlug). Takes the plugin object as its only argument.
sub clean_up_after_exploding {
    my $self = shift;
}
488
489
490	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: