Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/ReadTextFile.pm@ 26963

Last change on this file since 26963 was 26963, checked in by jmt12, 11 years ago
Altered to use functions in util library when opening file handles to allow for files directly read from and written to HDFS (luckily HDFS supporting some piping)
Property svn:executable set to ``*
File size: 20.0 KB

Line
1	###########################################################################
2	#
3	# ReadTxtFile.pm -- base class for import plugins that have plain text files
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package ReadTextFile;
27
28	use strict;
29	no strict 'subs';
30	no strict 'refs'; # allow filehandles to be variables and viceversa
31
32	use Encode;
33
34	use multiread;
35	use encodings;
36	use unicode;
37	use textcat;
38	use doc;
39	use ghtml;
40	use gsprintf 'gsprintf';
41
42	use AutoExtractMetadata;
43
44	sub BEGIN {
45	@ReadTextFile::ISA = ( 'AutoExtractMetadata' );
46	}
47
48	my $encoding_plus_auto_list =
49	[ { 'name' => "auto",
50	'desc' => "{ReadTextFile.input_encoding.auto}" } ];
51	push(@{$encoding_plus_auto_list},@{$BasePlugin::encoding_list});
52
53	my $arguments =
54	[ { 'name' => "input_encoding",
55	'desc' => "{ReadTextFile.input_encoding}",
56	'type' => "enum",
57	'list' => $encoding_plus_auto_list,
58	'reqd' => "no" ,
59	'deft' => "auto" } ,
60	{ 'name' => "default_encoding",
61	'desc' => "{ReadTextFile.default_encoding}",
62	'type' => "enum",
63	'list' => $BasePlugin::encoding_list,
64	'reqd' => "no",
65	'deft' => "utf8" },
66	{ 'name' => "extract_language",
67	'desc' => "{ReadTextFile.extract_language}",
68	'type' => "flag",
69	'reqd' => "no" },
70	{ 'name' => "default_language",
71	'desc' => "{ReadTextFile.default_language}",
72	'type' => "string",
73	'deft' => "en",
74	'reqd' => "no" }
75	];
76
77
78	my $options = { 'name' => "ReadTextFile",
79	'desc' => "{ReadTextFile.desc}",
80	'abstract' => "yes",
81	'inherits' => "no",
82	'args' => $arguments };
83
84
85
86	sub new {
87	my $class = shift (@_);
88	my ($pluginlist,$inputargs,$hashArgOptLists, $auxiliary) = @_;
89	push(@$pluginlist, $class);
90
91	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
92	push(@{$hashArgOptLists->{"OptList"}},$options);
93
94	my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists, $auxiliary);
95
96	return bless $self, $class;
97
98	}
99
100
101
102	# The ReadTextFile read_into_doc_obj() function. This function does all the
103	# right things to make general options work for a given plugin. It reads in
104	# a file and sets up a slew of metadata all saved in doc_obj, which
105	# it then returns as part of a tuple (process_status,doc_obj)
106	#
107	# Much of this functionality used to reside in read, but it was broken
108	# down into a supporting routine to make the code more flexible.
109	#
110	# recursive plugins (e.g. RecPlug) and specialized plugins like those
111	# capable of processing many documents within a single file (e.g.
112	# GMLPlug) will normally want to implement their own version of
113	# read_into_doc_obj()
114	#
115	# Note that $base_dir might be "" and that $file might
116	# include directories
117	sub read_into_doc_obj {
118	my $self = shift (@_);
119	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
120
121	my $outhandle = $self->{'outhandle'};
122	# should we move this to read? What about secondary plugins?
123	print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
124	my $pp_file = &util::prettyprint_file($base_dir,$file,$gli);
125	print $outhandle "$self->{'plugin_type'} processing $pp_file\n"
126	if $self->{'verbosity'} > 1;
127
128	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
129
130	# Do encoding stuff
131	my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
132	if ($self->{'verbosity'} > 2) {
133	print $outhandle "ReadTextFile: reading $file as ($content_encoding,$language)\n";
134	}
135
136	# create a new document
137	my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
138	my $top_section = $doc_obj->get_top_section();
139
140	# this should look at the plugin option too...
141	$doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
142	$doc_obj->add_utf8_metadata($top_section, "FileSize", &util::file_size($filename_full_path));
143
144	my $plugin_filename_encoding = $self->{'filename_encoding'};
145	my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
146	$self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
147
148	$doc_obj->add_utf8_metadata($top_section, "Language", $language);
149	$doc_obj->add_utf8_metadata($top_section, "Encoding", $content_encoding);
150
151	# read in file ($text will be in utf8)
152	my $text = "";
153	$self->read_file ($filename_full_path, $content_encoding, $language, \$text);
154
155	if (!length ($text)) {
156	if ($gli) {
157	print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
158	}
159	gsprintf($outhandle, "$self->{'plugin_type'}: {ReadTextFile.file_has_no_text}\n", $filename_full_path) if $self->{'verbosity'};
160
161	my $failhandle = $self->{'failhandle'};
162	gsprintf($failhandle, "$file: " . ref($self) . ": {ReadTextFile.empty_file}\n");
163	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
164	$self->{'num_not_processed'} ++;
165
166	return (0,undef); # what should we return here?? error but don't want to pass it on
167	}
168
169	# do plugin specific processing of doc_obj
170	unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
171	$text = '';
172	undef $text;
173	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
174	return (-1,undef);
175	}
176	$text='';
177	undef $text;
178
179	# include any metadata passed in from previous plugins
180	# note that this metadata is associated with the top level section
181	$self->add_associated_files($doc_obj, $filename_full_path);
182	$self->extra_metadata ($doc_obj, $top_section, $metadata);
183
184	# do any automatic metadata extraction
185	$self->auto_extract_metadata ($doc_obj);
186
187
188	# if we haven't found any Title so far, assign one
189	$self->title_fallback($doc_obj,$top_section,$filename_no_path);
190
191	$self->add_OID($doc_obj);
192
193	return (1,$doc_obj);
194	}
195
196	# uses the multiread package to read in the entire file pointed to
197	# by filename and loads the resulting text into $$textref. Input text
198	# may be in any of the encodings handled by multiread, output text
199	# will be in utf8
200	sub read_file {
201	my $self = shift (@_);
202	my ($filename, $encoding, $language, $textref) = @_;
203
204	if (!&util::file_canread($filename))
205	{
206	my $outhandle = $self->{'outhandle'};
207	gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
208	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
209	return;
210	}
211
212	$$textref = "";
213	# use the util::file_* functions to provide some extra smarts to the open
214	# command in order to support protocols like http: or hdfs:
215	if (!open (FILE, &util::file_openfdcommand($filename, '<')))
216	{
217	gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
218	die "\n";
219	}
220
221	if ($encoding eq "ascii")
222	{
223	# fast file 'slurp' implementation
224	sysread(FILE, $$textref, &util::file_size($filename));
225	# the old slow way of reading in a file
226	#undef $/;
227	#$$textref = <FILE>;
228	#$/ = "\n";
229	}
230	else
231	{
232	my $reader = new multiread();
233	$reader->set_handle ('ReadTextFile::FILE');
234	$reader->set_encoding ($encoding);
235	$reader->read_file ($textref);
236	}
237
238	# At this point $$textref is a binary byte string
239	# => turn it into a Unicode aware string, so full
240	# Unicode aware pattern matching can be used.
241	# For instance: 's/\x{0101}//g' or '[[:upper:]]'
242	#
243	$$textref = decode("utf8", $$textref);
244
245	close FILE;
246	}
247
248
249	# Not currently used
250	sub UNUSED_read_file_usingPerlsEncodeModule {
251	##sub read_file {
252	my $self = shift (@_);
253	my ($filename, $encoding, $language, $textref) = @_;
254
255	if (!-r $filename)
256	{
257	my $outhandle = $self->{'outhandle'};
258	gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
259	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
260	return;
261	}
262	$$textref = "";
263	if (!open (FILE, $filename)) {
264	gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_f
265	or_reading} ($!)\n", $filename);
266	die "\n";
267	}
268
269	my $store_slash = $/;
270	undef $/;
271	my $text = <FILE>;
272	$/ = $store_slash;
273
274	$$textref = decode($encoding,$text);
275
276	close FILE;
277	}
278
279
280	sub read_file_no_decoding {
281	my $self = shift (@_);
282	my ($filename, $textref) = @_;
283
284	if (!-r $filename)
285	{
286	my $outhandle = $self->{'outhandle'};
287	gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
288	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
289	return;
290	}
291	$$textref = "";
292	if (!open (FILE, $filename)) {
293	gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
294	die "\n";
295	}
296
297	my $reader = new multiread();
298	$reader->set_handle ('ReadTextFile::FILE');
299	$reader->read_file_no_decoding ($textref);
300
301	$self->{'reader'} = $reader;
302
303	close FILE;
304	}
305
306
307	sub decode_text {
308	my $self = shift (@_);
309	my ($raw_text, $encoding, $language, $textref) = @_;
310
311	my $reader = $self->{'reader'};
312	if (!defined $reader) {
313	gsprintf(STDERR, "ReadTextFile::decode_text needs to call ReadTextFile::read_file_no_decoding first\n");
314	}
315	else {
316	$reader->set_encoding($encoding);
317	$reader->decode_text($raw_text,$textref);
318
319	# At this point $$textref is a binary byte string
320	# => turn it into a Unicode aware string, so full
321	# Unicode aware pattern matching can be used.
322	# For instance: 's/\x{0101}//g' or '[[:upper:]]'
323
324	$$textref = decode("utf8",$$textref);
325	}
326	}
327
328
329	sub textcat_get_language_encoding {
330	my $self = shift (@_);
331	my ($filename) = @_;
332
333	my ($language, $encoding, $extracted_encoding);
334	if ($self->{'input_encoding'} eq "auto") {
335	# use textcat to automatically work out the input encoding and language
336	($language, $encoding) = $self->get_language_encoding ($filename);
337	} elsif ($self->{'extract_language'}) {
338	# use textcat to get language metadata
339	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
340	$encoding = $self->{'input_encoding'};
341	# don't print this message for english... english in utf8 is identical
342	# to english in iso-8859-1 (except for some punctuation). We don't have
343	# a language model for en_utf8, so textcat always says iso-8859-1!
344	if ($extracted_encoding ne $encoding && $language ne "en" && $self->{'verbosity'}) {
345	my $plugin_name = ref ($self);
346	my $outhandle = $self->{'outhandle'};
347	gsprintf($outhandle, "$plugin_name: {ReadTextFile.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
348	}
349	} else {
350	$language = $self->{'default_language'};
351	$encoding = $self->{'input_encoding'};
352	}
353
354	# print STDERR "** language encoding of contents of file $filename:\n\t**$language $encoding\n";
355
356	return ($language, $encoding);
357	}
358
359
360	# Uses textcat to work out the encoding and language of the text in
361	# $filename. All html tags are removed before processing.
362	# returns an array containing "language" and "encoding"
363	sub get_language_encoding {
364	my $self = shift (@_);
365	my ($filename) = @_;
366	my $outhandle = $self->{'outhandle'};
367	my $unicode_format = "";
368	my $best_language = "";
369	my $best_encoding = "";
370
371
372	# read in file
373	if (!open (FILE, &util::file_openfdcommand($filename, '<'))) {
374	gsprintf(STDERR, "ReadTextFile::get_language_encoding {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
375	# this is a pretty bad error, but try to continue anyway
376	return ($self->{'default_language'}, $self->{'input_encoding'});
377	}
378	my $text;
379	sysread(FILE, $text, &util::file_size($filename));
380	#undef $/;
381	#my $text = <FILE>;
382	#$/ = "\n";
383	close FILE;
384
385	# check if first few bytes have a Byte Order Marker
386	my $bom=substr($text,0,2); # check 16bit unicode
387	if ($bom eq "\xff\xfe") { # little endian 16bit unicode
388	$unicode_format="unicode";
389	} elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
390	$unicode_format="unicode";
391	} else {
392	$bom=substr($text,0,3); # check utf-8
393	if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
394	$unicode_format="utf8";
395	# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
396	# $unicode_format="utf8";
397	}
398	}
399
400	my $found_html_encoding = 0;
401	# handle html files specially
402	# XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
403	if (ref($self) eq 'HTMLPlugin' \|\|
404	(exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
405
406	# remove comments in head, including multiline ones, so that we don't match on
407	# inactive tags (those that are nested inside comments)
408	my ($head) = ($text =~ m/<head>(.*)<\/head>/si);
409	$head = "" unless defined $head; # some files are not proper HTML eg php files
410	$head =~ s/<!--.*?-->//sg;
411
412	# remove <title>stuff</title> -- as titles tend often to be in English
413	# for foreign language documents
414	$text =~ s!<title>.*?</title>!!si;
415
416	# see if this html file specifies its encoding
417	if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
418	$best_encoding = $1;
419	}
420	# check the meta http-equiv charset tag
421	elsif ($head =~ m/<meta http-equiv.content-type.charset=(.+?)\"/si) {
422	$best_encoding = $1;
423	}
424	if ($best_encoding) { # we extracted an encoding
425	$best_encoding =~ s/-+/_/g;
426	$best_encoding = lc($best_encoding); # lowercase
427	if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
428	$found_html_encoding = 1;
429	# We shouldn't be modifying this here!!
430	#$self->{'input_encoding'} = $best_encoding;
431	}
432
433	# remove all HTML tags
434	$text =~ s/<[^>]*>//sg;
435	}
436
437	# don't need to do textcat if we know the encoding now AND don't need to extract language
438	if($found_html_encoding && !$self->{'extract_language'}) { # encoding specified in html file
439	$best_language = $self->{'default_language'};
440	}
441
442	else { # need to use textcat to get either the language, or get both language and encoding
443	$self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
444
445	if($found_html_encoding) { # know encoding, find language by limiting search to known encoding
446	my $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
447
448	my $language;
449	($language) = $results->[0] =~ m/^([^-])(?:-(?:.))?$/ if (scalar @$results > 0);
450
451	if (!defined $language \|\| scalar @$results > 3) {
452	# if there were too many results even when restricting results by encoding,
453	# or if there were no results, use default language with the known encoding
454	$best_language = $self->use_default_language($filename);
455	}
456	else { # fewer than 3 results means textcat is more certain, use the first result
457	$best_language = $language;
458	}
459	}
460	else { # don't know encoding or language yet, therefore we use textcat
461	my $results = $self->{'textcat'}->classify_contents(\$text, $filename);
462
463	# if textcat returns 3 or less possibilities we'll use the first one in the list
464	if (scalar @$results <= 3) { # results will be > 0 when we don't constrain textcat by an encoding
465	my ($language, $encoding) = $results->[0] =~ m/^([^-])(?:-(.))?$/;
466
467	$language = $self->use_default_language($filename) unless defined $language;
468	$encoding = $self->use_default_encoding($filename) unless defined $encoding;
469
470	$best_language = $language;
471	$best_encoding = $encoding;
472	}
473	else { # if (scalar @$results > 3) {
474	if ($unicode_format) { # in case the first had a BOM
475	$best_encoding=$unicode_format;
476	}
477	else {
478	# Find the most frequent encoding in the textcat results returned
479	# Returns "" if there's no encoding more frequent than another
480	$best_encoding = $self->{'textcat'}->most_frequent_encoding($results);
481	}
482
483	if ($best_encoding eq "") { # encoding still not set, use defaults
484	$best_language = $self->use_default_language($filename);
485	$best_encoding = $self->use_default_encoding($filename);
486	}
487	elsif (!$self->{'extract_language'}) { # know encoding but don't need to discover language
488	$best_language = $self->use_default_language($filename);
489	}
490	else { # textcat again using the most frequent encoding or the $unicode_format set above
491	$results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
492	my $language;
493	($language) = $results->[0] =~ m/^([^-])(?:-(.))?$/ if (scalar @$results > 0);
494	if (!defined $language \|\| scalar @$results > 3) {
495	# if no result or too many results, use default language for the encoding previously found
496	$best_language = $self->use_default_language($filename);
497	}
498	else { # fewer than 3 results, use the language of the first result
499	$best_language = $language;
500	}
501	}
502	}
503	}
504	}
505
506	if($best_encoding eq "" \|\| $best_language eq "") {
507	print STDERR "****Shouldn't happen: encoding and/or language still not set. Using defaults.\n";
508	$best_encoding = $self->use_default_encoding($filename) if $best_encoding eq "";
509	$best_language = $self->use_default_language($filename) if $best_language eq "";
510	}
511	# print STDERR "****Content language: $best_language; Encoding: $best_encoding.\n";
512
513
514	if ($best_encoding =~ /^iso_8859/ && &unicode::check_is_utf8($text)) {
515	# the text is valid utf8, so assume that's the real encoding
516	# (since textcat is based on probabilities)
517	$best_encoding = 'utf8';
518	}
519
520	# check for equivalents where textcat doesn't have some encodings...
521	# eg MS versions of standard encodings
522	if ($best_encoding =~ /^iso_8859_(\d+)/) {
523	my $iso = $1; # which variant of the iso standard?
524	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
525	if ($text =~ /[\x80-\x9f]/) {
526	# Western Europe
527	if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
528	elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
529	elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
530	elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
531	elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
532	elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
533	elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
534	}
535	}
536
537	if ($best_encoding !~ /^(ascii\|utf8\|unicode)$/ &&
538	!defined $encodings::encodings->{$best_encoding}) {
539	if ($self->{'verbosity'}) {
540	gsprintf($outhandle, "ReadTextFile: {ReadTextFile.unsupported_encoding}\n",
541	$filename, $best_encoding, $self->{'default_encoding'});
542	}
543	$best_encoding = $self->{'default_encoding'};
544	}
545
546	return ($best_language, $best_encoding);
547	}
548
549
550	sub use_default_language {
551	my $self = shift (@_);
552	my ($filename) = @_;
553
554	if ($self->{'verbosity'}>2) {
555	gsprintf($self->{'outhandle'},
556	"ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
557	$filename, $self->{'default_language'});
558	}
559	return $self->{'default_language'};
560	}
561
562	sub use_default_encoding {
563	my $self = shift (@_);
564	my ($filename) = @_;
565
566	if ($self->{'verbosity'}>2) {
567	gsprintf($self->{'outhandle'},
568	"ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n",
569	$filename, $self->{'default_encoding'});
570	}
571	return $self->{'default_encoding'};
572	}
573
574	# Overridden by exploding plugins (eg. ISISPlug)
575	sub clean_up_after_exploding
576	{
577	my $self = shift(@_);
578	}
579
580
581	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: