Context Navigation

source: gsdl/trunk/perllib/plugins/ReadTextFile.pm@ 18320

Last change on this file since 18320 was 18320, checked in by ak19, 15 years ago
Now plugins provide the option of base64 encoding or url encoding filenames that are to be renamed (when copied into the archives dir). Previously renamed files would always be url-encoded. URL-encoding is the default now for most plugins except MP3Plugin and OggVorbisPlugin, where the default is base64 encoding. Base64 encoding filenames upon renaming them was introduced so that more files that browsers try to open in external applications can open them, since url encoding does not seem to be implemented the same everywhere (for instance, windows media player is unable to handle url-encoded wmv filenames when such files are launched in it through the browser).
Property svn:executable set to ``*
File size: 17.9 KB

Line
1	###########################################################################
2	#
3	# ReadTxtFile.pm -- base class for import plugins that have plain text files
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package ReadTextFile;
27
28	use strict;
29	no strict 'subs';
30	no strict 'refs'; # allow filehandles to be variables and viceversa
31
32
33	use multiread;
34	use encodings;
35	use unicode;
36	use textcat;
37	use doc;
38	use ghtml;
39	use gsprintf 'gsprintf';
40
41	use AutoExtractMetadata;
42
43	sub BEGIN {
44	@ReadTextFile::ISA = ( 'AutoExtractMetadata' );
45	}
46
47	my $encoding_plus_auto_list =
48	[ { 'name' => "auto",
49	'desc' => "{ReadTextFile.input_encoding.auto}" } ];
50	push(@{$encoding_plus_auto_list},@{$BasePlugin::encoding_list});
51
52	my $arguments =
53	[ { 'name' => "input_encoding",
54	'desc' => "{ReadTextFile.input_encoding}",
55	'type' => "enum",
56	'list' => $encoding_plus_auto_list,
57	'reqd' => "no" ,
58	'deft' => "auto" } ,
59	{ 'name' => "default_encoding",
60	'desc' => "{ReadTextFile.default_encoding}",
61	'type' => "enum",
62	'list' => $BasePlugin::encoding_list,
63	'reqd' => "no",
64	'deft' => "utf8" },
65	{ 'name' => "extract_language",
66	'desc' => "{ReadTextFile.extract_language}",
67	'type' => "flag",
68	'reqd' => "no" },
69	{ 'name' => "default_language",
70	'desc' => "{ReadTextFile.default_language}",
71	'type' => "string",
72	'deft' => "en",
73	'reqd' => "no" }
74	];
75
76
77	my $options = { 'name' => "ReadTextFile",
78	'desc' => "{ReadTextFile.desc}",
79	'abstract' => "yes",
80	'inherits' => "no",
81	'args' => $arguments };
82
83
84
85	sub new {
86	my $class = shift (@_);
87	my ($pluginlist,$inputargs,$hashArgOptLists, $auxiliary) = @_;
88	push(@$pluginlist, $class);
89
90	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
91	push(@{$hashArgOptLists->{"OptList"}},$options);
92
93	my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists, $auxiliary);
94
95	return bless $self, $class;
96
97	}
98
99
100
101	# The ReadTextFile read_into_doc_obj() function. This function does all the
102	# right things to make general options work for a given plugin. It reads in
103	# a file and sets up a slew of metadata all saved in doc_obj, which
104	# it then returns as part of a tuple (process_status,doc_obj)
105	#
106	# Much of this functionality used to reside in read, but it was broken
107	# down into a supporting routine to make the code more flexible.
108	#
109	# recursive plugins (e.g. RecPlug) and specialized plugins like those
110	# capable of processing many documents within a single file (e.g.
111	# GMLPlug) will normally want to implement their own version of
112	# read_into_doc_obj()
113	#
114	# Note that $base_dir might be "" and that $file might
115	# include directories
116	sub read_into_doc_obj {
117	my $self = shift (@_);
118	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
119
120	my $outhandle = $self->{'outhandle'};
121	# should we move this to read? What about secondary plugins?
122	print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
123	print $outhandle "$self->{'plugin_type'} processing $file\n"
124	if $self->{'verbosity'} > 1;
125
126	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
127
128	# Do encoding stuff
129	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
130	if ($self->{'verbosity'} > 2) {
131	print $outhandle "ReadTextFile: reading $file as ($encoding,$language)\n";
132	}
133
134	# create a new document
135	my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
136	my $top_section = $doc_obj->get_top_section();
137
138	# this should look at the plugin option too...
139	$doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
140	$doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
141	$self->set_Source_metadata($doc_obj, $filename_no_path, $encoding);
142
143	$doc_obj->add_utf8_metadata($top_section, "Language", $language);
144	$doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
145
146	# read in file ($text will be in utf8)
147	my $text = "";
148	$self->read_file ($filename_full_path, $encoding, $language, \$text);
149
150	if (!length ($text)) {
151	if ($gli) {
152	print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
153	}
154	gsprintf($outhandle, "$self->{'plugin_type'}: {ReadTextFile.file_has_no_text}\n", $filename_full_path) if $self->{'verbosity'};
155
156	my $failhandle = $self->{'failhandle'};
157	gsprintf($failhandle, "$file: " . ref($self) . ": {ReadTextFile.empty_file}\n");
158	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
159	$self->{'num_not_processed'} ++;
160
161	return (0,undef); # what should we return here?? error but don't want to pass it on
162	}
163
164	# do plugin specific processing of doc_obj
165	unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
166	$text = '';
167	undef $text;
168	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
169	return (-1,undef);
170	}
171	$text='';
172	undef $text;
173
174	# include any metadata passed in from previous plugins
175	# note that this metadata is associated with the top level section
176	$self->add_associated_files($doc_obj, $filename_full_path);
177	$self->extra_metadata ($doc_obj, $top_section, $metadata);
178
179	# do any automatic metadata extraction
180	$self->auto_extract_metadata ($doc_obj);
181
182
183	# if we haven't found any Title so far, assign one
184	$self->title_fallback($doc_obj,$top_section,$filename_no_path);
185
186	$self->add_OID($doc_obj);
187
188	return (1,$doc_obj);
189	}
190
191	# uses the multiread package to read in the entire file pointed to
192	# by filename and loads the resulting text into $$textref. Input text
193	# may be in any of the encodings handled by multiread, output text
194	# will be in utf8
195	sub read_file {
196	my $self = shift (@_);
197	my ($filename, $encoding, $language, $textref) = @_;
198
199	if (!-r $filename)
200	{
201	my $outhandle = $self->{'outhandle'};
202	gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
203	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
204	return;
205	}
206	$$textref = "";
207	if (!open (FILE, $filename)) {
208	gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
209	die "\n";
210	}
211
212	if ($encoding eq "ascii") {
213	undef $/;
214	$$textref = <FILE>;
215	$/ = "\n";
216	} else {
217	my $reader = new multiread();
218	$reader->set_handle ('ReadTextFile::FILE');
219	$reader->set_encoding ($encoding);
220	$reader->read_file ($textref);
221	}
222	close FILE;
223	}
224
225
226	sub read_file_no_decoding {
227	my $self = shift (@_);
228	my ($filename, $textref) = @_;
229
230	if (!-r $filename)
231	{
232	my $outhandle = $self->{'outhandle'};
233	gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
234	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
235	return;
236	}
237	$$textref = "";
238	if (!open (FILE, $filename)) {
239	gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
240	die "\n";
241	}
242
243	my $reader = new multiread();
244	$reader->set_handle ('ReadTextFile::FILE');
245	$reader->read_file_no_decoding ($textref);
246
247	$self->{'reader'} = $reader;
248
249	close FILE;
250	}
251
252
253	sub decode_text {
254	my $self = shift (@_);
255	my ($raw_text, $encoding, $language, $textref) = @_;
256
257	my $reader = $self->{'reader'};
258	if (!defined $reader) {
259	gsprintf(STDERR, "ReadTextFile::decode_text needs to call ReadTextFile::read_file_no_decoding first\n");
260	}
261	else {
262	$reader->set_encoding($encoding);
263	$reader->decode_text($raw_text,$textref);
264	}
265	}
266
267
268	sub textcat_get_language_encoding {
269	my $self = shift (@_);
270	my ($filename) = @_;
271
272	my ($language, $encoding, $extracted_encoding);
273	if ($self->{'input_encoding'} eq "auto") {
274	# use textcat to automatically work out the input encoding and language
275	($language, $encoding) = $self->get_language_encoding ($filename);
276	} elsif ($self->{'extract_language'}) {
277	# use textcat to get language metadata
278	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
279	$encoding = $self->{'input_encoding'};
280	# don't print this message for english... english in utf8 is identical
281	# to english in iso-8859-1 (except for some punctuation). We don't have
282	# a language model for en_utf8, so textcat always says iso-8859-1!
283	if ($extracted_encoding ne $encoding && $language ne "en" && $self->{'verbosity'}) {
284	my $plugin_name = ref ($self);
285	my $outhandle = $self->{'outhandle'};
286	gsprintf($outhandle, "$plugin_name: {ReadTextFile.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
287	}
288	} else {
289	$language = $self->{'default_language'};
290	$encoding = $self->{'input_encoding'};
291	}
292
293	# print STDERR "** language encoding of contents of file $filename:\n\t**$language $encoding\n";
294
295	return ($language, $encoding);
296	}
297
298
299	# Uses textcat to work out the encoding and language of the text in
300	# $filename. All html tags are removed before processing.
301	# returns an array containing "language" and "encoding"
302	sub get_language_encoding {
303	my $self = shift (@_);
304	my ($filename) = @_;
305	my $outhandle = $self->{'outhandle'};
306	my $unicode_format = "";
307	my $best_language = "";
308	my $best_encoding = "";
309
310
311	# read in file
312	if (!open (FILE, $filename)) {
313	gsprintf(STDERR, "ReadTextFile::get_language_encoding {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
314	# this is a pretty bad error, but try to continue anyway
315	return ($self->{'default_language'}, $self->{'input_encoding'});
316	}
317	undef $/;
318	my $text = <FILE>;
319	$/ = "\n";
320	close FILE;
321
322	# check if first few bytes have a Byte Order Marker
323	my $bom=substr($text,0,2); # check 16bit unicode
324	if ($bom eq "\xff\xfe") { # little endian 16bit unicode
325	$unicode_format="unicode";
326	} elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
327	$unicode_format="unicode";
328	} else {
329	$bom=substr($text,0,3); # check utf-8
330	if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
331	$unicode_format="utf8";
332	# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
333	# $unicode_format="utf8";
334	}
335	}
336
337	my $found_html_encoding = 0;
338	# handle html files specially
339	# XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
340	if (ref($self) eq 'HTMLPlugin' \|\|
341	(exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
342
343	# remove comments in head, including multiline ones, so that we don't match on
344	# inactive tags (those that are nested inside comments)
345	my ($head) = ($text =~ m/<head>(.*)<\/head>/si);
346	$head = "" unless defined $head; # some files are not proper HTML eg php files
347	$head =~ s/<!--.*?-->//sg;
348
349	# remove <title>stuff</title> -- as titles tend often to be in English
350	# for foreign language documents
351	$text =~ s!<title>.*?</title>!!si;
352
353	# see if this html file specifies its encoding
354	if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
355	$best_encoding = $1;
356	}
357	# check the meta http-equiv charset tag
358	elsif ($head =~ m/<meta http-equiv.content-type.charset=(.+?)\"/si) {
359	$best_encoding = $1;
360	}
361	if ($best_encoding) { # we extracted an encoding
362	$best_encoding =~ s/-+/_/g;
363	$best_encoding = lc($best_encoding); # lowercase
364	if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
365	$found_html_encoding = 1;
366	# We shouldn't be modifying this here!!
367	#$self->{'input_encoding'} = $best_encoding;
368	}
369
370	# remove all HTML tags
371	$text =~ s/<[^>]*>//sg;
372	}
373
374	# don't need to do textcat if we know the encoding now AND don't need to extract language
375	if($found_html_encoding && !$self->{'extract_language'}) { # encoding specified in html file
376	$best_language = $self->{'default_language'};
377	}
378
379	else { # need to use textcat to get either the language, or get both language and encoding
380	$self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
381
382	if($found_html_encoding) { # know encoding, find language by limiting search to known encoding
383	my $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
384
385	my $language;
386	($language) = $results->[0] =~ m/^([^-])(?:-(?:.))?$/ if (scalar @$results > 0);
387
388	if (!defined $language \|\| scalar @$results > 3) {
389	# if there were too many results even when restricting results by encoding,
390	# or if there were no results, use default language with the known encoding
391	$best_language = $self->use_default_language($filename);
392	}
393	else { # fewer than 3 results means textcat is more certain, use the first result
394	$best_language = $language;
395	}
396	}
397	else { # don't know encoding or language yet, therefore we use textcat
398	my $results = $self->{'textcat'}->classify_contents(\$text, $filename);
399
400	# if textcat returns 3 or less possibilities we'll use the first one in the list
401	if (scalar @$results <= 3) { # results will be > 0 when we don't constrain textcat by an encoding
402	my ($language, $encoding) = $results->[0] =~ m/^([^-])(?:-(.))?$/;
403
404	$language = $self->use_default_language($filename) unless defined $language;
405	$encoding = $self->use_default_encoding($filename) unless defined $encoding;
406
407	$best_language = $language;
408	$best_encoding = $encoding;
409	}
410	else { # if (scalar @$results > 3) {
411	if ($unicode_format) { # in case the first had a BOM
412	$best_encoding=$unicode_format;
413	}
414	else {
415	# Find the most frequent encoding in the textcat results returned
416	# Returns "" if there's no encoding more frequent than another
417	$best_encoding = $self->{'textcat'}->most_frequent_encoding($results);
418	}
419
420	if ($best_encoding eq "") { # encoding still not set, use defaults
421	$best_language = $self->use_default_language($filename);
422	$best_encoding = $self->use_default_encoding($filename);
423	}
424	elsif (!$self->{'extract_language'}) { # know encoding but don't need to discover language
425	$best_language = $self->use_default_language($filename);
426	}
427	else { # textcat again using the most frequent encoding or the $unicode_format set above
428	$results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
429	my $language;
430	($language) = $results->[0] =~ m/^([^-])(?:-(.))?$/ if (scalar @$results > 0);
431	if (!defined $language \|\| scalar @$results > 3) {
432	# if no result or too many results, use default language for the encoding previously found
433	$best_language = $self->use_default_language($filename);
434	}
435	else { # fewer than 3 results, use the language of the first result
436	$best_language = $language;
437	}
438	}
439	}
440	}
441	}
442
443	if($best_encoding eq "" \|\| $best_language eq "") {
444	print STDERR "****Shouldn't happen: encoding and/or language still not set. Using defaults.\n";
445	$best_encoding = $self->use_default_encoding($filename) if $best_encoding eq "";
446	$best_language = $self->use_default_language($filename) if $best_language eq "";
447	}
448	# print STDERR "****Content language: $best_language; Encoding: $best_encoding.\n";
449
450
451	if ($best_encoding =~ /^iso_8859/ && &unicode::check_is_utf8($text)) {
452	# the text is valid utf8, so assume that's the real encoding
453	# (since textcat is based on probabilities)
454	$best_encoding = 'utf8';
455	}
456
457	# check for equivalents where textcat doesn't have some encodings...
458	# eg MS versions of standard encodings
459	if ($best_encoding =~ /^iso_8859_(\d+)/) {
460	my $iso = $1; # which variant of the iso standard?
461	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
462	if ($text =~ /[\x80-\x9f]/) {
463	# Western Europe
464	if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
465	elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
466	elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
467	elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
468	elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
469	elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
470	elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
471	}
472	}
473
474	if ($best_encoding !~ /^(ascii\|utf8\|unicode)$/ &&
475	!defined $encodings::encodings->{$best_encoding}) {
476	if ($self->{'verbosity'}) {
477	gsprintf($outhandle, "ReadTextFile: {ReadTextFile.unsupported_encoding}\n",
478	$filename, $best_encoding, $self->{'default_encoding'});
479	}
480	$best_encoding = $self->{'default_encoding'};
481	}
482
483	return ($best_language, $best_encoding);
484	}
485
486
487	sub use_default_language {
488	my $self = shift (@_);
489	my ($filename) = @_;
490
491	if ($self->{'verbosity'}>2) {
492	gsprintf($self->{'outhandle'},
493	"ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
494	$filename, $self->{'default_language'});
495	}
496	return $self->{'default_language'};
497	}
498
499	sub use_default_encoding {
500	my $self = shift (@_);
501	my ($filename) = @_;
502
503	if ($self->{'verbosity'}>2) {
504	gsprintf($self->{'outhandle'},
505	"ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n",
506	$filename, $self->{'default_encoding'});
507	}
508	return $self->{'default_encoding'};
509	}
510
511	# Overridden by exploding plugins (eg. ISISPlug)
512	sub clean_up_after_exploding
513	{
514	my $self = shift(@_);
515	}
516
517
518	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: