Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gsdl/trunk/perllib/plugins/ReadTextFile.pm@ 15880

Last change on this file since 15880 was 15868, checked in by kjdon, 16 years ago
plugin overhaul: BasPlug has been split into several base plugins: PrintInfo just does the printing for pluginfo.pl, and does the argument parsing in the constructor. All plugins and supporting extractors etc inherit directly or indirectly from this. AbstractPlugin adds a few methods to this, is used by Directory and ArchivesInf plugins. These are not really plugins so can we remove them? anyway, not sure if AbstractPlugin will live for very long. BasePlugin is a proper base plugin, has read and read_into_doc_obj methods. It does nothing with reading in the file or textcat stuff. Makes a basic doc obj and adds some metadata. It also handles all the blocking stuff, associate ext stuff etc. Binary plugins can implement the process method to do file specific stuff. AutoExtractMetadata inherits BasePlugin and adds automatic metadata extraction using hte new Extractor plugins. ReadTextFile is the equivalent in functionality to the old BasPlug - does lang and encoding extraction, and reading in the file. It inherits from AutoExtractMetadata. If your file type is binary and will have no text, then inherit from BasePlugin. If its binary but ends up with text (eg using convert_to) then inherit from AutoExtractMetadata. If your file is a text type file, then inherit from ReadTextFile.
Property svn:executable set to ``*
File size: 15.7 KB

Line
1	###########################################################################
2	#
3	# ReadTxtFile.pm -- base class for import plugins that have plain text files
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package ReadTextFile;
27
28	use strict;
29	no strict 'subs';
30	no strict 'refs'; # allow filehandles to be variables and viceversa
31
32
33	use multiread;
34	use encodings;
35	use unicode;
36	use cnseg;
37	use textcat;
38	use doc;
39	eval "require diagnostics"; # some perl distros (eg mac) don't have this
40	use ghtml;
41	use gsprintf 'gsprintf';
42
43	use AutoExtractMetadata;
44
45	sub BEGIN {
46	@ReadTextFile::ISA = ( 'AutoExtractMetadata' );
47	}
48
49
50	my $arguments =
51	[ { 'name' => "input_encoding",
52	'desc' => "{ReadTextFile.input_encoding}",
53	'type' => "enum",
54	'list' => $BasePlugin::encoding_plus_auto_list,
55	'reqd' => "no" ,
56	'deft' => "auto" } ,
57	{ 'name' => "default_encoding",
58	'desc' => "{ReadTextFile.default_encoding}",
59	'type' => "enum",
60	'list' => $BasePlugin::encoding_list,
61	'reqd' => "no",
62	'deft' => "utf8" },
63	{ 'name' => "extract_language",
64	'desc' => "{ReadTextFile.extract_language}",
65	'type' => "flag",
66	'reqd' => "no" },
67	{ 'name' => "default_language",
68	'desc' => "{ReadTextFile.default_language}",
69	'type' => "string",
70	'deft' => "en",
71	'reqd' => "no" },
72	{ 'name' => "first",
73	'desc' => "{ReadTextFile.first}",
74	'type' => "string",
75	'reqd' => "no" },
76	{ 'name' => "separate_cjk",
77	'desc' => "{ReadTextFile.separate_cjk}",
78	'type' => "flag",
79	'reqd' => "no",
80	'hiddengli' => "yes" },
81	];
82
83
84	my $options = { 'name' => "ReadTextFile",
85	'desc' => "{ReadTxtFile.desc}",
86	'abstract' => "yes",
87	'inherits' => "no",
88	'args' => $arguments };
89
90
91
92	sub new {
93	my $class = shift (@_);
94	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
95	push(@$pluginlist, $class);
96
97	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
98	push(@{$hashArgOptLists->{"OptList"}},$options);
99
100	my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists);
101
102	return bless $self, $class;
103
104	}
105
106
107
108	# The ReadTextFile read_into_doc_obj() function. This function does all the
109	# right things to make general options work for a given plugin. It reads in
110	# a file and sets up a slew of metadata all saved in doc_obj, which
111	# it then returns as part of a tuple (process_status,doc_obj)
112	#
113	# Much of this functionality used to reside in read, but it was broken
114	# down into a supporting routine to make the code more flexible.
115	#
116	# recursive plugins (e.g. RecPlug) and specialized plugins like those
117	# capable of processing many documents within a single file (e.g.
118	# GMLPlug) will normally want to implement their own version of
119	# read_into_doc_obj()
120	#
121	# Note that $base_dir might be "" and that $file might
122	# include directories
123	sub read_into_doc_obj {
124	my $self = shift (@_);
125	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
126
127	my $outhandle = $self->{'outhandle'};
128
129	# should we move this to read? What about secondary plugins?
130	print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
131	print $outhandle "$self->{'plugin_type'} processing $file\n"
132	if $self->{'verbosity'} > 1;
133
134	my ($filename_full_path, $filename_no_path) = $self->get_full_filenames($base_dir, $file);
135	# Do encoding stuff
136	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
137	if ($self->{'verbosity'} > 2) {
138	print $outhandle "ReadTextFile: reading $file as ($encoding,$language)\n";
139	}
140
141	# create a new document
142	my $doc_obj = new doc ($filename_full_path, "indexed_doc");
143	my $top_section = $doc_obj->get_top_section();
144
145	# this should look at the plugin option too...
146	$doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
147	$doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
148	$doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
149	$self->set_Source_metadata($doc_obj, $filename_no_path, $encoding);
150
151	$doc_obj->add_utf8_metadata($top_section, "Language", $language);
152	$doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
153
154	# read in file ($text will be in utf8)
155	my $text = "";
156	$self->read_file ($filename_full_path, $encoding, $language, \$text);
157
158	if (!length ($text)) {
159	if ($gli) {
160	print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
161	}
162	gsprintf($outhandle, "$self->{'plugin_type'}: {ReadTextFile.file_has_no_text}\n", $filename_full_path) if $self->{'verbosity'};
163
164	my $failhandle = $self->{'failhandle'};
165	gsprintf($failhandle, "$file: " . ref($self) . ": {ReadTextFile.empty_file}\n");
166	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
167	$self->{'num_not_processed'} ++;
168
169	return (0,undef); # what should we return here?? error but don't want to pass it on
170	}
171
172	# do plugin specific processing of doc_obj
173	unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
174	$text = '';
175	undef $text;
176	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
177	return (-1,undef);
178	}
179	$text='';
180	undef $text;
181
182	# include any metadata passed in from previous plugins
183	# note that this metadata is associated with the top level section
184	$self->add_associated_files($doc_obj, $filename_full_path);
185	$self->extra_metadata ($doc_obj, $top_section, $metadata);
186
187	# do any automatic metadata extraction
188	$self->auto_extract_metadata ($doc_obj);
189
190
191	# if we haven't found any Title so far, assign one
192	$self->title_fallback($doc_obj,$top_section,$filename_no_path);
193
194	$self->add_OID($doc_obj);
195
196	return (1,$doc_obj);
197	}
198
199	sub auto_extract_metadata {
200	my $self = shift(@_);
201	my ($doc_obj) = @_;
202
203	if ($self->{'first'}) {
204	my $thissection = $doc_obj->get_top_section();
205	while (defined $thissection) {
206	my $text = $doc_obj->get_text($thissection);
207	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
208	$thissection = $doc_obj->get_next_section ($thissection);
209	}
210	}
211	$self->SUPER::auto_extract_metadata($doc_obj);
212	}
213
214	# uses the multiread package to read in the entire file pointed to
215	# by filename and loads the resulting text into $$textref. Input text
216	# may be in any of the encodings handled by multiread, output text
217	# will be in utf8
218	sub read_file {
219	my $self = shift (@_);
220	my ($filename, $encoding, $language, $textref) = @_;
221
222	if (!-r $filename)
223	{
224	my $outhandle = $self->{'outhandle'};
225	gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
226	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
227	return;
228	}
229	$$textref = "";
230	if (!open (FILE, $filename)) {
231	gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
232	die "\n";
233	}
234
235	if ($encoding eq "ascii") {
236	undef $/;
237	$$textref = <FILE>;
238	$/ = "\n";
239	} else {
240	my $reader = new multiread();
241	$reader->set_handle ('ReadTextFile::FILE');
242	$reader->set_encoding ($encoding);
243	$reader->read_file ($textref);
244	#Now segments chinese if the separate_cjk option is set
245	if ($self->{'separate_cjk'}) {
246	# segment the Chinese words
247	$$textref = &cnseg::segment($$textref);
248	}
249	}
250	close FILE;
251	}
252
253
254	sub textcat_get_language_encoding {
255	my $self = shift (@_);
256	my ($filename) = @_;
257
258
259	my ($language, $encoding, $extracted_encoding);
260	if ($self->{'input_encoding'} eq "auto") {
261	# use textcat to automatically work out the input encoding and language
262	($language, $encoding) = $self->get_language_encoding ($filename);
263	} elsif ($self->{'extract_language'}) {
264	# use textcat to get language metadata
265	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
266	$encoding = $self->{'input_encoding'};
267	# don't print this message for english... english in utf8 is identical
268	# to english in iso-8859-1 (except for some punctuation). We don't have
269	# a language model for en_utf8, so textcat always says iso-8859-1!
270	if ($extracted_encoding ne $encoding && $language ne "en"
271	&& $self->{'verbosity'}) {
272	my $plugin_name = ref ($self);
273	my $outhandle = $self->{'outhandle'};
274	gsprintf($outhandle, "$plugin_name: {ReadTextFile.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
275	}
276	} else {
277	$language = $self->{'default_language'};
278	$encoding = $self->{'input_encoding'};
279	}
280
281	return ($language, $encoding);
282	}
283
284	# Uses textcat to work out the encoding and language of the text in
285	# $filename. All html tags are removed before processing.
286	# returns an array containing "language" and "encoding"
287	sub get_language_encoding {
288	my $self = shift (@_);
289	my ($filename) = @_;
290	my $outhandle = $self->{'outhandle'};
291	my $unicode_format = "";
292	my $best_language = "";
293	my $best_encoding = "";
294
295	# read in file
296	if (!open (FILE, $filename)) {
297	gsprintf(STDERR, "ReadTextFile::get_language_encoding {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
298	# this is a pretty bad error, but try to continue anyway
299	return ($self->{'default_language'}, $self->{'input_encoding'});
300	}
301	undef $/;
302	my $text = <FILE>;
303	$/ = "\n";
304	close FILE;
305
306	# check if first few bytes have a Byte Order Marker
307	my $bom=substr($text,0,2); # check 16bit unicode
308	if ($bom eq "\xff\xfe") { # little endian 16bit unicode
309	$unicode_format="unicode";
310	} elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
311	$unicode_format="unicode";
312	} else {
313	$bom=substr($text,0,3); # check utf-8
314	if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
315	$unicode_format="utf8";
316	# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
317	# $unicode_format="utf8";
318	}
319	}
320
321
322	# handle html files specially
323	# XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
324	if (ref($self) eq 'HTMLPlug' \|\|
325	(exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
326
327	# remove <title>stuff</title> -- as titles tend often to be in English
328	# for foreign language documents
329	$text =~ s!<title>.*?</title>!!si;
330
331	# see if this html file specifies its encoding
332	if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
333	$best_encoding = $1;
334	} elsif ($text =~ /<meta http-equiv.content-type.charset=(.+?)"/i) {#"
335	$best_encoding = $1;
336	}
337	if ($best_encoding) { # we extracted an encoding
338	$best_encoding =~ s/-+/_/g;
339	$best_encoding = lc($best_encoding); # lowercase
340	if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
341	$self->{'input_encoding'} = $best_encoding;
342	}
343
344	# remove all HTML tags
345	$text =~ s/<[^>]*>//sg;
346	}
347
348	# get the language/encoding
349	$self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
350	my $results = $self->{'textcat'}->classify(\$text);
351
352	# if textcat returns 3 or less possibilities we'll use the
353	# first one in the list - otherwise use the defaults
354	if (scalar @$results > 3) {
355	if ($unicode_format) { # in case the first had a BOM
356	$best_encoding=$unicode_format;
357	} else {
358	my %guessed_encodings = ();
359	foreach my $result (@$results) {
360	$result =~ /([^\-]+)$/;
361	my $enc=$1;
362	if (!defined($guessed_encodings{$enc})) {
363	$guessed_encodings{$enc}=0;
364	}
365	$guessed_encodings{$enc}++;
366	}
367
368	$guessed_encodings{""}=-1; # for default best_encoding of ""
369	foreach my $enc (keys %guessed_encodings) {
370	if ($guessed_encodings{$enc} >
371	$guessed_encodings{$best_encoding}){
372	$best_encoding=$enc;
373	}
374	}
375	}
376
377	if ($self->{'input_encoding'} ne 'auto') {
378	if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
379	gsprintf($outhandle,
380	"ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
381	$filename, $self->{'default_language'});
382	}
383	$best_language = $self->{'default_language'};
384	$best_encoding = $self->{'input_encoding'};
385
386	} else {
387	if ($self->{'verbosity'}>2) {
388	gsprintf($outhandle,
389	"ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
390	$filename, $self->{'default_language'});
391	}
392	$best_language = $self->{'default_language'};
393	}
394	} else { # <= 3 suggestions
395	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
396	if (!defined $language) {
397	if ($self->{'verbosity'}>2) {
398	gsprintf($outhandle,
399	"ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
400	$filename, $self->{'default_language'});
401	}
402	$language = $self->{'default_language'};
403	}
404	if (!defined $encoding) {
405	if ($self->{'verbosity'}>2) {
406	gsprintf($outhandle,
407	"ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n",
408	$filename, $self->{'default_encoding'});
409	}
410	$encoding = $self->{'default_encoding'};
411	}
412	$best_language = $language;
413	if (! $best_encoding ) { # may already be set... eg from html meta tag
414	$best_encoding = $encoding;
415	}
416	}
417
418	my $text_copy = $text;
419	if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) {
420	# the text is valid utf8, so assume that's the real encoding
421	# (since textcat is based on probabilities)
422	$best_encoding = 'utf8';
423	}
424
425	# check for equivalents where textcat doesn't have some encodings...
426	# eg MS versions of standard encodings
427	if ($best_encoding =~ /^iso_8859_(\d+)/) {
428	my $iso = $1; # which variant of the iso standard?
429	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
430	if ($text =~ /[\x80-\x9f]/) {
431	# Western Europe
432	if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
433	elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
434	elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
435	elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
436	elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
437	elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
438	elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
439	}
440	}
441
442	if ($best_encoding !~ /^(ascii\|utf8\|unicode)$/ &&
443	!defined $encodings::encodings->{$best_encoding}) {
444	if ($self->{'verbosity'}) {
445	gsprintf($outhandle, "ReadTextFile: {ReadTextFile.unsupported_encoding}\n",
446	$filename, $best_encoding, $self->{'default_encoding'});
447	}
448	$best_encoding = $self->{'default_encoding'};
449	}
450
451	return ($best_language, $best_encoding);
452	}
453
454
455
456	# FIRSTNNN: extract the first NNN characters as metadata
457	sub extract_first_NNNN_characters {
458	my $self = shift (@_);
459	my ($textref, $doc_obj, $thissection) = @_;
460
461	foreach my $size (split /,/, $self->{'first'}) {
462	my $tmptext = $$textref;
463	$tmptext =~ s/^\s+//;
464	$tmptext =~ s/\s+$//;
465	$tmptext =~ s/\s+/ /gs;
466	$tmptext = substr ($tmptext, 0, $size);
467	$tmptext =~ s/\s\S*$/…/;
468	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
469	}
470	}
471
472	# Overridden by exploding plugins (eg. ISISPlug)
473	sub clean_up_after_exploding
474	{
475	my $self = shift(@_);
476	}
477
478
479	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: