Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm@ 22844

Last change on this file since 22844 was 22844, checked in by davidb, 14 years ago
More explicit use of utf8 for input and output file handling. Relies on strings in Perl being Unicode aware (and not merely binary bytes) otherwise binary bytes will then be incorrectly re-incoded as UTF-8 (which is not what you want as they already are in UTF-8 form). Decoding the text read in at this point to UTF-8 means that (subsequent) regular expression processing of the data can make us Unicode aware operations, such as what Unicode classes as punctuation
Property svn:executable set to ``*
File size: 18.9 KB

Rev	Line
[15868]	1	###########################################################################
	2	#
	3	# ReadTxtFile.pm -- base class for import plugins that have plain text files
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	package ReadTextFile;
	27
	28	use strict;
	29	no strict 'subs';
	30	no strict 'refs'; # allow filehandles to be variables and viceversa
	31
[22844]	32	use Encode;
[15868]	33
	34	use multiread;
	35	use encodings;
	36	use unicode;
	37	use textcat;
	38	use doc;
	39	use ghtml;
	40	use gsprintf 'gsprintf';
	41
	42	use AutoExtractMetadata;
	43
	44	sub BEGIN {
	45	@ReadTextFile::ISA = ( 'AutoExtractMetadata' );
	46	}
	47
[16014]	48	my $encoding_plus_auto_list =
	49	[ { 'name' => "auto",
	50	'desc' => "{ReadTextFile.input_encoding.auto}" } ];
	51	push(@{$encoding_plus_auto_list},@{$BasePlugin::encoding_list});
[15868]	52
	53	my $arguments =
	54	[ { 'name' => "input_encoding",
	55	'desc' => "{ReadTextFile.input_encoding}",
	56	'type' => "enum",
[16014]	57	'list' => $encoding_plus_auto_list,
[15868]	58	'reqd' => "no" ,
	59	'deft' => "auto" } ,
	60	{ 'name' => "default_encoding",
	61	'desc' => "{ReadTextFile.default_encoding}",
	62	'type' => "enum",
	63	'list' => $BasePlugin::encoding_list,
	64	'reqd' => "no",
	65	'deft' => "utf8" },
	66	{ 'name' => "extract_language",
	67	'desc' => "{ReadTextFile.extract_language}",
	68	'type' => "flag",
	69	'reqd' => "no" },
	70	{ 'name' => "default_language",
	71	'desc' => "{ReadTextFile.default_language}",
	72	'type' => "string",
	73	'deft' => "en",
[16642]	74	'reqd' => "no" }
	75	];
[15868]	76
	77
	78	my $options = { 'name' => "ReadTextFile",
[16014]	79	'desc' => "{ReadTextFile.desc}",
[15868]	80	'abstract' => "yes",
	81	'inherits' => "no",
	82	'args' => $arguments };
	83
	84
	85
	86	sub new {
	87	my $class = shift (@_);
[16699]	88	my ($pluginlist,$inputargs,$hashArgOptLists, $auxiliary) = @_;
[15868]	89	push(@$pluginlist, $class);
	90
	91	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	92	push(@{$hashArgOptLists->{"OptList"}},$options);
	93
[16699]	94	my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists, $auxiliary);
[15868]	95
	96	return bless $self, $class;
	97
	98	}
	99
	100
	101
	102	# The ReadTextFile read_into_doc_obj() function. This function does all the
	103	# right things to make general options work for a given plugin. It reads in
	104	# a file and sets up a slew of metadata all saved in doc_obj, which
	105	# it then returns as part of a tuple (process_status,doc_obj)
	106	#
	107	# Much of this functionality used to reside in read, but it was broken
	108	# down into a supporting routine to make the code more flexible.
	109	#
	110	# recursive plugins (e.g. RecPlug) and specialized plugins like those
	111	# capable of processing many documents within a single file (e.g.
	112	# GMLPlug) will normally want to implement their own version of
	113	# read_into_doc_obj()
	114	#
	115	# Note that $base_dir might be "" and that $file might
	116	# include directories
	117	sub read_into_doc_obj {
	118	my $self = shift (@_);
[16392]	119	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[15868]	120
	121	my $outhandle = $self->{'outhandle'};
	122	# should we move this to read? What about secondary plugins?
	123	print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
	124	print $outhandle "$self->{'plugin_type'} processing $file\n"
	125	if $self->{'verbosity'} > 1;
	126
[16392]	127	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
	128
[15868]	129	# Do encoding stuff
	130	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path);
	131	if ($self->{'verbosity'} > 2) {
	132	print $outhandle "ReadTextFile: reading $file as ($encoding,$language)\n";
	133	}
	134
	135	# create a new document
[18320]	136	my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
[15868]	137	my $top_section = $doc_obj->get_top_section();
	138
	139	# this should look at the plugin option too...
	140	$doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
[16724]	141	$doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
[15868]	142	$self->set_Source_metadata($doc_obj, $filename_no_path, $encoding);
	143
	144	$doc_obj->add_utf8_metadata($top_section, "Language", $language);
	145	$doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
	146
	147	# read in file ($text will be in utf8)
	148	my $text = "";
	149	$self->read_file ($filename_full_path, $encoding, $language, \$text);
	150
	151	if (!length ($text)) {
	152	if ($gli) {
	153	print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
	154	}
	155	gsprintf($outhandle, "$self->{'plugin_type'}: {ReadTextFile.file_has_no_text}\n", $filename_full_path) if $self->{'verbosity'};
	156
	157	my $failhandle = $self->{'failhandle'};
	158	gsprintf($failhandle, "$file: " . ref($self) . ": {ReadTextFile.empty_file}\n");
	159	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
	160	$self->{'num_not_processed'} ++;
	161
	162	return (0,undef); # what should we return here?? error but don't want to pass it on
	163	}
	164
	165	# do plugin specific processing of doc_obj
	166	unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
	167	$text = '';
	168	undef $text;
	169	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
	170	return (-1,undef);
	171	}
	172	$text='';
	173	undef $text;
	174
	175	# include any metadata passed in from previous plugins
	176	# note that this metadata is associated with the top level section
	177	$self->add_associated_files($doc_obj, $filename_full_path);
	178	$self->extra_metadata ($doc_obj, $top_section, $metadata);
	179
	180	# do any automatic metadata extraction
	181	$self->auto_extract_metadata ($doc_obj);
	182
	183
	184	# if we haven't found any Title so far, assign one
	185	$self->title_fallback($doc_obj,$top_section,$filename_no_path);
	186
	187	$self->add_OID($doc_obj);
	188
	189	return (1,$doc_obj);
	190	}
	191
	192	# uses the multiread package to read in the entire file pointed to
	193	# by filename and loads the resulting text into $$textref. Input text
	194	# may be in any of the encodings handled by multiread, output text
	195	# will be in utf8
	196	sub read_file {
	197	my $self = shift (@_);
	198	my ($filename, $encoding, $language, $textref) = @_;
	199
	200	if (!-r $filename)
	201	{
	202	my $outhandle = $self->{'outhandle'};
	203	gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
	204	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
	205	return;
	206	}
	207	$$textref = "";
	208	if (!open (FILE, $filename)) {
	209	gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
	210	die "\n";
	211	}
	212
	213	if ($encoding eq "ascii") {
	214	undef $/;
	215	$$textref = <FILE>;
	216	$/ = "\n";
	217	} else {
	218	my $reader = new multiread();
	219	$reader->set_handle ('ReadTextFile::FILE');
	220	$reader->set_encoding ($encoding);
	221	$reader->read_file ($textref);
	222	}
[22844]	223
	224	# At this point $$testref is a binary byte string
	225	# => turn it into a Unicode aware string, so full
	226	# Unicode aware pattern matching can be used.
	227	# For instance: 's/\x{0101}//g' or '[[:upper:]]'
	228	#
	229
	230	$$textref = decode("utf8",$$textref);
	231
[15868]	232	close FILE;
	233	}
	234
	235
[22844]	236	# Not currently used
	237	sub read_file_usingPerlsEncodeModule {
	238	my $self = shift (@_);
	239	my ($filename, $encoding, $language, $textref) = @_;
	240
	241	if (!-r $filename)
	242	{
	243	my $outhandle = $self->{'outhandle'};
	244	gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
	245	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
	246	return;
	247	}
	248	$$textref = "";
	249	if (!open (FILE, $filename)) {
	250	gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_f
	251	or_reading} ($!)\n", $filename);
	252	die "\n";
	253	}
	254
	255	my $store_slash = $/;
	256	undef $/;
	257	my $text = <FILE>;
	258	$/ = $store_slash;
	259
	260	$$textref = decode($encoding,$text);
	261
	262	close FILE;
	263	}
	264
	265
[16724]	266	sub read_file_no_decoding {
	267	my $self = shift (@_);
	268	my ($filename, $textref) = @_;
	269
	270	if (!-r $filename)
	271	{
	272	my $outhandle = $self->{'outhandle'};
	273	gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
	274	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
	275	return;
	276	}
	277	$$textref = "";
	278	if (!open (FILE, $filename)) {
	279	gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
	280	die "\n";
	281	}
	282
	283	my $reader = new multiread();
	284	$reader->set_handle ('ReadTextFile::FILE');
	285	$reader->read_file_no_decoding ($textref);
	286
	287	$self->{'reader'} = $reader;
	288
	289	close FILE;
	290	}
	291
	292
	293	sub decode_text {
	294	my $self = shift (@_);
	295	my ($raw_text, $encoding, $language, $textref) = @_;
	296
	297	my $reader = $self->{'reader'};
	298	if (!defined $reader) {
	299	gsprintf(STDERR, "ReadTextFile::decode_text needs to call ReadTextFile::read_file_no_decoding first\n");
	300	}
	301	else {
	302	$reader->set_encoding($encoding);
	303	$reader->decode_text($raw_text,$textref);
	304	}
	305	}
	306
	307
[15868]	308	sub textcat_get_language_encoding {
	309	my $self = shift (@_);
	310	my ($filename) = @_;
	311
	312	my ($language, $encoding, $extracted_encoding);
	313	if ($self->{'input_encoding'} eq "auto") {
	314	# use textcat to automatically work out the input encoding and language
	315	($language, $encoding) = $self->get_language_encoding ($filename);
	316	} elsif ($self->{'extract_language'}) {
[16724]	317	# use textcat to get language metadata
[15868]	318	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
	319	$encoding = $self->{'input_encoding'};
	320	# don't print this message for english... english in utf8 is identical
	321	# to english in iso-8859-1 (except for some punctuation). We don't have
	322	# a language model for en_utf8, so textcat always says iso-8859-1!
[16724]	323	if ($extracted_encoding ne $encoding && $language ne "en" && $self->{'verbosity'}) {
[15868]	324	my $plugin_name = ref ($self);
	325	my $outhandle = $self->{'outhandle'};
	326	gsprintf($outhandle, "$plugin_name: {ReadTextFile.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
	327	}
	328	} else {
	329	$language = $self->{'default_language'};
[16724]	330	$encoding = $self->{'input_encoding'};
[15868]	331	}
[16724]	332
	333	# print STDERR "** language encoding of contents of file $filename:\n\t**$language $encoding\n";
[15868]	334
	335	return ($language, $encoding);
	336	}
	337
[17213]	338
[15868]	339	# Uses textcat to work out the encoding and language of the text in
	340	# $filename. All html tags are removed before processing.
	341	# returns an array containing "language" and "encoding"
	342	sub get_language_encoding {
	343	my $self = shift (@_);
	344	my ($filename) = @_;
	345	my $outhandle = $self->{'outhandle'};
	346	my $unicode_format = "";
	347	my $best_language = "";
	348	my $best_encoding = "";
	349
[17213]	350
[15868]	351	# read in file
	352	if (!open (FILE, $filename)) {
	353	gsprintf(STDERR, "ReadTextFile::get_language_encoding {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
	354	# this is a pretty bad error, but try to continue anyway
	355	return ($self->{'default_language'}, $self->{'input_encoding'});
	356	}
	357	undef $/;
	358	my $text = <FILE>;
	359	$/ = "\n";
	360	close FILE;
	361
	362	# check if first few bytes have a Byte Order Marker
	363	my $bom=substr($text,0,2); # check 16bit unicode
	364	if ($bom eq "\xff\xfe") { # little endian 16bit unicode
	365	$unicode_format="unicode";
	366	} elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
	367	$unicode_format="unicode";
	368	} else {
	369	$bom=substr($text,0,3); # check utf-8
	370	if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
	371	$unicode_format="utf8";
	372	# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
	373	# $unicode_format="utf8";
	374	}
	375	}
	376
[16667]	377	my $found_html_encoding = 0;
[15868]	378	# handle html files specially
	379	# XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
[15970]	380	if (ref($self) eq 'HTMLPlugin' \|\|
[15868]	381	(exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
	382
[16765]	383	# remove comments in head, including multiline ones, so that we don't match on
[16753]	384	# inactive tags (those that are nested inside comments)
[16765]	385	my ($head) = ($text =~ m/<head>(.*)<\/head>/si);
[17099]	386	$head = "" unless defined $head; # some files are not proper HTML eg php files
[16765]	387	$head =~ s/<!--.*?-->//sg;
[16753]	388
[15868]	389	# remove <title>stuff</title> -- as titles tend often to be in English
	390	# for foreign language documents
	391	$text =~ s!<title>.*?</title>!!si;
	392
	393	# see if this html file specifies its encoding
	394	if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
	395	$best_encoding = $1;
[16724]	396	}
[16765]	397	# check the meta http-equiv charset tag
	398	elsif ($head =~ m/<meta http-equiv.content-type.charset=(.+?)\"/si) {
[15868]	399	$best_encoding = $1;
	400	}
	401	if ($best_encoding) { # we extracted an encoding
	402	$best_encoding =~ s/-+/_/g;
	403	$best_encoding = lc($best_encoding); # lowercase
	404	if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
[16667]	405	$found_html_encoding = 1;
	406	# We shouldn't be modifying this here!!
	407	#$self->{'input_encoding'} = $best_encoding;
[15868]	408	}
	409
	410	# remove all HTML tags
	411	$text =~ s/<[^>]*>//sg;
	412	}
	413
[17213]	414	# don't need to do textcat if we know the encoding now AND don't need to extract language
	415	if($found_html_encoding && !$self->{'extract_language'}) { # encoding specified in html file
	416	$best_language = $self->{'default_language'};
	417	}
[15868]	418
[17213]	419	else { # need to use textcat to get either the language, or get both language and encoding
	420	$self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
	421
	422	if($found_html_encoding) { # know encoding, find language by limiting search to known encoding
	423	my $results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
	424
	425	my $language;
	426	($language) = $results->[0] =~ m/^([^-])(?:-(?:.))?$/ if (scalar @$results > 0);
[15868]	427
[17213]	428	if (!defined $language \|\| scalar @$results > 3) {
	429	# if there were too many results even when restricting results by encoding,
	430	# or if there were no results, use default language with the known encoding
	431	$best_language = $self->use_default_language($filename);
	432	}
	433	else { # fewer than 3 results means textcat is more certain, use the first result
	434	$best_language = $language;
	435	}
[15868]	436	}
[17213]	437	else { # don't know encoding or language yet, therefore we use textcat
	438	my $results = $self->{'textcat'}->classify_contents(\$text, $filename);
	439
	440	# if textcat returns 3 or less possibilities we'll use the first one in the list
	441	if (scalar @$results <= 3) { # results will be > 0 when we don't constrain textcat by an encoding
	442	my ($language, $encoding) = $results->[0] =~ m/^([^-])(?:-(.))?$/;
[15868]	443
[17213]	444	$language = $self->use_default_language($filename) unless defined $language;
	445	$encoding = $self->use_default_encoding($filename) unless defined $encoding;
[15868]	446
[17213]	447	$best_language = $language;
	448	$best_encoding = $encoding;
[15868]	449	}
[17213]	450	else { # if (scalar @$results > 3) {
	451	if ($unicode_format) { # in case the first had a BOM
	452	$best_encoding=$unicode_format;
	453	}
	454	else {
	455	# Find the most frequent encoding in the textcat results returned
	456	# Returns "" if there's no encoding more frequent than another
	457	$best_encoding = $self->{'textcat'}->most_frequent_encoding($results);
	458	}
	459
	460	if ($best_encoding eq "") { # encoding still not set, use defaults
	461	$best_language = $self->use_default_language($filename);
	462	$best_encoding = $self->use_default_encoding($filename);
	463	}
	464	elsif (!$self->{'extract_language'}) { # know encoding but don't need to discover language
	465	$best_language = $self->use_default_language($filename);
	466	}
	467	else { # textcat again using the most frequent encoding or the $unicode_format set above
	468	$results = $self->{'textcat'}->classify_contents_for_encoding(\$text, $filename, $best_encoding);
	469	my $language;
	470	($language) = $results->[0] =~ m/^([^-])(?:-(.))?$/ if (scalar @$results > 0);
	471	if (!defined $language \|\| scalar @$results > 3) {
	472	# if no result or too many results, use default language for the encoding previously found
	473	$best_language = $self->use_default_language($filename);
	474	}
	475	else { # fewer than 3 results, use the language of the first result
	476	$best_language = $language;
	477	}
	478	}
[15868]	479	}
	480	}
	481	}
	482
[17213]	483	if($best_encoding eq "" \|\| $best_language eq "") {
	484	print STDERR "****Shouldn't happen: encoding and/or language still not set. Using defaults.\n";
	485	$best_encoding = $self->use_default_encoding($filename) if $best_encoding eq "";
	486	$best_language = $self->use_default_language($filename) if $best_language eq "";
	487	}
	488	# print STDERR "****Content language: $best_language; Encoding: $best_encoding.\n";
	489
	490
[16555]	491	if ($best_encoding =~ /^iso_8859/ && &unicode::check_is_utf8($text)) {
[15868]	492	# the text is valid utf8, so assume that's the real encoding
	493	# (since textcat is based on probabilities)
	494	$best_encoding = 'utf8';
	495	}
	496
	497	# check for equivalents where textcat doesn't have some encodings...
	498	# eg MS versions of standard encodings
	499	if ($best_encoding =~ /^iso_8859_(\d+)/) {
	500	my $iso = $1; # which variant of the iso standard?
	501	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
	502	if ($text =~ /[\x80-\x9f]/) {
	503	# Western Europe
	504	if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
	505	elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
	506	elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
	507	elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
	508	elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
	509	elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
	510	elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
	511	}
	512	}
	513
	514	if ($best_encoding !~ /^(ascii\|utf8\|unicode)$/ &&
	515	!defined $encodings::encodings->{$best_encoding}) {
	516	if ($self->{'verbosity'}) {
	517	gsprintf($outhandle, "ReadTextFile: {ReadTextFile.unsupported_encoding}\n",
	518	$filename, $best_encoding, $self->{'default_encoding'});
	519	}
	520	$best_encoding = $self->{'default_encoding'};
	521	}
	522
	523	return ($best_language, $best_encoding);
	524	}
	525
	526
[17213]	527	sub use_default_language {
	528	my $self = shift (@_);
	529	my ($filename) = @_;
[15868]	530
[17213]	531	if ($self->{'verbosity'}>2) {
	532	gsprintf($self->{'outhandle'},
	533	"ReadTextFile: {ReadTextFile.could_not_extract_language}\n",
	534	$filename, $self->{'default_language'});
	535	}
	536	return $self->{'default_language'};
	537	}
	538
	539	sub use_default_encoding {
	540	my $self = shift (@_);
	541	my ($filename) = @_;
	542
	543	if ($self->{'verbosity'}>2) {
	544	gsprintf($self->{'outhandle'},
	545	"ReadTextFile: {ReadTextFile.could_not_extract_encoding}\n",
	546	$filename, $self->{'default_encoding'});
	547	}
	548	return $self->{'default_encoding'};
	549	}
	550
[15868]	551	# Overridden by exploding plugins (eg. ISISPlug)
	552	sub clean_up_after_exploding
	553	{
	554	my $self = shift(@_);
	555	}
	556
	557
	558	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: