Context Navigation

source: gsdl/trunk/perllib/plugins/SplitTextFile.pm@ 16104

Last change on this file since 16104 was 16104, checked in by kjdon, 16 years ago
tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process
Property svn:keywords set to `Author Date Id Revision`
File size: 8.9 KB

Rev	Line
[1676]	1	###########################################################################
	2	#
[15871]	3	# SplitTextFile.pm - a plugin for splitting input files into segments that
[1676]	4	# will then be individually processed.
	5	#
	6	#
	7	# Copyright 2000 Gordon W. Paynter ([email protected])
	8	# Copyright 2000 The New Zealand Digital Library Project
	9	#
	10	# A component of the Greenstone digital library software
	11	# from the New Zealand Digital Library Project at the
	12	# University of Waikato, New Zealand.
	13	#
	14	# This program is free software; you can redistribute it and/or modify
	15	# it under the terms of the GNU General Public License as published by
	16	# the Free Software Foundation; either version 2 of the License, or
	17	# (at your option) any later version.
	18	#
	19	# This program is distributed in the hope that it will be useful,
	20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	22	# GNU General Public License for more details.
	23	#
	24	# You should have received a copy of the GNU General Public License
	25	# along with this program; if not, write to the Free Software
	26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	27	#
	28	###########################################################################
	29
	30
[15871]	31	# SplitTextFile is a plugin for splitting input files into segments that will
[1676]	32	# then be individually processed.
	33
	34	# This plugin should not be called directly. Instead, if you need to
	35	# process input files that contain several documents, you should write a
	36	# plugin with a process function that will handle one of those documents
[15871]	37	# and have it inherit from SplitTextFile. See ReferPlug for an example.
[1676]	38
	39
[15871]	40	package SplitTextFile;
[1676]	41
[15871]	42	use ReadTextFile;
[7830]	43	use gsprintf 'gsprintf';
[1676]	44	use util;
	45
[10254]	46	use strict;
	47	no strict 'refs'; # allow filehandles to be variables and viceversa
	48
[15871]	49	# SplitTextFile is a sub-class of BasPlug.
[8716]	50	sub BEGIN {
[15871]	51	@SplitTextFile::ISA = ('ReadTextFile');
[8716]	52	}
[1676]	53
	54
[4744]	55	my $arguments =
	56	[ { 'name' => "split_exp",
[15871]	57	'desc' => "{SplitTextFile.split_exp}",
[6408]	58	'type' => "regexp",
[10218]	59	#'deft' => &get_default_split_exp(),
	60	'deft' => "",
[4873]	61	'reqd' => "no" } ];
[4744]	62
[15871]	63	my $options = { 'name' => "SplitTextFile",
	64	'desc' => "{SplitTextFile.desc}",
[6408]	65	'abstract' => "yes",
[4744]	66	'inherits' => "yes",
	67	'args' => $arguments };
[3540]	68
	69
[1676]	70	sub new {
[10218]	71	my ($class) = shift (@_);
	72	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	73	push(@$pluginlist, $class);
[1676]	74
[15871]	75	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	76	push(@{$hashArgOptLists->{"OptList"}},$options);
[1676]	77
[15871]	78	my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
[10218]	79
[9357]	80	$self->{'textcat_store'} = {};
	81	$self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
[1676]	82	return bless $self, $class;
	83	}
	84
	85	sub init {
	86	my $self = shift (@_);
[3094]	87	my ($verbosity, $outhandle, $failhandle) = @_;
[1676]	88
[15871]	89	$self->ReadTextFile::init($verbosity, $outhandle, $failhandle);
[1676]	90
[15871]	91	# why is this is init and not in new??
[2007]	92	if ((!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
	93
	94	$self->{'process_exp'} = $self->get_default_process_exp ();
	95	if ($self->{'process_exp'} eq "") {
[11090]	96	warn ref($self) . " Warning: plugin has no process_exp\n";
[2007]	97	}
	98	}
	99
	100
[1676]	101	# set split_exp to default unless explicitly set
	102	if (!$self->{'split_exp'}) {
	103	$self->{'split_exp'} = $self->get_default_split_exp ();
	104	}
	105
	106	}
	107
	108	# This plugin recurs over the segments it finds
	109	sub is_recursive {
	110	return 1;
	111	}
	112
	113	# By default, we split the input text at blank lines
	114	sub get_default_split_exp {
	115	return q^\n\s*\n^;
	116	}
	117
[9357]	118	sub metadata_read {
	119	my $self = shift (@_);
	120	my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
[1676]	121
[15871]	122	# returns 1 if matches process_exp, and has done blocking in the meantime
[9357]	123	my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,
	124	$metadata, $extrametakeys,
	125	$extrametadata, $processor,
	126	$maxdocs, $gli);
[10254]	127	my $split_matched = undef;
[9357]	128
	129	if ($matched) {
	130
	131	my $outhandle = $self->{'outhandle'};
	132	my $filename = &util::filename_cat($base_dir, $file);
	133
	134	my $plugin_name = ref ($self);
	135	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
	136
	137	$self->{'metapass_srcdoc'}->{$file} = {};
	138
	139	# Do encoding stuff
	140	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
	141	my $le_rec = { 'language' => $language, 'encoding' => $encoding };
	142	$self->{'textcat_store'}->{$file} = $le_rec;
	143
	144	# Read in file ($text will be in utf8)
	145	my $text = "";
	146	$self->read_file ($filename, $encoding, $language, \$text);
	147
[13197]	148
[9357]	149	if ($text !~ /\w/) {
[15871]	150	gsprintf($outhandle, "$plugin_name: {ReadTextFile.file_has_no_text}\n",
[9357]	151	$file)
	152	if $self->{'verbosity'};
	153
	154	my $failhandle = $self->{'failhandle'};
	155	print $failhandle "$file: " . ref($self) . ": file contains no text\n";
	156	$self->{'num_not_processed'} ++;
	157
	158	$self->{'textcat_store'}->{$file} = undef;
	159
	160	return 0;
	161	}
	162
	163
	164	# Split the text into several smaller segments
	165	my $split_exp = $self->{'split_exp'};
[13197]	166	my @tmp = split(/$split_exp/i, $text);
	167	my @segments =();
	168	## get rid of empty segments
	169	foreach my $seg (@tmp){
	170	if ($seg ne ""){
	171	push @segments, $seg;
	172	}
	173	}
	174
[15871]	175	print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n"
[9357]	176	if $self->{'verbosity'};
	177
[9493]	178	$self->{'split_segments'}->{$file} = \@segments;
[9357]	179	$split_matched = scalar(@segments);
	180	}
	181
	182	return $split_matched;
	183	}
	184
	185
	186
[1676]	187	# The read function opens a file and splits it into parts.
	188	# Each part is sent to the process function
	189	#
	190	# Returns: Number of document objects created (or undef if it fails)
	191
	192	sub read {
	193	my $self = shift (@_);
[9853]	194	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[1676]	195	my $outhandle = $self->{'outhandle'};
	196	my $verbosity = $self->{'verbosity'};
	197
[11090]	198	#check process and block exps, smart block, etc
	199	my ($block_status,$filename) = $self->read_block(@_);
	200	return $block_status if ((!defined $block_status) \|\| ($block_status==0));
	201
[1676]	202	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
[1894]	203
[9357]	204	my $le_rec = $self->{'textcat_store'}->{$file};
	205	if (!defined $le_rec) {
	206	# means no text was found;
	207	return 0; # not processed but no point in passing it on
	208	}
[2845]	209
[16104]	210	print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
	211	print $outhandle "$self->{'plugin_type'} processing $file\n"
	212	if $self->{'verbosity'} > 1;
	213
[9357]	214	my $language = $le_rec->{'language'};
	215	my $encoding = $le_rec->{'encoding'};
	216	$self->{'textcat_store'}->{$file} = undef;
[1676]	217
[9493]	218	my $segments = $self->{'split_segments'}->{$file};
	219	$self->{'split_segments'}->{$file} = undef;
[2845]	220
[1676]	221	# Process each segment in turn
	222	my ($count, $segment, $segtext, $status, $id);
	223	$segment = 0;
	224	$count = 0;
[9357]	225	foreach $segtext (@$segments) {
[13197]	226	$segment++;
[2845]	227
[9357]	228	if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
	229	# metadata is attached to a srcdoc
	230	next;
	231	}
	232
[1676]	233	# create a new document
	234	my $doc_obj = new doc ($filename, "indexed_doc");
[12270]	235	$doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
[1894]	236	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
	237	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
[2845]	238	my ($filemeta) = $file =~ /([^\\\/]+)$/;
[15871]	239	$self->set_Source_metadata($doc_obj, $filemeta, $encoding);
[6137]	240	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
[2845]	241	if ($self->{'cover_image'}) {
[11090]	242	$self->associate_cover_image($doc_obj, $filename);
[2845]	243	}
[7508]	244	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[8121]	245	#$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");
	246
[1676]	247	# Calculate a "base" document ID.
	248	if (!defined $id) {
	249	$doc_obj->set_OID();
	250	$id = $doc_obj->get_OID();
	251	}
	252
	253	# include any metadata passed in from previous plugins
	254	# note that this metadata is associated with the top level section
	255	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
	256
	257	# do plugin specific processing of doc_obj
[16104]	258	print $outhandle "segment $segment\n" if ($self->{'verbosity'});
[11335]	259	$status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
[1676]	260	if (!defined $status) {
	261	print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
	262	if ($verbosity >= 2);
	263	next;
	264	}
[3537]	265	# If the plugin returned 0, it threw away this part
	266	if ($status == 0) {
	267	next;
	268	}
[1676]	269	$count += $status;
	270
	271	# do any automatic metadata extraction
	272	$self->auto_extract_metadata ($doc_obj);
	273
	274	# add an OID
[2484]	275	$self->set_OID($doc_obj, $id, $segment);
	276
[1676]	277	# process the document
	278	$processor->process($doc_obj);
[2845]	279
	280	$self->{'num_processed'} ++;
[1676]	281	}
	282
[9357]	283	delete $self->{'metapass_srcdoc'}->{$file};
	284
[1676]	285	# Return number of document objects produced
	286	return $count;
	287	}
	288
[2484]	289	sub set_OID {
	290	my $self = shift (@_);
	291	my ($doc_obj, $id, $segment_number) = @_;
	292
[2492]	293	$doc_obj->set_OID($id . "s" . $segment_number);
[2484]	294	}
	295
[1676]	296	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: