Context Navigation

source: gsdl/trunk/perllib/plugins/SplitTextFile.pm@ 18327

Last change on this file since 18327 was 18327, checked in by ak19, 15 years ago
Extra parameter to new doc(): the renaming method to be used on the file (base64 or URL encoding).
Property svn:keywords set to `Author Date Id Revision`
File size: 9.4 KB

Rev	Line
[1676]	1	###########################################################################
	2	#
[15871]	3	# SplitTextFile.pm - a plugin for splitting input files into segments that
[1676]	4	# will then be individually processed.
	5	#
	6	#
	7	# Copyright 2000 Gordon W. Paynter ([email protected])
	8	# Copyright 2000 The New Zealand Digital Library Project
	9	#
	10	# A component of the Greenstone digital library software
	11	# from the New Zealand Digital Library Project at the
	12	# University of Waikato, New Zealand.
	13	#
	14	# This program is free software; you can redistribute it and/or modify
	15	# it under the terms of the GNU General Public License as published by
	16	# the Free Software Foundation; either version 2 of the License, or
	17	# (at your option) any later version.
	18	#
	19	# This program is distributed in the hope that it will be useful,
	20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	22	# GNU General Public License for more details.
	23	#
	24	# You should have received a copy of the GNU General Public License
	25	# along with this program; if not, write to the Free Software
	26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	27	#
	28	###########################################################################
	29
	30
[15871]	31	# SplitTextFile is a plugin for splitting input files into segments that will
[1676]	32	# then be individually processed.
	33
	34	# This plugin should not be called directly. Instead, if you need to
	35	# process input files that contain several documents, you should write a
	36	# plugin with a process function that will handle one of those documents
[15871]	37	# and have it inherit from SplitTextFile. See ReferPlug for an example.
[1676]	38
	39
[15871]	40	package SplitTextFile;
[1676]	41
[15871]	42	use ReadTextFile;
[7830]	43	use gsprintf 'gsprintf';
[1676]	44	use util;
	45
[10254]	46	use strict;
	47	no strict 'refs'; # allow filehandles to be variables and viceversa
	48
# Inheritance: SplitTextFile derives from ReadTextFile.  @ISA is filled in
# inside a BEGIN block so that it is already set while the rest of this
# file is being compiled.
BEGIN {
    @SplitTextFile::ISA = qw(ReadTextFile);
}
[1676]	53
	54
# Arguments this plugin adds on top of those of its parent class.  The only
# one is split_exp, the regular expression at which input text is split.
my $arguments = [
    {
        name => 'split_exp',
        desc => '{SplitTextFile.split_exp}',
        type => 'regexp',
        # The default is deliberately empty: init() falls back to
        # get_default_split_exp() whenever no value has been supplied.
        deft => '',
        reqd => 'no',
    },
];

# Plugin description record; new() appends it to the shared "OptList".
my $options = {
    name     => 'SplitTextFile',
    desc     => '{SplitTextFile.desc}',
    abstract => 'yes',
    inherits => 'yes',
    args     => $arguments,
};
[3540]	68
	69
# Constructor.
#
# Arguments:
#   $class           - name of the class being instantiated
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - handed on unchanged to the ReadTextFile constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" array refs
#                      receive this plugin's $arguments and $options
#
# Returns the object built by ReadTextFile, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use an explicit Class->method call.  The indirect-object form
    # ("new ReadTextFile(...)") relies on parser heuristics, which is
    # fragile in a package that defines its own sub named "new".
    my $self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Per-file state: filled in by metadata_read(), consumed by read().
    $self->{'textcat_store'}   = {};  # language/encoding record per file
    $self->{'split_segments'}  = {};  # list of text segments per file
    $self->{'metapass_srcdoc'} = {};  # which segments have valid metadata_srcdoc

    return bless $self, $class;
}
	84
	85	sub init {
	86	my $self = shift (@_);
[3094]	87	my ($verbosity, $outhandle, $failhandle) = @_;
[1676]	88
[15871]	89	$self->ReadTextFile::init($verbosity, $outhandle, $failhandle);
[1676]	90
[15871]	91	# why is this is init and not in new??
[2007]	92	if ((!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
	93
	94	$self->{'process_exp'} = $self->get_default_process_exp ();
	95	if ($self->{'process_exp'} eq "") {
[11090]	96	warn ref($self) . " Warning: plugin has no process_exp\n";
[2007]	97	}
	98	}
	99
	100
[1676]	101	# set split_exp to default unless explicitly set
	102	if (!$self->{'split_exp'}) {
	103	$self->{'split_exp'} = $self->get_default_split_exp ();
	104	}
	105
	106	}
	107
	108	# This plugin recurs over the segments it finds
	109	sub is_recursive {
	110	return 1;
	111	}
	112
	113	# By default, we split the input text at blank lines
	114	sub get_default_split_exp {
	115	return q^\n\s*\n^;
	116	}
	117
# Metadata pass over a file: decide whether this plugin handles it and, if
# so, read it and split it into segments ready for read().
#
# For a matching file this records, keyed by the tidied $file name:
#   - an empty hash in $self->{'metapass_srcdoc'}
#   - the detected language/encoding in $self->{'textcat_store'}
#   - the list of non-empty segments in $self->{'split_segments'}
#
# Returns:
#   undef  if the parent class's metadata_read did not match the file
#   0      if the file contains no word characters (the textcat_store
#          entry is reset to undef and num_not_processed is incremented)
#   N      the number of non-empty segments found otherwise
sub metadata_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    # returns 1 if matches process_exp, and has done blocking in the meantime
    my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,
                                              $block_hash,
                                              $extrametakeys,
                                              $extrametadata, $processor,
                                              $maxdocs, $gli);

    # Not one of ours: nothing was split.
    return undef unless $matched;

    my $outhandle   = $self->{'outhandle'};
    my $filename    = &util::filename_cat($base_dir, $file);
    my $plugin_name = ref ($self);

    # $file often begins with / so we'll tidy it up
    $file =~ s/^[\/\\]+//;

    $self->{'metapass_srcdoc'}->{$file} = {};

    # Work out language and encoding, and remember them for read().
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
    $self->{'textcat_store'}->{$file} = { 'language' => $language, 'encoding' => $encoding };

    # Read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, $encoding, $language, \$text);

    unless ($text =~ /\w/) {
        # No word characters at all: report the file and give up on it.
        if ($self->{'verbosity'}) {
            gsprintf($outhandle, "$plugin_name: {ReadTextFile.file_has_no_text}\n",
                     $file);
        }

        my $failhandle = $self->{'failhandle'};
        print $failhandle "$file: " . ref($self) . ": file contains no text\n";
        $self->{'num_not_processed'} ++;

        # An undefined record tells read() that no text was found.
        $self->{'textcat_store'}->{$file} = undef;

        return 0;
    }

    # Split the text into several smaller segments, dropping empty ones.
    my $split_exp = $self->{'split_exp'};
    my @segments  = grep { $_ ne "" } split(/$split_exp/i, $text);

    if ($self->{'verbosity'}) {
        print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n";
    }

    $self->{'split_segments'}->{$file} = \@segments;

    return scalar(@segments);
}
	185
	186
	187
# The read function takes the segments that metadata_read() split a file
# into and sends each one to the process function as its own document.
#
# Arguments:
#   $pluginfo, $base_dir, $file, $metadata, $gli - passed on to process()
#   $processor   - object whose process() method receives each document
#   $block_hash, $maxdocs, $total_count - accepted but not used here
#
# Returns:
#   undef  if can_process_this_file() rejects the file
#   0      if metadata_read() recorded no text for the file
#   N      the sum of the statuses returned by process() for the segments
#          that were turned into documents
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # Take the cached per-file state out of the stores.  Deleting the keys
    # (instead of assigning undef) stops the hashes from growing by one
    # dead entry for every file that passes through the plugin.
    my $le_rec   = delete $self->{'textcat_store'}->{$file};
    my $segments = delete $self->{'split_segments'}->{$file};

    if (!defined $le_rec) {
        # means no text was found; drop the bookkeeping entry that
        # metadata_read() created so that it does not linger either
        delete $self->{'metapass_srcdoc'}->{$file};
        return 0; # not processed but no point in passing it on
    }

    # A language/encoding record without a segment list means there is
    # nothing to iterate over.
    $segments = [] unless defined $segments;

    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $verbosity > 1;

    my $language = $le_rec->{'language'};
    my $encoding = $le_rec->{'encoding'};

    # The file name without its directory part is the same for every
    # segment, so work it out once.
    my ($filemeta) = $file =~ /([^\\\/]+)$/;

    # Process each segment in turn
    my $count   = 0;  # total of the statuses returned by process()
    my $segment = 0;  # 1-based number of the current segment
    my $id;           # base OID, taken from the first document created
    foreach my $segtext (@$segments) {
        $segment++;

        if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
            # metadata is attached to a srcdoc
            next;
        }

        # create a new document
        my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
        $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename_full_path);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");

        # Calculate a "base" document ID.
        if (!defined $id) {
            $id = $self->get_base_OID($doc_obj);
        }

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

        # do plugin specific processing of doc_obj
        print $outhandle "segment $segment\n" if ($verbosity);
        my $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
        if (!defined $status) {
            print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
                if ($verbosity >= 2);
            next;
        }
        # If the plugin returned 0, it threw away this part
        if ($status == 0) {
            next;
        }
        $count += $status;

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # add an OID
        $self->add_OID($doc_obj, $id, $segment);

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'} ++;
    }

    delete $self->{'metapass_srcdoc'}->{$file};

    # Return number of document objects produced
    return $count;
}
	287
# Work out the base OID for a file.
#
# The parent class's add_OID is called on $doc_obj (this class overrides
# add_OID with a per-segment version, hence the explicit SUPER:: call),
# and the OID it gave the document is then returned.  read() uses that
# value as the common prefix of all segment OIDs.
sub get_base_OID {
    my ($self, $doc_obj) = @_;

    $self->SUPER::add_OID($doc_obj);

    return $doc_obj->get_OID();
}
	295
	296	sub add_OID {
[2484]	297	my $self = shift (@_);
[17026]	298	my ($doc_obj, $id, $segment) = @_;
	299
	300	my $full_id = $id . "s" . $segment;
	301	if ($self->{'OIDtype'} eq "assigned") {
	302	my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
	303	if (defined $identifier && $identifier ne "") {
	304	$full_id = $identifier;
[17033]	305	$full_id =~ s/\.//g; #remove any periods
[17026]	306	if ($full_id =~ /^[\d]*$/) {
	307	$full_id = "D" . $full_id;
	308	print STDERR "OID only contains numbers, adding a D\n";
	309	}
	310	}
	311	}
	312	$doc_obj->set_OID($full_id);
[2484]	313	}
	314
[17026]	315
[1676]	316	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: