Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: trunk/gsdl/perllib/plugins/SplitPlug.pm@ 1676

Last change on this file since 1676 was 1676, checked in by paynter, 23 years ago
Plugins for processing files of bibliography records in BibTeX and Refer format. SplitPlug is a plugin for splitting one text file into many Greenstone documents. ReferPlug and BibTexPlug (which both inherit from SplitPlug) are for processing individual Refer and BibTeX records respectively.
Property svn:keywords set to `Author Date Id Revision`
File size: 5.0 KB

Rev	Line
[1676]	1	###########################################################################
	2	#
	3	# SplitPlug.pm - a plugin for splitting input files into segments that
	4	# will then be individually processed.
	5	#
	6	#
	7	# Copyright 2000 Gordon W. Paynter ([email protected])
	8	# Copyright 2000 The New Zealand Digital Library Project
	9	#
	10	# A component of the Greenstone digital library software
	11	# from the New Zealand Digital Library Project at the
	12	# University of Waikato, New Zealand.
	13	#
	14	# This program is free software; you can redistribute it and/or modify
	15	# it under the terms of the GNU General Public License as published by
	16	# the Free Software Foundation; either version 2 of the License, or
	17	# (at your option) any later version.
	18	#
	19	# This program is distributed in the hope that it will be useful,
	20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	22	# GNU General Public License for more details.
	23	#
	24	# You should have received a copy of the GNU General Public License
	25	# along with this program; if not, write to the Free Software
	26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	27	#
	28	###########################################################################
	29
	30
	31	# SplitPlug is a plugin for splitting input files into segments that will
	32	# then be individually processed.
	33
	34	# This plugin should not be called directly. Instead, if you need to
	35	# process input files that contain several documents, you should write a
	36	# plugin with a process function that will handle one of those documents
	37	# and have it inherit from SplitPlug. See ReferPlug for an example.
	38
	39
	40	package SplitPlug;
	41
	42	use BasPlug;
	43	use util;
	44
	45
	46	# SplitPlug is a sub-class of BasPlug.
	47	sub BEGIN {
	48	@ISA = ('BasPlug');
	49	}
	50
	51	sub new {
	52	my ($class) = @_;
	53	$self = new BasPlug($class, @_);
	54
	55	if (!parsargv::parse(\@_,
	56	q^split_exp/.*/^, \$self->{'split_exp'},
	57	"allow_extra_options")) {
	58	print STDERR "\nIncorrect options passed to $class.";
	59	print STDERR "\nCheck your collect.cfg configuration file\n";
	60	die "\n";
	61	}
	62
	63	return bless $self, $class;
	64	}
	65
	66	sub init {
	67	my $self = shift (@_);
	68	my ($verbosity, $outhandle) = @_;
	69
	70	$self->BasPlug::init($verbosity, $outhandle);
	71
	72	# set split_exp to default unless explicitly set
	73	if (!$self->{'split_exp'}) {
	74	$self->{'split_exp'} = $self->get_default_split_exp ();
	75	}
	76
	77	}
	78
	79	# This plugin recurs over the segments it finds
	80	sub is_recursive {
	81	return 1;
	82	}
	83
	84	# By default, we split the input text at blank lines
	85	sub get_default_split_exp {
	86	return q^\n\s*\n^;
	87	}
	88
	89
	90	# The read function opens a file and splits it into parts.
	91	# Each part is sent to the process function
	92	#
	93	# Returns: Number of document objects created (or undef if it fails)
	94
	95	sub read {
	96	my $self = shift (@_);
	97	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
	98	my $outhandle = $self->{'outhandle'};
	99	my $verbosity = $self->{'verbosity'};
	100
	101	# Figure out the exact filename of this file (and maybe block it)
	102	my $filename = &util::filename_cat($base_dir, $file);
	103	my $block_exp = $self->{'block_exp'};
	104	return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
	105	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
	106	return undef;
	107	}
	108	my $plugin_name = ref ($self);
	109	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
	110
	111	# Read in file ($text will be in utf8)
	112	my $text = "";
	113	$self->read_file ($filename, \$text);
	114
	115	if ($text !~ /\w/) {
	116	my $outhandle = $self->{'outhandle'};
	117	print $outhandle "$plugin_name: ERROR: $file contains no text\n"
	118	if $self->{'verbosity'};
	119	return 0;
	120	}
	121
	122	# Split the text into several smaller segments
	123	my $split_exp = $self->{'split_exp'};
	124	my @segments = split(/$split_exp/, $text);
	125	print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
	126	if $self->{'verbosity'};
	127
	128	# Process each segment in turn
	129	my ($count, $segment, $segtext, $status, $id);
	130	$segment = 0;
	131	$count = 0;
	132	foreach $segtext (@segments) {
	133	$segment++;
	134
	135	# create a new document
	136	my $doc_obj = new doc ($filename, "indexed_doc");
	137
	138	# Calculate a "base" document ID.
	139	if (!defined $id) {
	140	$doc_obj->set_OID();
	141	$id = $doc_obj->get_OID();
	142	}
	143
	144	# include any metadata passed in from previous plugins
	145	# note that this metadata is associated with the top level section
	146	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
	147
	148	# do plugin specific processing of doc_obj
	149	print $outhandle "segment $segment - ";
	150	$status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
	151	if (!defined $status) {
	152	print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
	153	if ($verbosity >= 2);
	154	next;
	155	}
	156	$count += $status;
	157
	158	# do any automatic metadata extraction
	159	$self->auto_extract_metadata ($doc_obj);
	160
	161	# add an OID
	162	$doc_obj->set_OID($id . "s" . $segment);
	163
	164	# process the document
	165	$processor->process($doc_obj);
	166	}
	167
	168	# Return number of document objects produced
	169	return $count;
	170	}
	171
	172
	173	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: