Context Navigation

source: trunk/gsdl/perllib/plugins/SplitPlug.pm@ 7644

Last change on this file since 7644 was 7508, checked in by kjdon, 20 years ago
changed the plugin metadata - instead of having eg HTMLPlug metadata set to 1, now we have Plugin metadata set to HTMLPlug
Property svn:keywords set to `Author Date Id Revision`
File size: 7.1 KB

Line
1	###########################################################################
2	#
3	# SplitPlug.pm - a plugin for splitting input files into segments that
4	# will then be individually processed.
5	#
6	#
7	# Copyright 2000 Gordon W. Paynter ([email protected])
8	# Copyright 2000 The New Zealand Digital Library Project
9	#
10	# A component of the Greenstone digital library software
11	# from the New Zealand Digital Library Project at the
12	# University of Waikato, New Zealand.
13	#
14	# This program is free software; you can redistribute it and/or modify
15	# it under the terms of the GNU General Public License as published by
16	# the Free Software Foundation; either version 2 of the License, or
17	# (at your option) any later version.
18	#
19	# This program is distributed in the hope that it will be useful,
20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	# GNU General Public License for more details.
23	#
24	# You should have received a copy of the GNU General Public License
25	# along with this program; if not, write to the Free Software
26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	#
28	###########################################################################
29
30
31	# SplitPlug is a plugin for splitting input files into segments that will
32	# then be individually processed.
33
34	# This plugin should not be called directly. Instead, if you need to
35	# process input files that contain several documents, you should write a
36	# plugin with a process function that will handle one of those documents
37	# and have it inherit from SplitPlug. See ReferPlug for an example.
38
39
40	package SplitPlug;
41
42	use BasPlug;
43	use util;
44
45
46	# SplitPlug is a sub-class of BasPlug.
47	sub BEGIN {
48	@ISA = ('BasPlug');
49	}
50
51	my $arguments =
52	[ { 'name' => "split_exp",
53	'desc' => "{SplitPlug.split_exp}",
54	'type' => "regexp",
55	'deft' => &get_default_split_exp(),
56	'reqd' => "no" } ];
57
58	my $options = { 'name' => "SplitPlug",
59	'desc' => "{SplitPlug.desc}",
60	'abstract' => "yes",
61	'inherits' => "yes",
62	'args' => $arguments };
63
64
65	sub new {
66	my ($class) = @_;
67	$self = new BasPlug($class, @_);
68
69	$self->{'plugin_type'} = "SplitPlug";
70
71	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
72	my $option_list = $self->{'option_list'};
73	push( @{$option_list}, $options );
74
75	if (!parsargv::parse(\@_,
76	q^split_exp/.*/^, \$self->{'split_exp'},
77	"allow_extra_options")) {
78	print STDERR "\nIncorrect options passed to $class.";
79	print STDERR "\nCheck your collect.cfg configuration file\n";
80	die "\n";
81	}
82
83	return bless $self, $class;
84	}
85
86	sub init {
87	my $self = shift (@_);
88	my ($verbosity, $outhandle, $failhandle) = @_;
89
90	$self->BasPlug::init($verbosity, $outhandle, $failhandle);
91
92	if ((!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
93
94	$self->{'process_exp'} = $self->get_default_process_exp ();
95	if ($self->{'process_exp'} eq "") {
96	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
97	}
98	}
99
100
101	# set split_exp to default unless explicitly set
102	if (!$self->{'split_exp'}) {
103	$self->{'split_exp'} = $self->get_default_split_exp ();
104	}
105
106	}
107
108	# This plugin recurs over the segments it finds
109	sub is_recursive {
110	return 1;
111	}
112
113	# By default, we split the input text at blank lines
114	sub get_default_split_exp {
115	return q^\n\s*\n^;
116	}
117
118
119	# The read function opens a file and splits it into parts.
120	# Each part is sent to the process function
121	#
122	# Returns: Number of document objects created (or undef if it fails)
123
124	sub read {
125	my $self = shift (@_);
126	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
127	my $outhandle = $self->{'outhandle'};
128	my $verbosity = $self->{'verbosity'};
129
130	# Figure out the exact filename of this file (and maybe block it)
131	my $filename = &util::filename_cat($base_dir, $file);
132	my $block_exp = $self->{'block_exp'};
133	return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
134	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
135	return undef;
136	}
137	my $plugin_name = ref ($self);
138	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
139
140	# Do encoding stuff
141	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
142
143	# Read in file ($text will be in utf8)
144	my $text = "";
145	$self->read_file ($filename, $encoding, $language, \$text);
146
147	if ($text !~ /\w/) {
148	my $outhandle = $self->{'outhandle'};
149	print $outhandle "$plugin_name: ERROR: $file contains no text\n"
150	if $self->{'verbosity'};
151
152	my $failhandle = $self->{'failhandle'};
153	print $failhandle "$file: " . ref($self) . ": file contains no text\n";
154	$self->{'num_not_processed'} ++;
155
156	return 0; # not processed but no point in passing it on
157	}
158
159
160	# Split the text into several smaller segments
161	my $split_exp = $self->{'split_exp'};
162	my @segments = split(/$split_exp/, $text);
163	print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
164	if $self->{'verbosity'};
165
166	# Process each segment in turn
167	my ($count, $segment, $segtext, $status, $id);
168	$segment = 0;
169	$count = 0;
170	foreach $segtext (@segments) {
171	$segment++;
172
173	# create a new document
174	my $doc_obj = new doc ($filename, "indexed_doc");
175	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
176	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
177	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
178	my ($filemeta) = $file =~ /([^\\\/]+)$/;
179	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
180	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
181	if ($self->{'cover_image'}) {
182	$self->associate_cover_image($doc_obj, $filename);
183	}
184	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
185	# Calculate a "base" document ID.
186	if (!defined $id) {
187	$doc_obj->set_OID();
188	$id = $doc_obj->get_OID();
189	}
190
191	# include any metadata passed in from previous plugins
192	# note that this metadata is associated with the top level section
193	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
194
195	# do plugin specific processing of doc_obj
196	print $outhandle "segment $segment - " if ($self->{'verbosity'});
197	$status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
198	if (!defined $status) {
199	print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
200	if ($verbosity >= 2);
201	next;
202	}
203	# If the plugin returned 0, it threw away this part
204	if ($status == 0) {
205	next;
206	}
207	$count += $status;
208
209	# do any automatic metadata extraction
210	$self->auto_extract_metadata ($doc_obj);
211
212	# add an OID
213	$self->set_OID($doc_obj, $id, $segment);
214
215	# process the document
216	$processor->process($doc_obj);
217
218	$self->{'num_processed'} ++;
219	}
220
221	# Return number of document objects produced
222	return $count;
223	}
224
225	sub set_OID {
226	my $self = shift (@_);
227	my ($doc_obj, $id, $segment_number) = @_;
228
229	$doc_obj->set_OID($id . "s" . $segment_number);
230	}
231
232	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: