Context Navigation

source: trunk/gsdl/perllib/plugins/SplitPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago
added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile
Property svn:keywords set to `Author Date Id Revision`
File size: 8.7 KB

Line
1	###########################################################################
2	#
3	# SplitPlug.pm - a plugin for splitting input files into segments that
4	# will then be individually processed.
5	#
6	#
7	# Copyright 2000 Gordon W. Paynter ([email protected])
8	# Copyright 2000 The New Zealand Digital Library Project
9	#
10	# A component of the Greenstone digital library software
11	# from the New Zealand Digital Library Project at the
12	# University of Waikato, New Zealand.
13	#
14	# This program is free software; you can redistribute it and/or modify
15	# it under the terms of the GNU General Public License as published by
16	# the Free Software Foundation; either version 2 of the License, or
17	# (at your option) any later version.
18	#
19	# This program is distributed in the hope that it will be useful,
20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	# GNU General Public License for more details.
23	#
24	# You should have received a copy of the GNU General Public License
25	# along with this program; if not, write to the Free Software
26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	#
28	###########################################################################
29
30
31	# SplitPlug is a plugin for splitting input files into segments that will
32	# then be individually processed.
33
34	# This plugin should not be called directly. Instead, if you need to
35	# process input files that contain several documents, you should write a
36	# plugin with a process function that will handle one of those documents
37	# and have it inherit from SplitPlug. See ReferPlug for an example.
38
39
40	package SplitPlug;
41
42	use BasPlug;
43	use gsprintf 'gsprintf';
44	use util;
45
46	use strict;
47	no strict 'refs'; # allow filehandles to be variables and viceversa
48
49	# SplitPlug is a sub-class of BasPlug.
50	sub BEGIN {
51	@SplitPlug::ISA = ('BasPlug');
52	}
53
54
55	my $arguments =
56	[ { 'name' => "split_exp",
57	'desc' => "{SplitPlug.split_exp}",
58	'type' => "regexp",
59	#'deft' => &get_default_split_exp(),
60	'deft' => "",
61	'reqd' => "no" } ];
62
63	my $options = { 'name' => "SplitPlug",
64	'desc' => "{SplitPlug.desc}",
65	'abstract' => "yes",
66	'inherits' => "yes",
67	'args' => $arguments };
68
69
70	sub new {
71	my ($class) = shift (@_);
72	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
73	push(@$pluginlist, $class);
74
75	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
76	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
77
78	my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
79
80	$self->{'textcat_store'} = {};
81	$self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
82	return bless $self, $class;
83	}
84
85	sub init {
86	my $self = shift (@_);
87	my ($verbosity, $outhandle, $failhandle) = @_;
88
89	$self->BasPlug::init($verbosity, $outhandle, $failhandle);
90
91	if ((!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
92
93	$self->{'process_exp'} = $self->get_default_process_exp ();
94	if ($self->{'process_exp'} eq "") {
95	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
96	}
97	}
98
99
100	# set split_exp to default unless explicitly set
101	if (!$self->{'split_exp'}) {
102	$self->{'split_exp'} = $self->get_default_split_exp ();
103	}
104
105	}
106
107	# This plugin recurs over the segments it finds
108	sub is_recursive {
109	return 1;
110	}
111
112	# By default, we split the input text at blank lines
113	sub get_default_split_exp {
114	return q^\n\s*\n^;
115	}
116
117	sub metadata_read {
118	my $self = shift (@_);
119	my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
120
121	my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,
122	$metadata, $extrametakeys,
123	$extrametadata, $processor,
124	$maxdocs, $gli);
125	my $split_matched = undef;
126
127	if ($matched) {
128
129	my $outhandle = $self->{'outhandle'};
130	my $filename = &util::filename_cat($base_dir, $file);
131
132	my $plugin_name = ref ($self);
133	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
134
135	$self->{'metapass_srcdoc'}->{$file} = {};
136
137	# Do encoding stuff
138	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
139	my $le_rec = { 'language' => $language, 'encoding' => $encoding };
140	$self->{'textcat_store'}->{$file} = $le_rec;
141
142	# Read in file ($text will be in utf8)
143	my $text = "";
144	$self->read_file ($filename, $encoding, $language, \$text);
145
146	if ($text !~ /\w/) {
147	gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n",
148	$file)
149	if $self->{'verbosity'};
150
151	my $failhandle = $self->{'failhandle'};
152	print $failhandle "$file: " . ref($self) . ": file contains no text\n";
153	$self->{'num_not_processed'} ++;
154
155	$self->{'textcat_store'}->{$file} = undef;
156
157	return 0;
158	}
159
160
161	# Split the text into several smaller segments
162	my $split_exp = $self->{'split_exp'};
163	my @segments = split(/$split_exp/, $text);
164	print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
165	if $self->{'verbosity'};
166
167	$self->{'split_segments'}->{$file} = \@segments;
168	$split_matched = scalar(@segments);
169	}
170
171	return $split_matched;
172	}
173
174
175
176	# The read function opens a file and splits it into parts.
177	# Each part is sent to the process function
178	#
179	# Returns: Number of document objects created (or undef if it fails)
180
181	sub read {
182	my $self = shift (@_);
183	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
184	my $outhandle = $self->{'outhandle'};
185	my $verbosity = $self->{'verbosity'};
186
187	# Figure out the exact filename of this file (and maybe block it)
188	my $filename = &util::filename_cat($base_dir, $file);
189	my $block_exp = $self->{'block_exp'};
190	return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
191	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
192	return undef;
193	}
194	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
195
196	my $le_rec = $self->{'textcat_store'}->{$file};
197	if (!defined $le_rec) {
198	# means no text was found;
199	return 0; # not processed but no point in passing it on
200	}
201
202	my $language = $le_rec->{'language'};
203	my $encoding = $le_rec->{'encoding'};
204	$self->{'textcat_store'}->{$file} = undef;
205
206	my $segments = $self->{'split_segments'}->{$file};
207	$self->{'split_segments'}->{$file} = undef;
208
209	# Process each segment in turn
210	my ($count, $segment, $segtext, $status, $id);
211	$segment = 0;
212	$count = 0;
213	foreach $segtext (@$segments) {
214	$segment++;
215
216	if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
217	# metadata is attached to a srcdoc
218	next;
219	}
220
221	# create a new document
222	my $doc_obj = new doc ($filename, "indexed_doc");
223	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
224	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
225	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
226	my ($filemeta) = $file =~ /([^\\\/]+)$/;
227	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
228	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
229	if ($self->{'cover_image'}) {
230	$self->associate_cover_image($doc_obj, $filename);
231	}
232	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
233	#$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");
234
235	# Calculate a "base" document ID.
236	if (!defined $id) {
237	$doc_obj->set_OID();
238	$id = $doc_obj->get_OID();
239	}
240
241	# include any metadata passed in from previous plugins
242	# note that this metadata is associated with the top level section
243	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
244
245	# do plugin specific processing of doc_obj
246	print $outhandle "segment $segment - " if ($self->{'verbosity'});
247	$status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
248	if (!defined $status) {
249	print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
250	if ($verbosity >= 2);
251	next;
252	}
253	# If the plugin returned 0, it threw away this part
254	if ($status == 0) {
255	next;
256	}
257	$count += $status;
258
259	# do any automatic metadata extraction
260	$self->auto_extract_metadata ($doc_obj);
261
262	# add an OID
263	$self->set_OID($doc_obj, $id, $segment);
264
265	# process the document
266	$processor->process($doc_obj);
267
268	$self->{'num_processed'} ++;
269	}
270
271	delete $self->{'metapass_srcdoc'}->{$file};
272
273	# Return number of document objects produced
274	return $count;
275	}
276
277	sub set_OID {
278	my $self = shift (@_);
279	my ($doc_obj, $id, $segment_number) = @_;
280
281	$doc_obj->set_OID($id . "s" . $segment_number);
282	}
283
284	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: