Context Navigation

source: trunk/gsdl/perllib/plugins/SplitPlug.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago
Jeffrey's new parsing modifications, committed approx 6 July, 15.16
Property svn:keywords set to `Author Date Id Revision`
File size: 8.6 KB

Line
1	###########################################################################
2	#
3	# SplitPlug.pm - a plugin for splitting input files into segments that
4	# will then be individually processed.
5	#
6	#
7	# Copyright 2000 Gordon W. Paynter ([email protected])
8	# Copyright 2000 The New Zealand Digital Library Project
9	#
10	# A component of the Greenstone digital library software
11	# from the New Zealand Digital Library Project at the
12	# University of Waikato, New Zealand.
13	#
14	# This program is free software; you can redistribute it and/or modify
15	# it under the terms of the GNU General Public License as published by
16	# the Free Software Foundation; either version 2 of the License, or
17	# (at your option) any later version.
18	#
19	# This program is distributed in the hope that it will be useful,
20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	# GNU General Public License for more details.
23	#
24	# You should have received a copy of the GNU General Public License
25	# along with this program; if not, write to the Free Software
26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	#
28	###########################################################################
29
30
31	# SplitPlug is a plugin for splitting input files into segments that will
32	# then be individually processed.
33
34	# This plugin should not be called directly. Instead, if you need to
35	# process input files that contain several documents, you should write a
36	# plugin with a process function that will handle one of those documents
37	# and have it inherit from SplitPlug. See ReferPlug for an example.
38
39
40	package SplitPlug;
41
42	use BasPlug;
43	use gsprintf 'gsprintf';
44	use util;
45
46	# SplitPlug is a sub-class of BasPlug.
# Establish the inheritance chain at compile time: SplitPlug derives
# from BasPlug, so any method not defined here is resolved there.
BEGIN {
    @SplitPlug::ISA = qw(BasPlug);
}
50
51
# Arguments this plugin adds on top of those it inherits.
#   split_exp - regular expression used to cut an input file into
#               segments. The declared default is empty; init() fills
#               in get_default_split_exp() when no value was supplied.
my $arguments = [
    {
        'name' => 'split_exp',
        'desc' => '{SplitPlug.split_exp}',
        'type' => 'regexp',
        #'deft' => &get_default_split_exp(),
        'deft' => '',
        'reqd' => 'no',
    },
];

# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => 'SplitPlug',
    'desc'     => '{SplitPlug.desc}',
    'abstract' => 'yes',
    'inherits' => 'yes',
    'args'     => $arguments,
};
65
66
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to BasPlug->new
#   $hashArgOptLists - hash ref (may be undef) whose "ArgList" and
#                      "OptList" entries receive this plugin's argument
#                      and option descriptions
#
# Returns: the object built by BasPlug->new, re-blessed into $class,
# with empty 'textcat_store' and 'metapass_srcdoc' hashes attached.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # Dereferencing inside push autovivifies $hashArgOptLists into a
    # hash ref when the caller passed none, so it is always defined by
    # the time the parent constructor is called.
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if defined $arguments;
    push(@{$hashArgOptLists->{"OptList"}}, $options) if defined $options;

    # Arrow form rather than "new BasPlug(...)": indirect-object syntax
    # is ambiguous inside a package that defines its own new().
    my $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'textcat_store'} = {};   # per-file language/encoding records
    $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
    return bless $self, $class;
}
81
# Initialise the plugin after construction.
#
# Parameters:
#   $verbosity, $outhandle, $failhandle - forwarded to BasPlug::init
#
# After the parent initialisation, falls back to the plugin's default
# process_exp when none (or an empty one) was configured, warning if the
# default is empty too, and falls back to the default split_exp when no
# true value was configured.
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle, $failhandle);

    if (!defined $self->{'process_exp'} || $self->{'process_exp'} eq "") {
        $self->{'process_exp'} = $self->get_default_process_exp();
        if ($self->{'process_exp'} eq "") {
            warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
        }
    }

    # set split_exp to default unless explicitly set
    if (!$self->{'split_exp'}) {
        $self->{'split_exp'} = $self->get_default_split_exp();
    }
}
103
104	# This plugin recurs over the segments it finds
# Reports that this plugin recurses: it always answers with a true
# value (1), regardless of arguments.
sub is_recursive {
    my $recursive = 1;
    return $recursive;
}
108
109	# By default, we split the input text at blank lines
# Default segment separator, returned as regular-expression source text:
# a newline, optional whitespace, then another newline (a blank line).
sub get_default_split_exp {
    my $blank_line_exp = q{\n\s*\n};
    return $blank_line_exp;
}
113
# Metadata pass: read a file, work out its language and encoding, and
# split its text into segments that read() will process later.
#
# Parameters are forwarded unchanged to SUPER::metadata_read; $base_dir
# and $file are additionally used to locate the file on disk.
#
# Side effects (all keyed by $file with leading slashes removed):
#   $self->{'metapass_srcdoc'}{$file} - reset to an empty hash
#   $self->{'textcat_store'}{$file}   - { language, encoding }, or undef
#                                       when the file contains no text
#   $self->{'split_segments'}{$file}  - array ref of the segments
#
# Returns: undef when the parent metadata_read reports no match, 0 when
# the file contains no word characters, otherwise the segment count.
sub metadata_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,
                                              $metadata, $extrametakeys,
                                              $extrametadata, $processor,
                                              $maxdocs, $gli);

    # Lexical, so the result cannot leak between calls or plugin
    # instances through a package global.
    my $split_matched = undef;
    return $split_matched unless $matched;

    my $outhandle = $self->{'outhandle'};
    my $filename = util::filename_cat($base_dir, $file);

    my $plugin_name = ref ($self);
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    $self->{'metapass_srcdoc'}->{$file} = {};

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
    my $le_rec = { 'language' => $language, 'encoding' => $encoding };
    $self->{'textcat_store'}->{$file} = $le_rec;

    # Read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, $encoding, $language, \$text);

    if ($text !~ /\w/) {
        gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n",
                 $file)
            if $self->{'verbosity'};

        my $failhandle = $self->{'failhandle'};
        print {$failhandle} "$file: " . ref($self) . ": file contains no text\n";
        $self->{'num_not_processed'} ++;

        # read() treats an undefined record as "no text was found"
        $self->{'textcat_store'}->{$file} = undef;

        return 0;
    }

    # Split the text into several smaller segments
    my $split_exp = $self->{'split_exp'};
    my @segments = split(/$split_exp/, $text);
    print {$outhandle} "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
        if $self->{'verbosity'};

    $self->{'split_segments'}->{$file} = \@segments;
    $split_matched = scalar(@segments);

    return $split_matched;
}
170
171
172
173	# The read function opens a file and splits it into parts.
174	# Each part is sent to the process function
175	#
176	# Returns: Number of document objects created (or undef if it fails)
177
# Build one document object per segment recorded by metadata_read() and
# hand each to the plugin's process() method and then to $processor.
#
# Parameters:
#   $pluginfo, $base_dir, $file, $metadata - also forwarded to process()
#   $processor - supplies 'OIDtype' and receives each finished document
#   $maxdocs, $total_count, $gli - accepted but not used here
#
# Returns:
#   undef - the file does not match process_exp or is not a plain file
#   0     - the file matches block_exp, or metadata_read() left no
#           language/encoding record for it (no text was found)
#   otherwise the sum of the values returned by process() across the
#   segments that were processed
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # Figure out the exact filename of this file (and maybe block it)
    my $filename = util::filename_cat($base_dir, $file);
    my $block_exp = $self->{'block_exp'};
    return 0 if defined $block_exp && $block_exp ne "" && $filename =~ /$block_exp/;
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        # Explicit undef (not a bare return) keeps the documented
        # "undef if it fails" result in every calling context.
        return undef;
    }
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    my $le_rec = $self->{'textcat_store'}->{$file};
    if (!defined $le_rec) {
        # means no text was found;
        return 0; # not processed but no point in passing it on
    }

    my $language = $le_rec->{'language'};
    my $encoding = $le_rec->{'encoding'};
    $self->{'textcat_store'}->{$file} = undef;

    my $segments = $self->{'split_segments'}->{$file};
    $self->{'split_segments'}->{$file} = undef;

    # Process each segment in turn
    my $segment = 0;  # 1-based number of the current segment
    my $count = 0;    # running total of process() results
    my $id;           # base OID, taken from the first document built
    foreach my $segtext (@$segments) {
        $segment++;

        if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
            # metadata is attached to a srcdoc
            next;
        }

        # create a new document
        my $doc_obj = doc->new($filename, "indexed_doc");
        my $top_section = $doc_obj->get_top_section();
        $doc_obj->set_OIDtype ($processor->{'OIDtype'});
        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $doc_obj->add_utf8_metadata($top_section, "Source", ghtml::dmsafe($filemeta));
        $doc_obj->add_utf8_metadata($top_section, "SourceSegment", "$segment");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
        #$doc_obj->add_metadata($top_section, "FileFormat", "Split");

        # Calculate a "base" document ID.
        if (!defined $id) {
            $doc_obj->set_OID();
            $id = $doc_obj->get_OID();
        }

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $top_section, $metadata);

        # do plugin specific processing of doc_obj
        print {$outhandle} "segment $segment - " if ($verbosity);
        my $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
        if (!defined $status) {
            print {$outhandle} "WARNING - no plugin could process segment $segment of $file\n"
                if ($verbosity >= 2);
            next;
        }
        # If the plugin returned 0, it threw away this part
        if ($status == 0) {
            next;
        }
        $count += $status;

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # add an OID
        $self->set_OID($doc_obj, $id, $segment);

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'} ++;
    }

    delete $self->{'metapass_srcdoc'}->{$file};

    # Return number of document objects produced
    return $count;
}
273
# Give a segment's document its object identifier: the base id, the
# letter "s", then the segment number (for example "<id>s3").
#
# Parameters:
#   $doc_obj        - document object whose set_OID method is called
#   $id             - base identifier for the source file
#   $segment_number - number of the segment within the file
sub set_OID {
    my ($self, $doc_obj, $id, $segment_number) = @_;

    my $segment_oid = $id . "s" . $segment_number;
    return $doc_obj->set_OID($segment_oid);
}
280
281	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: