Context Navigation

SplitTextFile.pm--for-gs311@ 36297

Last change on this file since 36297 was 34878, checked in by davidb, 3 years ago
Further changes to work more smoothly with JSONSparqlResultsPlugin, which makes use of OIDtype metadata for each record to provide nice IDs
Property svn:keywords set to `Author Date Id Revision`
File size: 10.9 KB

Line
1	###########################################################################
2	#
3	# SplitTextFile.pm - a plugin for splitting input files into segments that
4	# will then be individually processed.
5	#
6	#
7	# Copyright 2000 Gordon W. Paynter ([email protected])
8	# Copyright 2000 The New Zealand Digital Library Project
9	#
10	# A component of the Greenstone digital library software
11	# from the New Zealand Digital Library Project at the
12	# University of Waikato, New Zealand.
13	#
14	# This program is free software; you can redistribute it and/or modify
15	# it under the terms of the GNU General Public License as published by
16	# the Free Software Foundation; either version 2 of the License, or
17	# (at your option) any later version.
18	#
19	# This program is distributed in the hope that it will be useful,
20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	# GNU General Public License for more details.
23	#
24	# You should have received a copy of the GNU General Public License
25	# along with this program; if not, write to the Free Software
26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	#
28	###########################################################################
29
30
31	# SplitTextFile is a plugin for splitting input files into segments that will
32	# then be individually processed.
33
34	# This plugin should not be called directly. Instead, if you need to
35	# process input files that contain several documents, you should write a
36	# plugin with a process function that will handle one of those documents
37	# and have it inherit from SplitTextFile. See ReferPlug for an example.
38
39
40	package SplitTextFile;
41
42	use ReadTextFile;
43	use gsprintf 'gsprintf';
44	use util;
45
46	use strict;
47	no strict 'refs'; # allow filehandles to be variables and viceversa
48
49	# SplitTextFile is a sub-class of ReadTextFile
50	sub BEGIN {
51	@SplitTextFile::ISA = ('ReadTextFile');
52	}
53
54
55	my $arguments =
56	[ { 'name' => "split_exp",
57	'desc' => "{SplitTextFile.split_exp}",
58	'type' => "regexp",
59	#'deft' => &get_default_split_exp(),
60	'deft' => "",
61	'reqd' => "no" } ];
62
63	my $options = { 'name' => "SplitTextFile",
64	'desc' => "{SplitTextFile.desc}",
65	'abstract' => "yes",
66	'inherits' => "yes",
67	'args' => $arguments };
68
69
70	sub new {
71	my ($class) = shift (@_);
72	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
73	push(@$pluginlist, $class);
74
75	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
76	push(@{$hashArgOptLists->{"OptList"}},$options);
77
78	my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
79
80	$self->{'textcat_store'} = {};
81	$self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
82	return bless $self, $class;
83	}
84
85	sub init {
86	my $self = shift (@_);
87	my ($verbosity, $outhandle, $failhandle) = @_;
88
89	$self->ReadTextFile::init($verbosity, $outhandle, $failhandle);
90
91	# why is this is init and not in new??
92	if ((!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
93
94	$self->{'process_exp'} = $self->get_default_process_exp ();
95	if ($self->{'process_exp'} eq "") {
96	warn ref($self) . " Warning: plugin has no process_exp\n";
97	}
98	}
99
100
101	# set split_exp to default unless explicitly set
102	if (!$self->{'split_exp'}) {
103	$self->{'split_exp'} = $self->get_default_split_exp ();
104	}
105
106	}
107
108	# This plugin recurs over the segments it finds
109	sub is_recursive {
110	return 1;
111	}
112
113	# By default, we split the input text at blank lines
114	sub get_default_split_exp {
115	return q^\n\s*\n^;
116	}
117
118	sub split_text_into_segments {
119	my $self = shift (@_);
120	my ($textref) = @_;
121
122
123	# Split the text into several smaller segments
124	my $split_exp = $self->{'split_exp'};
125	my @tmp = split(/$split_exp/i, $$textref);
126
127	my @segments =();
128	## get rid of empty segments
129	foreach my $seg (@tmp){
130	if ($seg ne ""){
131	push @segments, $seg;
132	}
133	}
134
135	return \@segments;
136	}
137
138	sub metadata_read {
139	my $self = shift (@_);
140	my ($pluginfo, $base_dir, $file, $block_hash,
141	$extrametakeys, $extrametadata, $extrametafile,
142	$processor, $gli, $aux) = @_;
143
144	# can we process this file??
145	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
146
147	return undef unless $self->can_process_this_file($filename_full_path);
148
149	my $outhandle = $self->{'outhandle'};
150	my $filename = &util::filename_cat($base_dir, $file);
151
152	my $plugin_name = ref ($self);
153	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
154
155	$self->{'metapass_srcdoc'}->{$file} = {};
156
157	# Do encoding stuff
158	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
159	my $le_rec = { 'language' => $language, 'encoding' => $encoding };
160	$self->{'textcat_store'}->{$file} = $le_rec;
161
162	# Read in file ($text will be in utf8)
163	my $text = "";
164	$self->read_file ($filename, $encoding, $language, \$text);
165
166
167	if ($text !~ /\w/) {
168	gsprintf($outhandle, "$plugin_name: {ReadTextFile.file_has_no_text}\n",
169	$file)
170	if $self->{'verbosity'};
171
172	my $failhandle = $self->{'failhandle'};
173	print $failhandle "$file: " . ref($self) . ": file contains no text\n";
174	$self->{'num_not_processed'} ++;
175
176	$self->{'textcat_store'}->{$file} = undef;
177
178	return 0;
179	}
180
181
182	# Split the text into several smaller segments
183	# my $split_exp = $self->{'split_exp'};
184	# my @tmp = split(/$split_exp/i, $text);
185	# my @segments =();
186	# ## get rid of empty segments
187	# foreach my $seg (@tmp){
188	# if ($seg ne ""){
189	# push @segments, $seg;
190	# }
191	# }
192	#
193	# print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n"
194	# if $self->{'verbosity'};
195	#
196	# $self->{'split_segments'}->{$file} = \@segments;
197	# return scalar(@segments);
198
199	my $segments = $self->split_text_into_segments(\$text);
200
201	my $num_segments = scalar(@$segments);
202
203	print $outhandle "SplitTextFile found $num_segments documents in $filename\n"
204	if $self->{'verbosity'};
205
206	$self->{'split_segments'}->{$file} = $segments;
207
208	return $num_segments;
209	}
210
211
212
213	# The read function opens a file and splits it into parts.
214	# Each part is sent to the process function
215	#
216	# Returns: Number of document objects created (or undef if it fails)
217
218	sub read {
219	my $self = shift (@_);
220	my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
221	my $outhandle = $self->{'outhandle'};
222	my $verbosity = $self->{'verbosity'};
223
224	# can we process this file??
225	my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
226	return undef unless $self->can_process_this_file($filename_full_path);
227
228	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
229
230	my $le_rec = $self->{'textcat_store'}->{$file};
231	if (!defined $le_rec) {
232	# means no text was found;
233	return 0; # not processed but no point in passing it on
234	}
235
236	print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
237	print $outhandle "$self->{'plugin_type'} processing $file\n"
238	if $self->{'verbosity'} > 1;
239
240	my $language = $le_rec->{'language'};
241	my $encoding = $le_rec->{'encoding'};
242	$self->{'textcat_store'}->{$file} = undef;
243
244	my $segments = $self->{'split_segments'}->{$file};
245	$self->{'split_segments'}->{$file} = undef;
246
247	# Process each segment in turn
248	my ($count, $segment, $segtext, $status, $id);
249	$segment = 0;
250	$count = 0;
251	foreach $segtext (@$segments) {
252	$segment++;
253
254	if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
255	# metadata is attached to a srcdoc
256	next;
257	}
258
259	# create a new document
260	my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
261	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
262	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
263
264	my ($filemeta) = $file =~ /([^\\\/]+)$/;
265	my $plugin_filename_encoding = $self->{'filename_encoding'};
266	my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
267	$self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
268
269	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
270	if ($self->{'cover_image'}) {
271	$self->associate_cover_image($doc_obj, $filename_full_path);
272	}
273	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
274	#$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");
275
276	# # include any metadata passed in from previous plugins
277	# # note that this metadata is associated with the top level section
278	# $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
279
280	# # Calculate a "base" document ID.
281	# if (!defined $id) {
282	# $id = $self->get_base_OID($doc_obj);
283	# }
284
285	# include any metadata passed in from previous plugins
286	# note that this metadata is associated with the top level section
287	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
288
289	# do plugin specific processing of doc_obj
290	print $outhandle "segment $segment\n" if ($self->{'verbosity'});
291	print STDERR "<Processing s='$segment' n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
292	$status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
293	if (!defined $status) {
294	print $outhandle "WARNING: no plugin could process segment $segment of $file\n"
295	if ($verbosity >= 2);
296	print STDERR "<ProcessingError s='$segment' n='$file'>\n" if $gli;
297	next;
298	}
299	# If the plugin returned 0, it threw away this part
300	if ($status == 0) {
301	next;
302	}
303	$count += $status;
304
305	# do any automatic metadata extraction
306	$self->auto_extract_metadata ($doc_obj);
307
308	# This used to be done earlier on in routine, however $id generated
309	# isn't used until here!
310	# Calculate a "base" document ID.
311	if (!defined $id) {
312	$id = $self->get_base_OID($doc_obj);
313	}
314
315	# add an OID
316	$self->add_segment_OID($doc_obj, $id, $segment);
317
318	# process the document
319	$processor->process($doc_obj);
320
321	$self->{'num_processed'} ++;
322
323	if ($self->{'num_processed'} >= $maxdocs) {
324	last;
325	}
326	}
327
328	delete $self->{'metapass_srcdoc'}->{$file};
329
330	# Return number of document objects produced
331	return $count;
332	}
333
334	sub get_base_OID {
335	my $self = shift(@_);
336	my ($doc_obj) = @_;
337
338	my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
339	# print STDERR "**** get_baseOID identifier = $identifier\n";
340
341	$self->SUPER::add_OID($doc_obj);
342	return $doc_obj->get_OID();
343	}
344
345	sub add_segment_OID {
346	my $self = shift (@_);
347	my ($doc_obj, $id, $segment) = @_;
348
349	my $full_id = $id . "s" . $segment;
350	if ($self->{'OIDtype'} eq "assigned") {
351	my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
352	if (defined $identifier && $identifier ne "") {
353	$full_id = $identifier;
354	$full_id = &util::tidy_up_oid($full_id);
355	}
356	}
357	$doc_obj->set_OID($full_id);
358	}
359
360
361	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format