Context Navigation

source: gsdl/trunk/perllib/plugins/SplitTextFile.pm@ 16104

Last change on this file since 16104 was 16104, checked in by kjdon, 16 years ago
tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process
Property svn:keywords set to `Author Date Id Revision`
File size: 8.9 KB

Line
1	###########################################################################
2	#
3	# SplitTextFile.pm - a plugin for splitting input files into segments that
4	# will then be individually processed.
5	#
6	#
7	# Copyright 2000 Gordon W. Paynter ([email protected])
8	# Copyright 2000 The New Zealand Digital Library Project
9	#
10	# A component of the Greenstone digital library software
11	# from the New Zealand Digital Library Project at the
12	# University of Waikato, New Zealand.
13	#
14	# This program is free software; you can redistribute it and/or modify
15	# it under the terms of the GNU General Public License as published by
16	# the Free Software Foundation; either version 2 of the License, or
17	# (at your option) any later version.
18	#
19	# This program is distributed in the hope that it will be useful,
20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	# GNU General Public License for more details.
23	#
24	# You should have received a copy of the GNU General Public License
25	# along with this program; if not, write to the Free Software
26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	#
28	###########################################################################
29
30
31	# SplitTextFile is a plugin for splitting input files into segments that will
32	# then be individually processed.
33
34	# This plugin should not be called directly. Instead, if you need to
35	# process input files that contain several documents, you should write a
36	# plugin with a process function that will handle one of those documents
37	# and have it inherit from SplitTextFile. See ReferPlug for an example.
38
39
40	package SplitTextFile;
41
42	use ReadTextFile;
43	use gsprintf 'gsprintf';
44	use util;
45
46	use strict;
47	no strict 'refs'; # allow filehandles to be variables and viceversa
48
# SplitTextFile is a sub-class of ReadTextFile. The inheritance list is
# assigned inside BEGIN so that it is in place at compile time.
BEGIN {
    @SplitTextFile::ISA = qw(ReadTextFile);
}
53
54
# Argument descriptions contributed by this plugin. The only argument is
# split_exp, a regular expression; its declared default is the empty string
# (the call to get_default_split_exp() is deliberately left commented out).
my $arguments = [
    {
        'name' => "split_exp",
        'desc' => "{SplitTextFile.split_exp}",
        'type' => "regexp",
        #'deft' => &get_default_split_exp(),
        'deft' => "",
        'reqd' => "no",
    },
];

# Option record describing the plugin itself; it is marked abstract and
# carries the argument list declared above.
my $options = {
    'name'     => "SplitTextFile",
    'desc'     => "{SplitTextFile.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
68
69
# Constructor.
#
# Arguments:
#   $pluginlist      - array ref of plugin class names; $class is appended
#   $inputargs       - passed through unchanged to the ReadTextFile constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the object built by ReadTextFile, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call rather than indirect-object syntax
    # ("new ReadTextFile (...)"), which Perl can mis-parse.
    my $self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Per-file state handed from metadata_read() to read(), keyed by file name.
    $self->{'textcat_store'}   = {};  # language/encoding record per file
    $self->{'split_segments'}  = {};  # array ref of text segments per file
    $self->{'metapass_srcdoc'} = {};  # which segments have valid metadata_srcdoc

    return bless $self, $class;
}
84
# Initialise the plugin after construction.
#
# Arguments ($verbosity, $outhandle, $failhandle) are forwarded to
# ReadTextFile::init. Afterwards the two expressions this plugin relies on
# are given defaults when they were not explicitly set:
#   process_exp - from get_default_process_exp(); a warning is issued when
#                 that default is empty too
#   split_exp   - from get_default_split_exp()
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    $self->ReadTextFile::init($verbosity, $outhandle, $failhandle);

    # NOTE: it is not clear why this is done in init rather than in new.
    if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {

        $self->{'process_exp'} = $self->get_default_process_exp ();

        # Treat an undefined default the same way as an empty one.
        if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
            warn ref($self) . " Warning: plugin has no process_exp\n";
        }
    }

    # set split_exp to default unless explicitly set
    if (!$self->{'split_exp'}) {
        $self->{'split_exp'} = $self->get_default_split_exp ();
    }

}
107
# This plugin recurses over the segments it finds, so it always reports
# itself as recursive (returns 1).
sub is_recursive {
    my $recursive = 1;
    return $recursive;
}
112
# Default split expression: the input text is split at blank lines.
# The value is returned as pattern source text (backslashes are literal),
# not as a compiled regular expression.
sub get_default_split_exp {
    my $blank_line_pattern = '\n\s*\n';
    return $blank_line_pattern;
}
117
# Metadata pass over one file: reads it, splits it into segments and stores
# the results for the later read() call.
#
# The parent class's metadata_read decides whether the file matches (it
# returns 1 if the file matches process_exp, and has done blocking in the
# meantime). For a matching file this method then
#   - records the detected language/encoding in textcat_store->{$file},
#   - reads the whole file and splits it on split_exp (case-insensitively),
#     dropping empty segments,
#   - stores the segments in split_segments->{$file}.
#
# Returns:
#   undef - the parent did not match the file
#   0     - the file contains no word characters (reported on the fail handle)
#   N     - number of non-empty segments found
sub metadata_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,
                                              $metadata, $extrametakeys,
                                              $extrametadata, $processor,
                                              $maxdocs, $gli);

    # Explicit undef (not a bare return) so that list-context callers still
    # receive a single value, exactly as before.
    return undef unless $matched;

    my $outhandle   = $self->{'outhandle'};
    my $filename    = util::filename_cat($base_dir, $file);
    my $plugin_name = ref ($self);

    # $file often begins with / so we'll tidy it up
    $file =~ s/^[\/\\]+//;

    $self->{'metapass_srcdoc'}->{$file} = {};

    # Work out language and encoding, and remember them for read()
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
    $self->{'textcat_store'}->{$file} = {
        'language' => $language,
        'encoding' => $encoding,
    };

    # Read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, $encoding, $language, \$text);

    if ($text !~ /\w/) {
        if ($self->{'verbosity'}) {
            gsprintf($outhandle, "$plugin_name: {ReadTextFile.file_has_no_text}\n",
                     $file);
        }

        my $failhandle = $self->{'failhandle'};
        print $failhandle "$file: " . ref($self) . ": file contains no text\n";
        $self->{'num_not_processed'} ++;

        # An undefined record tells read() that no text was found
        $self->{'textcat_store'}->{$file} = undef;

        return 0;
    }

    # Split the text into several smaller segments, discarding empty ones
    my $split_exp = $self->{'split_exp'};
    my @segments = grep { $_ ne "" } split(/$split_exp/i, $text);

    if ($self->{'verbosity'}) {
        print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n";
    }

    $self->{'split_segments'}->{$file} = \@segments;

    return scalar(@segments);
}
184
185
186
# The read function turns the segments found by metadata_read() into
# documents. Each segment is sent to the process function.
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs,
#             $total_count, $gli); the full list is also handed to read_block().
#
# Returns:
#   - the status from read_block() when that is undef or 0,
#   - 0 when metadata_read() stored no language/encoding record for the file
#     (i.e. no text was found),
#   - otherwise the sum of the statuses returned by process() for the
#     segments that were kept.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # check process and block exps, smart block, etc
    my ($block_status, $filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status == 0));

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    my $le_rec = $self->{'textcat_store'}->{$file};
    if (!defined $le_rec) {
        # means no text was found; drop the per-file bookkeeping that
        # metadata_read() created so it does not accumulate
        delete $self->{'metapass_srcdoc'}->{$file};
        return 0; # not processed but no point in passing it on
    }

    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $verbosity > 1;

    my $language = $le_rec->{'language'};
    my $encoding = $le_rec->{'encoding'};
    $self->{'textcat_store'}->{$file} = undef;

    # Take ownership of the stored segments; tolerate a missing list
    my $segments = $self->{'split_segments'}->{$file} || [];
    $self->{'split_segments'}->{$file} = undef;

    # Process each segment in turn
    my $count   = 0;  # running total of process() statuses
    my $segment = 0;  # 1-based segment number
    my $id;           # "base" document ID, taken from the first document built

    foreach my $segtext (@$segments) {
        $segment++;

        if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
            # metadata is attached to a srcdoc
            next;
        }

        # create a new document
        my $doc_obj = doc->new($filename, "indexed_doc");
        my $top_section = $doc_obj->get_top_section();
        $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);

        # Source metadata uses the file name without any leading directories
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
        $doc_obj->add_utf8_metadata($top_section, "SourceSegment", "$segment");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
        #$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");

        # Calculate a "base" document ID.
        if (!defined $id) {
            $doc_obj->set_OID();
            $id = $doc_obj->get_OID();
        }

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $top_section, $metadata);

        # do plugin specific processing of doc_obj
        print $outhandle "segment $segment\n" if ($verbosity);
        my $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
        if (!defined $status) {
            print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
                if ($verbosity >= 2);
            next;
        }
        # If the plugin returned 0, it threw away this part
        next if ($status == 0);

        $count += $status;

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # add an OID
        $self->set_OID($doc_obj, $id, $segment);

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'} ++;
    }

    delete $self->{'metapass_srcdoc'}->{$file};

    # Return number of document objects produced
    return $count;
}
288
# Give a segment's document its object identifier: the base ID followed by
# the letter "s" and the segment number.
#
#   $doc_obj        - document object; its set_OID method receives the new ID
#   $id             - base document ID
#   $segment_number - number of the segment within the source file
sub set_OID {
    my ($self, $doc_obj, $id, $segment_number) = @_;

    my $segment_oid = join("s", $id, $segment_number);
    $doc_obj->set_OID($segment_oid);
}
295
296	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: