source: main/trunk/greenstone2/perllib/plugins/SplitTextFile.pm@ 37179

Last change on this file since 37179 was 37179, checked in by davidb, 15 months ago

Only want to get OIDMetadata if OIDtype is assigned. Also adjusted full_id so it is more obivous that the ID that is being built does not have dots '.' in it

  • Property svn:keywords set to Author Date Id Revision
File size: 11.0 KB
Line 
1###########################################################################
2#
3# SplitTextFile.pm
4# -- A plugin for splitting input files into segments that will then
5# be individually processed.
6#
7# Copyright 2000 Gordon W. Paynter ([email protected])
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
31# SplitTextFile is a plugin for splitting input files into segments that will
32# then be individually processed.
33
34# This plugin should not be called directly. Instead, if you need to
35# process input files that contain several documents, you should write a
36# plugin with a process function that will handle one of those documents
37# and have it inherit from SplitTextFile. See ReferPlug for an example.
38
39
40package SplitTextFile;
41
42use ReadTextFile;
43use gsprintf 'gsprintf';
44use util;
45
46use strict;
47no strict 'refs'; # allow filehandles to be variables and viceversa
48
49# SplitTextFile is a sub-class of ReadTextFile
50sub BEGIN {
51 @SplitTextFile::ISA = ('ReadTextFile');
52}
53
54
55my $arguments =
56 [ { 'name' => "split_exp",
57 'desc' => "{SplitTextFile.split_exp}",
58 'type' => "regexp",
59 #'deft' => &get_default_split_exp(),
60 'deft' => "",
61 'reqd' => "no" } ];
62
63my $options = { 'name' => "SplitTextFile",
64 'desc' => "{SplitTextFile.desc}",
65 'abstract' => "yes",
66 'inherits' => "yes",
67 'args' => $arguments };
68
69
70sub new {
71 my ($class) = shift (@_);
72 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
73 push(@$pluginlist, $class);
74
75 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
76 push(@{$hashArgOptLists->{"OptList"}},$options);
77
78 my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
79
80 $self->{'textcat_store'} = {};
81 $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
82 return bless $self, $class;
83}
84
85sub init {
86 my $self = shift (@_);
87 my ($verbosity, $outhandle, $failhandle) = @_;
88
89 $self->ReadTextFile::init($verbosity, $outhandle, $failhandle);
90
91 # why is this is init and not in new??
92 if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
93
94 $self->{'process_exp'} = $self->get_default_process_exp ();
95 if ($self->{'process_exp'} eq "") {
96 warn ref($self) . " Warning: plugin has no process_exp\n";
97 }
98 }
99
100
101 # set split_exp to default unless explicitly set
102 if (!$self->{'split_exp'}) {
103 $self->{'split_exp'} = $self->get_default_split_exp ();
104 }
105
106}
107
108# This plugin recurs over the segments it finds
109sub is_recursive {
110 return 1;
111}
112
113# By default, we split the input text at blank lines
114sub get_default_split_exp {
115 return q^\n\s*\n^;
116}
117
118sub split_text_into_segments {
119 my $self = shift (@_);
120 my ($textref) = @_;
121
122
123 # Split the text into several smaller segments
124 my $split_exp = $self->{'split_exp'};
125 my @tmp = split(/$split_exp/i, $$textref);
126
127 my @segments =();
128 ## get rid of empty segments
129 foreach my $seg (@tmp){
130 if ($seg ne ""){
131 push @segments, $seg;
132 }
133 }
134
135 return \@segments;
136}
137
138sub metadata_read {
139 my $self = shift (@_);
140 my ($pluginfo, $base_dir, $file, $block_hash,
141 $extrametakeys, $extrametadata, $extrametafile,
142 $processor, $gli, $aux) = @_;
143
144 # can we process this file??
145 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
146
147 return undef unless $self->can_process_this_file($filename_full_path);
148
149 my $outhandle = $self->{'outhandle'};
150 my $filename = &util::filename_cat($base_dir, $file);
151
152 my $plugin_name = ref ($self);
153 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
154
155 $self->{'metapass_srcdoc'}->{$file} = {};
156
157 # Do encoding stuff
158 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
159 my $le_rec = { 'language' => $language, 'encoding' => $encoding };
160 $self->{'textcat_store'}->{$file} = $le_rec;
161
162 # Read in file ($text will be in utf8)
163 my $text = "";
164 $self->read_file ($filename, $encoding, $language, \$text);
165
166
167 if ($text !~ /\w/) {
168 gsprintf($outhandle, "$plugin_name: {ReadTextFile.file_has_no_text}\n",
169 $file)
170 if $self->{'verbosity'};
171
172 my $failhandle = $self->{'failhandle'};
173 print $failhandle "$file: " . ref($self) . ": file contains no text\n";
174 $self->{'num_not_processed'} ++;
175
176 $self->{'textcat_store'}->{$file} = undef;
177
178 return 0;
179 }
180
181
182 # Split the text into several smaller segments
183# my $split_exp = $self->{'split_exp'};
184# my @tmp = split(/$split_exp/i, $text);
185# my @segments =();
186# ## get rid of empty segments
187# foreach my $seg (@tmp){
188# if ($seg ne ""){
189# push @segments, $seg;
190# }
191# }
192#
193# print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n"
194# if $self->{'verbosity'};
195#
196# $self->{'split_segments'}->{$file} = \@segments;
197# return scalar(@segments);
198
199 my $segments = $self->split_text_into_segments(\$text);
200
201 my $num_segments = scalar(@$segments);
202
203 print $outhandle "SplitTextFile found $num_segments documents in $filename\n"
204 if $self->{'verbosity'};
205
206 $self->{'split_segments'}->{$file} = $segments;
207
208 return $num_segments;
209}
210
211
212
213# The read function opens a file and splits it into parts.
214# Each part is sent to the process function
215#
216# Returns: Number of document objects created (or undef if it fails)
217
218sub read {
219 my $self = shift (@_);
220 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
221 my $outhandle = $self->{'outhandle'};
222 my $verbosity = $self->{'verbosity'};
223
224 # can we process this file??
225 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
226 return undef unless $self->can_process_this_file($filename_full_path);
227
228 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
229
230 my $le_rec = $self->{'textcat_store'}->{$file};
231 if (!defined $le_rec) {
232 # means no text was found;
233 return 0; # not processed but no point in passing it on
234 }
235
236 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
237 print $outhandle "$self->{'plugin_type'} processing $file\n"
238 if $self->{'verbosity'} > 1;
239
240 my $language = $le_rec->{'language'};
241 my $encoding = $le_rec->{'encoding'};
242 $self->{'textcat_store'}->{$file} = undef;
243
244 my $segments = $self->{'split_segments'}->{$file};
245 $self->{'split_segments'}->{$file} = undef;
246
247 # Process each segment in turn
248 my ($count, $segment, $segtext, $status, $id);
249 $segment = 0;
250 $count = 0;
251 foreach $segtext (@$segments) {
252 $segment++;
253 if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
254 # metadata is attached to a srcdoc
255 next;
256 }
257
258 # create a new document
259 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
260 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
261 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
262
263 my ($filemeta) = $file =~ /([^\\\/]+)$/;
264 my $plugin_filename_encoding = $self->{'filename_encoding'};
265 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
266 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
267
268 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
269 if ($self->{'cover_image'}) {
270 $self->associate_cover_image($doc_obj, $filename_full_path);
271 }
272 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
273 #$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");
274
275# # include any metadata passed in from previous plugins
276# # note that this metadata is associated with the top level section
277# $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
278
279# # Calculate a "base" document ID.
280# if (!defined $id) {
281# $id = $self->get_base_OID($doc_obj);
282# }
283
284 # include any metadata passed in from previous plugins
285 # note that this metadata is associated with the top level section
286 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
287
288 # do plugin specific processing of doc_obj
289 print $outhandle "segment $segment\n" if ($self->{'verbosity'});
290 print STDERR "<Processing s='$segment' n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
291 $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
292 if (!defined $status) {
293 print $outhandle "WARNING: no plugin could process segment $segment of $file\n"
294 if ($verbosity >= 2);
295 print STDERR "<ProcessingError s='$segment' n='$file'>\n" if $gli;
296 next;
297 }
298 # If the plugin returned 0, it threw away this part
299 if ($status == 0) {
300 next;
301 }
302 $count += $status;
303
304 # do any automatic metadata extraction
305 $self->auto_extract_metadata ($doc_obj);
306
307 # This used to be done earlier on in routine, however $id generated
308 # isn't used until here!
309 # Calculate a "base" document ID.
310 if (!defined $id) {
311 $id = $self->get_base_OID($doc_obj);
312 }
313
314 # add an OID
315 $self->add_segment_OID($doc_obj, $id, $segment);
316
317 # process the document
318 $processor->process($doc_obj);
319
320 $self->{'num_processed'} ++;
321
322 if ($maxdocs != -1 && $self->{'num_processed'} >= $maxdocs) {
323 last;
324 }
325 }
326
327 delete $self->{'metapass_srcdoc'}->{$file};
328
329 # Return number of document objects produced
330 return $count;
331}
332
333sub get_base_OID {
334 my $self = shift(@_);
335 my ($doc_obj) = @_;
336
337 if ($self->{'OIDtype'} eq "assigned") {
338 my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
339 # print STDERR "**** get_base_OID() assigned identifier = $identifier\n";
340 }
341
342 $self->SUPER::add_OID($doc_obj);
343 return $doc_obj->get_OID();
344}
345
346sub add_segment_OID {
347 my $self = shift (@_);
348 my ($doc_obj, $id, $segment) = @_;
349
350 my $full_id = "${id}s${segment}";
351 if ($self->{'OIDtype'} eq "assigned") {
352 my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
353 if (defined $identifier && $identifier ne "") {
354 $full_id = $identifier;
355 $full_id = &util::tidy_up_oid($full_id);
356 }
357 }
358 $doc_obj->set_OID($full_id);
359}
360
361
3621;
Note: See TracBrowser for help on using the repository browser.