source: trunk/gsdl/perllib/plugins/SplitPlug.pm@ 9853

Last change on this file since 9853 was 9853, checked in by kjdon, 19 years ago

fixed up maxdocs - now pass an extra parameter to the read function

  • Property svn:keywords set to Author Date Id Revision
File size: 8.7 KB
Line 
1###########################################################################
2#
3# SplitPlug.pm - a plugin for splitting input files into segments that
4# will then be individually processed.
5#
6#
7# Copyright 2000 Gordon W. Paynter ([email protected])
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
31# SplitPlug is a plugin for splitting input files into segments that will
32# then be individually processed.
33
34# This plugin should not be called directly. Instead, if you need to
35# process input files that contain several documents, you should write a
36# plugin with a process function that will handle one of those documents
37# and have it inherit from SplitPlug. See ReferPlug for an example.
38
39
40package SplitPlug;
41
42use BasPlug;
43use gsprintf 'gsprintf';
44use util;
45
# Inheritance is wired up at compile time: SplitPlug derives from BasPlug.
BEGIN {
    @SplitPlug::ISA = qw(BasPlug);
}
50
51
# Description of the single argument SplitPlug itself adds: split_exp,
# a regular expression whose default comes from get_default_split_exp().
my $arguments = [
    {
        'name' => "split_exp",
        'desc' => "{SplitPlug.split_exp}",
        'type' => "regexp",
        'deft' => get_default_split_exp(),
        'reqd' => "no",
    },
];

# Description record for this plugin.  new() appends it to the
# 'option_list' array held by the object returned from BasPlug's
# constructor.
my $options = {
    'name'     => "SplitPlug",
    'desc'     => "{SplitPlug.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
64
65
# Constructor.
#
# Arguments: the class name followed by the plugin's option list.
#
# Creates the underlying BasPlug object, appends SplitPlug's description
# record to its 'option_list', parses the split_exp option out of the
# argument list and sets up the per-file stores that metadata_read() fills
# and read() consumes.
#
# Returns the object blessed into $class.
# Dies (after printing a message to STDERR) if the options cannot be parsed.
sub new {
    my ($class) = @_;

    # $class is copied, not shifted, so @_ still begins with the class
    # name and BasPlug's constructor receives it twice.  That argument
    # list is deliberately left as it was.
    #
    # $self is a lexical: it used to be assigned without 'my', which made
    # it a package global overwritten by every construction.
    my $self = BasPlug->new($class, @_);

    $self->{'plugin_type'} = "SplitPlug";

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    if (!parsargv::parse(\@_,
                         q^split_exp/.*/^, \$self->{'split_exp'},
                         "allow_extra_options")) {
        print STDERR "\nIncorrect options passed to $class.";
        print STDERR "\nCheck your collect.cfg configuration file\n";
        die "\n";
    }

    # language/encoding record per file, keyed by the tidied file name
    $self->{'textcat_store'} = {};
    # which segments have valid metadata_srcdoc
    $self->{'metapass_srcdoc'} = {};

    return bless $self, $class;
}
88
# Prepare the plugin for a run.
#
# Arguments: verbosity, output handle and failure handle, all of which are
# handed straight to BasPlug::init.  Afterwards process_exp and split_exp
# are filled in from the plugin's defaults when they have not been set.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle, $failhandle);

    # Fall back to the default process_exp when none was configured, and
    # complain if even the default is empty.
    my $process_exp = $self->{'process_exp'};
    if (!defined $process_exp || $process_exp eq "") {
        $process_exp = $self->get_default_process_exp();
        $self->{'process_exp'} = $process_exp;

        warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n"
            if $process_exp eq "";
    }

    # set split_exp to default unless explicitly set
    if (!$self->{'split_exp'}) {
        $self->{'split_exp'} = $self->get_default_split_exp();
    }
}
110
# SplitPlug recurs over the segments it finds, so it always reports
# itself as recursive (returns 1).
sub is_recursive { return 1; }
115
# Default segment separator, returned as regular-expression source text:
# a newline, optional whitespace, then another newline (i.e. a blank line).
sub get_default_split_exp {
    return '\n\s*\n';
}
120
# Metadata pass: read the file once, detect its language and encoding, and
# split its text into segments that read() will process later.
#
# Arguments: $pluginfo, $base_dir, $file, $metadata, $extrametakeys,
# $extrametadata, $processor, $maxdocs, $gli -- all forwarded unchanged to
# the parent class's metadata_read.
#
# Side effects (keyed by $file with leading slashes removed):
#   $self->{'metapass_srcdoc'}->{$file}  reset to an empty hash
#   $self->{'textcat_store'}->{$file}    language/encoding record, or undef
#                                        when the file contains no text
#   $self->{'split_segments'}->{$file}   reference to the list of segments
#
# Returns: the number of segments found; 0 if the file has no word
# characters; undef if the parent's metadata_read returned a false value.
sub metadata_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys,
        $extrametadata, $processor, $maxdocs, $gli) = @_;

    my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,
                                              $metadata, $extrametakeys,
                                              $extrametadata, $processor,
                                              $maxdocs, $gli);

    # Lexical result.  This used to be an undeclared package global, i.e.
    # state shared by every plugin instance and every call.
    my $split_matched = undef;

    if ($matched) {

        my $outhandle = $self->{'outhandle'};
        my $filename = &util::filename_cat($base_dir, $file);

        my $plugin_name = ref ($self);
        $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

        $self->{'metapass_srcdoc'}->{$file} = {};

        # Do encoding stuff
        my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
        my $le_rec = { 'language' => $language, 'encoding' => $encoding };
        $self->{'textcat_store'}->{$file} = $le_rec;

        # Read in file ($text will be in utf8)
        my $text = "";
        $self->read_file ($filename, $encoding, $language, \$text);

        if ($text !~ /\w/) {
            gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n",
                     $file)
                if $self->{'verbosity'};

            my $failhandle = $self->{'failhandle'};
            print $failhandle "$file: " . ref($self) . ": file contains no text\n";
            $self->{'num_not_processed'} ++;

            # read() treats an undefined record as "no text was found"
            $self->{'textcat_store'}->{$file} = undef;

            return 0;
        }

        # Split the text into several smaller segments
        my $split_exp = $self->{'split_exp'};
        my @segments = split(/$split_exp/, $text);
        print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
            if $self->{'verbosity'};

        $self->{'split_segments'}->{$file} = \@segments;
        $split_matched = scalar(@segments);
    }

    return $split_matched;
}
177
178
179
# The read function takes the segments that metadata_read() split out of a
# file and sends each one to the process function as its own document.
#
# Arguments: $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs,
# $total_count, $gli.  $maxdocs, $total_count and $gli are accepted but not
# used by this implementation.
#
# Returns:
#   0      if the file matches block_exp, or if no language/encoding
#          record is stored for it (no text was found)
#   undef  if the file name does not match process_exp or is not a plain
#          file
#   otherwise the sum of the values returned by process() for the
#   segments, i.e. the number of document objects created
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # Figure out the exact filename of this file (and maybe block it)
    my $filename = util::filename_cat($base_dir, $file);
    my $block_exp = $self->{'block_exp'};
    return 0 if defined $block_exp && $block_exp ne "" && $filename =~ /$block_exp/;
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # Take the cached language/encoding record out of the store.  Deleting
    # (rather than assigning undef) keeps the store from accumulating one
    # dead key per file, matching how metapass_srcdoc is cleaned up below.
    my $le_rec = delete $self->{'textcat_store'}->{$file};
    if (!defined $le_rec) {
        # means no text was found;
        return 0; # not processed but no point in passing it on
    }

    my $language = $le_rec->{'language'};
    my $encoding = $le_rec->{'encoding'};

    my $segments = delete $self->{'split_segments'}->{$file};

    # Process each segment in turn
    my $count   = 0;   # running total of process() results
    my $segment = 0;   # 1-based number of the current segment
    my $id;            # base OID, taken from the first document created
    foreach my $segtext (@{ $segments || [] }) {
        $segment++;

        if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
            # metadata is attached to a srcdoc
            next;
        }

        # create a new document
        my $doc_obj = doc->new($filename, "indexed_doc");
        my $top_section = $doc_obj->get_top_section();
        $doc_obj->set_OIDtype ($processor->{'OIDtype'});
        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $doc_obj->add_utf8_metadata($top_section, "Source", ghtml::dmsafe($filemeta));
        $doc_obj->add_utf8_metadata($top_section, "SourceSegment", "$segment");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
        #$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");

        # Calculate a "base" document ID.
        if (!defined $id) {
            $doc_obj->set_OID();
            $id = $doc_obj->get_OID();
        }

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

        # do plugin specific processing of doc_obj
        print $outhandle "segment $segment - " if ($verbosity);
        my $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
        if (!defined $status) {
            print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
                if ($verbosity >= 2);
            next;
        }
        # If the plugin returned 0, it threw away this part
        next if $status == 0;
        $count += $status;

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # add an OID
        $self->set_OID($doc_obj, $id, $segment);

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'} ++;
    }

    delete $self->{'metapass_srcdoc'}->{$file};

    # Return number of document objects produced
    return $count;
}
280
# Assign a segment's document its OID: the base id, the letter "s", then
# the segment number.
#
# Arguments: the document object, the base id and the segment number.
sub set_OID {
    my ($self, $doc_obj, $id, $segment_number) = @_;

    my $segment_oid = join("s", $id, $segment_number);
    $doc_obj->set_OID($segment_oid);
}
287
2881;
Note: See TracBrowser for help on using the repository browser.