source: trunk/gsdl/perllib/plugins/SplitPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 8.7 KB
Line 
1###########################################################################
2#
3# SplitPlug.pm - a plugin for splitting input files into segments that
4# will then be individually processed.
5#
6#
7# Copyright 2000 Gordon W. Paynter ([email protected])
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
31# SplitPlug is a plugin for splitting input files into segments that will
32# then be individually processed.
33
34# This plugin should not be called directly. Instead, if you need to
35# process input files that contain several documents, you should write a
36# plugin with a process function that will handle one of those documents
37# and have it inherit from SplitPlug. See ReferPlug for an example.
38
39
40package SplitPlug;
41
42use BasPlug;
43use gsprintf 'gsprintf';
44use util;
45
46use strict;
47no strict 'refs'; # allow filehandles to be variables and viceversa
48
49# SplitPlug is a sub-class of BasPlug.
50sub BEGIN {
51 @SplitPlug::ISA = ('BasPlug');
52}
53
54
55my $arguments =
56 [ { 'name' => "split_exp",
57 'desc' => "{SplitPlug.split_exp}",
58 'type' => "regexp",
59 #'deft' => &get_default_split_exp(),
60 'deft' => "",
61 'reqd' => "no" } ];
62
63my $options = { 'name' => "SplitPlug",
64 'desc' => "{SplitPlug.desc}",
65 'abstract' => "yes",
66 'inherits' => "yes",
67 'args' => $arguments };
68
69
70sub new {
71 my ($class) = shift (@_);
72 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
73 push(@$pluginlist, $class);
74
75 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
76 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
77
78 my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
79
80 $self->{'textcat_store'} = {};
81 $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
82 return bless $self, $class;
83}
84
85sub init {
86 my $self = shift (@_);
87 my ($verbosity, $outhandle, $failhandle) = @_;
88
89 $self->BasPlug::init($verbosity, $outhandle, $failhandle);
90
91 if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
92
93 $self->{'process_exp'} = $self->get_default_process_exp ();
94 if ($self->{'process_exp'} eq "") {
95 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
96 }
97 }
98
99
100 # set split_exp to default unless explicitly set
101 if (!$self->{'split_exp'}) {
102 $self->{'split_exp'} = $self->get_default_split_exp ();
103 }
104
105}
106
107# This plugin recurs over the segments it finds
108sub is_recursive {
109 return 1;
110}
111
112# By default, we split the input text at blank lines
113sub get_default_split_exp {
114 return q^\n\s*\n^;
115}
116
117sub metadata_read {
118 my $self = shift (@_);
119 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
120
121 my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,
122 $metadata, $extrametakeys,
123 $extrametadata, $processor,
124 $maxdocs, $gli);
125 my $split_matched = undef;
126
127 if ($matched) {
128
129 my $outhandle = $self->{'outhandle'};
130 my $filename = &util::filename_cat($base_dir, $file);
131
132 my $plugin_name = ref ($self);
133 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
134
135 $self->{'metapass_srcdoc'}->{$file} = {};
136
137 # Do encoding stuff
138 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
139 my $le_rec = { 'language' => $language, 'encoding' => $encoding };
140 $self->{'textcat_store'}->{$file} = $le_rec;
141
142 # Read in file ($text will be in utf8)
143 my $text = "";
144 $self->read_file ($filename, $encoding, $language, \$text);
145
146 if ($text !~ /\w/) {
147 gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n",
148 $file)
149 if $self->{'verbosity'};
150
151 my $failhandle = $self->{'failhandle'};
152 print $failhandle "$file: " . ref($self) . ": file contains no text\n";
153 $self->{'num_not_processed'} ++;
154
155 $self->{'textcat_store'}->{$file} = undef;
156
157 return 0;
158 }
159
160
161 # Split the text into several smaller segments
162 my $split_exp = $self->{'split_exp'};
163 my @segments = split(/$split_exp/, $text);
164 print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
165 if $self->{'verbosity'};
166
167 $self->{'split_segments'}->{$file} = \@segments;
168 $split_matched = scalar(@segments);
169 }
170
171 return $split_matched;
172}
173
174
175
176# The read function opens a file and splits it into parts.
177# Each part is sent to the process function
178#
179# Returns: Number of document objects created (or undef if it fails)
180
181sub read {
182 my $self = shift (@_);
183 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
184 my $outhandle = $self->{'outhandle'};
185 my $verbosity = $self->{'verbosity'};
186
187 # Figure out the exact filename of this file (and maybe block it)
188 my $filename = &util::filename_cat($base_dir, $file);
189 my $block_exp = $self->{'block_exp'};
190 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
191 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
192 return undef;
193 }
194 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
195
196 my $le_rec = $self->{'textcat_store'}->{$file};
197 if (!defined $le_rec) {
198 # means no text was found;
199 return 0; # not processed but no point in passing it on
200 }
201
202 my $language = $le_rec->{'language'};
203 my $encoding = $le_rec->{'encoding'};
204 $self->{'textcat_store'}->{$file} = undef;
205
206 my $segments = $self->{'split_segments'}->{$file};
207 $self->{'split_segments'}->{$file} = undef;
208
209 # Process each segment in turn
210 my ($count, $segment, $segtext, $status, $id);
211 $segment = 0;
212 $count = 0;
213 foreach $segtext (@$segments) {
214 $segment++;
215
216 if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
217 # metadata is attached to a srcdoc
218 next;
219 }
220
221 # create a new document
222 my $doc_obj = new doc ($filename, "indexed_doc");
223 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
224 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
225 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
226 my ($filemeta) = $file =~ /([^\\\/]+)$/;
227 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
228 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
229 if ($self->{'cover_image'}) {
230 $self->associate_cover_image($doc_obj, $filename);
231 }
232 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
233 #$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");
234
235 # Calculate a "base" document ID.
236 if (!defined $id) {
237 $doc_obj->set_OID();
238 $id = $doc_obj->get_OID();
239 }
240
241 # include any metadata passed in from previous plugins
242 # note that this metadata is associated with the top level section
243 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
244
245 # do plugin specific processing of doc_obj
246 print $outhandle "segment $segment - " if ($self->{'verbosity'});
247 $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
248 if (!defined $status) {
249 print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
250 if ($verbosity >= 2);
251 next;
252 }
253 # If the plugin returned 0, it threw away this part
254 if ($status == 0) {
255 next;
256 }
257 $count += $status;
258
259 # do any automatic metadata extraction
260 $self->auto_extract_metadata ($doc_obj);
261
262 # add an OID
263 $self->set_OID($doc_obj, $id, $segment);
264
265 # process the document
266 $processor->process($doc_obj);
267
268 $self->{'num_processed'} ++;
269 }
270
271 delete $self->{'metapass_srcdoc'}->{$file};
272
273 # Return number of document objects produced
274 return $count;
275}
276
277sub set_OID {
278 my $self = shift (@_);
279 my ($doc_obj, $id, $segment_number) = @_;
280
281 $doc_obj->set_OID($id . "s" . $segment_number);
282}
283
2841;
Note: See TracBrowser for help on using the repository browser.