source: trunk/gsdl/perllib/plugins/SplitPlug.pm@ 11335

Last change on this file since 11335 was 11335, checked in by mdewsnip, 18 years ago

Now sends messages to the GLI telling the user the CDS/ISIS file is being processed.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.5 KB
Line 
1###########################################################################
2#
3# SplitPlug.pm - a plugin for splitting input files into segments that
4# will then be individually processed.
5#
6#
7# Copyright 2000 Gordon W. Paynter ([email protected])
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
31# SplitPlug is a plugin for splitting input files into segments that will
32# then be individually processed.
33
34# This plugin should not be called directly. Instead, if you need to
35# process input files that contain several documents, you should write a
36# plugin with a process function that will handle one of those documents
37# and have it inherit from SplitPlug. See ReferPlug for an example.
38
39
40package SplitPlug;
41
42use BasPlug;
43use gsprintf 'gsprintf';
44use util;
45
46use strict;
47no strict 'refs'; # allow filehandles to be variables and viceversa
48
# Inheritance: SplitPlug derives from BasPlug.  The parent list is
# assigned inside a BEGIN block so that it is in place at compile time.
BEGIN {
    @SplitPlug::ISA = qw(BasPlug);
}
53
54
# Argument descriptions contributed by this plugin.  The split_exp default
# is deliberately the empty string here (rather than the result of
# get_default_split_exp()); init() fills in the real default whenever the
# stored value is false.
my $arguments = [
    {
        'name' => "split_exp",
        'desc' => "{SplitPlug.split_exp}",
        'type' => "regexp",
        'deft' => "",
        'reqd' => "no",
    },
];

# Option record describing the plugin itself; 'args' points at the
# argument list above.
my $options = {
    'name'     => "SplitPlug",
    'desc'     => "{SplitPlug.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
68
69
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; the name of the class being constructed
#                      is appended to it
#   $inputargs       - handed to the BasPlug constructor untouched
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option
#                      record to its "OptList" entry
#
# Returns the object built by the BasPlug constructor, re-blessed into
# $class, with empty 'textcat_store' and 'metapass_srcdoc' hashes attached.
sub new {
    my $class = shift @_;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;

    if (defined $arguments) {
        push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    }
    if (defined $options) {
        push @{ $hashArgOptLists->{"OptList"} }, $options;
    }

    # NOTE(review): the pushes above autovivify $hashArgOptLists when the
    # caller did not supply one, so the two-argument call below looks
    # unreachable in practice - confirm before relying on it.
    my $self;
    if (defined $hashArgOptLists) {
        $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    }
    else {
        $self = BasPlug->new($pluginlist, $inputargs);
    }

    # Per-file scratch areas, keyed by file name.
    $self->{'textcat_store'}   = {};
    $self->{'metapass_srcdoc'} = {};    # which segments have valid metadata_srcdoc

    return bless $self, $class;
}
84
# Initialise the plugin after construction.
#
# Arguments: $verbosity, $outhandle, $failhandle - all three are forwarded
# to BasPlug::init unchanged.
#
# Afterwards two expressions are given defaults when they are missing:
#   process_exp - taken from get_default_process_exp() when undefined or
#                 empty; a warning is issued if it is still empty
#   split_exp   - taken from get_default_split_exp() when false
sub init {
    my $self = shift @_;
    my ($verbosity, $outhandle, $failhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle, $failhandle);

    my $current_process_exp = $self->{'process_exp'};
    if (!(defined $current_process_exp && $current_process_exp ne "")) {
        $self->{'process_exp'} = $self->get_default_process_exp();
        warn ref($self) . " Warning: plugin has no process_exp\n"
            if $self->{'process_exp'} eq "";
    }

    # Fall back to the default splitter unless one was explicitly set.
    if (not $self->{'split_exp'}) {
        $self->{'split_exp'} = $self->get_default_split_exp();
    }
}
106
# Reports this plugin as recursive (it works through the segments it
# finds in a file).  Always returns 1.
sub is_recursive {
    my $recursive = 1;
    return $recursive;
}
111
# Default split expression: input text is divided at blank lines, i.e. a
# newline, optional whitespace, and another newline.  Returned as regular
# expression source text, not as a compiled pattern.
sub get_default_split_exp {
    return '\n\s*\n';
}
116
# Metadata pass: read a file and split it into segments.
#
# The arguments are forwarded unchanged to the parent class's
# metadata_read.  When the parent reports a match, the file is read (via
# textcat_get_language_encoding and read_file), split with the plugin's
# 'split_exp', and the pieces are remembered for read():
#   $self->{'textcat_store'}->{$file}   - { language, encoding } record,
#                                         or undef if the file had no text
#   $self->{'split_segments'}->{$file}  - array ref of segment strings
#   $self->{'metapass_srcdoc'}->{$file} - reset to an empty hash
# ($file has any leading slashes/backslashes stripped before being used
# as a key.)
#
# Returns: undef when the parent did not match, 0 when the file contains
# no word characters, otherwise the number of segments found.
sub metadata_read {
    my $self = shift @_;
    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys,
        $extrametadata, $processor, $maxdocs, $gli) = @_;

    my $matched = $self->SUPER::metadata_read(
        $pluginfo,      $base_dir,  $file,
        $metadata,      $extrametakeys,
        $extrametadata, $processor,
        $maxdocs,       $gli
    );

    # Nothing to split if the parent class did not accept the file.
    return undef unless $matched;

    my $outhandle   = $self->{'outhandle'};
    my $plugin_name = ref($self);

    # The full path is built before $file is tidied up below.
    my $filename = util::filename_cat($base_dir, $file);
    $file =~ s/^[\/\\]+//;    # $file often begins with / so we'll tidy it up

    $self->{'metapass_srcdoc'}->{$file} = {};

    # Work out language and encoding, and remember them for read().
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);
    $self->{'textcat_store'}->{$file} = {
        'language' => $language,
        'encoding' => $encoding,
    };

    # Read in the file ($text will be in utf8).
    my $text = "";
    $self->read_file($filename, $encoding, $language, \$text);

    if ($text !~ /\w/) {
        if ($self->{'verbosity'}) {
            gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $file);
        }

        my $failhandle = $self->{'failhandle'};
        print {$failhandle} "$file: " . ref($self) . ": file contains no text\n";
        $self->{'num_not_processed'}++;

        # An undefined record tells read() that no text was found.
        $self->{'textcat_store'}->{$file} = undef;

        return 0;
    }

    # Split the text into several smaller segments.
    my $split_exp = $self->{'split_exp'};
    my @segments  = split(/$split_exp/, $text);
    if ($self->{'verbosity'}) {
        print {$outhandle} "SplitPlug found " . (scalar @segments) . " documents in $filename\n";
    }

    $self->{'split_segments'}->{$file} = \@segments;
    return scalar(@segments);
}
173
174
175
# The read function turns the segments that metadata_read() stored for a
# file into documents.  Each segment is sent to the process function.
#
# Arguments: $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs,
# $total_count, $gli.  All of them are passed on to read_block(), which
# decides whether the file is handled here and supplies the full file name.
#
# For every segment (numbered from 1) that is not flagged in
# $self->{'metapass_srcdoc'}->{$file}, a new doc object is created, given
# Language / Encoding / Source / SourceSegment / Plugin metadata, passed to
# $self->process(), and - if process() returned a non-zero status - handed
# to $processor->process() with an OID assigned through $self->set_OID().
#
# All per-file state kept for $file ('textcat_store', 'split_segments',
# 'metapass_srcdoc') is removed before returning, including on the early
# "no text" return, so the hashes do not grow with every file handled.
#
# Returns: whatever read_block() returned if that is undef or 0; 0 if no
# language/encoding record is stored for the file; otherwise the sum of the
# statuses returned by process() for the segments that were kept.
sub read {
    my $self = shift @_;
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # check process and block exps, smart block, etc
    my ($block_status, $filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status == 0));

    $file =~ s/^[\/\\]+//;    # $file often begins with / so we'll tidy it up

    # Remove (not merely blank) the stored record so the key is released.
    my $le_rec = delete $self->{'textcat_store'}->{$file};
    if (!defined $le_rec) {
        # Means no text was found.  Drop the bookkeeping metadata_read()
        # created for this file as well, otherwise it is never released.
        delete $self->{'metapass_srcdoc'}->{$file};
        return 0;    # not processed but no point in passing it on
    }

    my $language = $le_rec->{'language'};
    my $encoding = $le_rec->{'encoding'};

    my $segments = delete $self->{'split_segments'}->{$file};

    # Process each segment in turn
    my $count   = 0;
    my $segment = 0;
    my $id;    # "base" document ID, computed from the first document built
    foreach my $segtext (@{ $segments || [] }) {
        $segment++;

        if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
            # metadata is attached to a srcdoc
            next;
        }

        # create a new document
        my $doc_obj = doc->new($filename, "indexed_doc");
        $doc_obj->set_OIDtype($processor->{'OIDtype'});

        my $top_section = $doc_obj->get_top_section();
        my ($filemeta) = $file =~ /([^\\\/]+)$/;

        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
        $doc_obj->add_utf8_metadata($top_section, "Source", ghtml::dmsafe($filemeta));
        $doc_obj->add_utf8_metadata($top_section, "SourceSegment", "$segment");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");

        # Calculate a "base" document ID.
        if (!defined $id) {
            $doc_obj->set_OID();
            $id = $doc_obj->get_OID();
        }

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata($doc_obj, $top_section, $metadata);

        # do plugin specific processing of doc_obj
        print {$outhandle} "segment $segment - " if $verbosity;
        my $status = $self->process(\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
        if (!defined $status) {
            print {$outhandle} "WARNING - no plugin could process segment $segment of $file\n"
                if ($verbosity >= 2);
            next;
        }

        # If the plugin returned 0, it threw away this part
        next if $status == 0;
        $count += $status;

        # do any automatic metadata extraction
        $self->auto_extract_metadata($doc_obj);

        # add an OID
        $self->set_OID($doc_obj, $id, $segment);

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'}++;
    }

    delete $self->{'metapass_srcdoc'}->{$file};

    # Return number of document objects produced
    return $count;
}
273
# Give a segment's document its final OID: the base id, the letter "s",
# and the segment number, joined together (e.g. base "X" and segment 3
# give "Xs3").
#
# Arguments: $doc_obj (the document to label), $id (base id),
# $segment_number.
sub set_OID {
    my ($self, $doc_obj, $id, $segment_number) = @_;

    my $segment_oid = join("s", $id, $segment_number);
    $doc_obj->set_OID($segment_oid);
}
280
2811;
Note: See TracBrowser for help on using the repository browser.