source: gsdl/trunk/perllib/plugins/SplitTextFile.pm@ 16104

Last change on this file since 16104 was 16104, checked in by kjdon, 16 years ago

tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process

  • Property svn:keywords set to Author Date Id Revision
File size: 8.9 KB
Line 
1###########################################################################
2#
3# SplitTextFile.pm - a plugin for splitting input files into segments that
4# will then be individually processed.
5#
6#
7# Copyright 2000 Gordon W. Paynter ([email protected])
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
31# SplitTextFile is a plugin for splitting input files into segments that will
32# then be individually processed.
33
34# This plugin should not be called directly. Instead, if you need to
35# process input files that contain several documents, you should write a
36# plugin with a process function that will handle one of those documents
37# and have it inherit from SplitTextFile. See ReferPlug for an example.
38
39
40package SplitTextFile;
41
42use ReadTextFile;
43use gsprintf 'gsprintf';
44use util;
45
46use strict;
47no strict 'refs'; # allow filehandles to be variables and viceversa
48
# SplitTextFile is a sub-class of ReadTextFile.  The inheritance chain is
# set up at compile time so that it is already in place when the rest of
# this file is compiled.
BEGIN {
    @SplitTextFile::ISA = ('ReadTextFile');
}
53
54
# Arguments that this plugin adds on top of those it inherits.  There is
# only one: the regular expression used to cut an input file into segments.
# Its declared default is the empty string; init() substitutes the value of
# get_default_split_exp() when nothing was supplied.
my $arguments = [
    {
        'name' => "split_exp",
        'desc' => "{SplitTextFile.split_exp}",
        'type' => "regexp",
        #'deft' => &get_default_split_exp(),
        'deft' => "",
        'reqd' => "no",
    },
];

# Description record for this plugin; new() appends it to the caller's
# "OptList".
my $options = {
    'name'     => "SplitTextFile",
    'desc'     => "{SplitTextFile.desc}",
    'abstract' => "yes",
    'inherits' => "yes",
    'args'     => $arguments,
};
68
69
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the ReadTextFile constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the object built by ReadTextFile, re-blessed into $class, with
# the per-file caches used by metadata_read() and read() initialised.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit method-call syntax: the indirect-object form
    # ("new ReadTextFile(...)") is ambiguous to the parser and discouraged.
    my $self = ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Per-file state handed from metadata_read() to read(), keyed by the
    # tidied file name.
    $self->{'textcat_store'} = {};   # language/encoding record per file
    $self->{'split_segments'} = {};  # array ref of text segments per file
    $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc

    return bless $self, $class;
}
84
# Initialise the plugin: run the ReadTextFile initialisation, then fill in
# process_exp and split_exp from the plugin's defaults when they have not
# been set.
#
# Arguments: $verbosity, $outhandle, $failhandle - handed straight to
# ReadTextFile::init.
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->ReadTextFile::init($verbosity, $outhandle, $failhandle);

    # (Open question left by the original author: why is this done in init
    # and not in new?)
    my $has_process_exp =
        defined $self->{'process_exp'} && $self->{'process_exp'} ne "";
    if (!$has_process_exp) {
        $self->{'process_exp'} = $self->get_default_process_exp ();

        # Even the default may be empty; tell the user, naming the concrete
        # plugin class.
        warn ref($self) . " Warning: plugin has no process_exp\n"
            if $self->{'process_exp'} eq "";
    }

    # set split_exp to default unless explicitly set
    if (!$self->{'split_exp'}) {
        $self->{'split_exp'} = $self->get_default_split_exp ();
    }
}
107
# Always true: this plugin recurs over the segments it finds in a file.
sub is_recursive {
    return 1;
}
112
# Default split expression: by default the input text is split at blank
# lines (a newline, optional whitespace, then another newline).  The value
# is returned as regular-expression source text, so the backslashes are
# literal characters in the returned string.
sub get_default_split_exp {
    return '\n\s*\n';
}
117
# Metadata pass for one file.  When the inherited metadata_read accepts the
# file, its text is read, split with split_exp, and the non-empty segments
# are remembered (together with the detected language and encoding) for the
# later call to read().
#
# Returns:
#   undef - the inherited metadata_read did not match the file
#   0     - the file contains no word characters; it is counted as not
#           processed and reported on the fail handle
#   N     - the number of non-empty segments found
sub metadata_read {
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys,
        $extrametadata, $processor, $maxdocs, $gli) = @_;

    # returns 1 if matches process_exp, and has done blocking in the meantime
    my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,
                                              $metadata, $extrametakeys,
                                              $extrametadata, $processor,
                                              $maxdocs, $gli);

    # An explicit undef (not a bare return) so that a caller in list context
    # still receives a single undefined element.
    return undef unless $matched;

    my $outhandle   = $self->{'outhandle'};
    my $plugin_name = ref ($self);

    # The full path is built from $file as passed in, before it is tidied.
    my $filename = &util::filename_cat($base_dir, $file);
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    $self->{'metapass_srcdoc'}->{$file} = {};

    # Work out language and encoding, and keep them for read()
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
    $self->{'textcat_store'}->{$file} =
        { 'language' => $language, 'encoding' => $encoding };

    # Read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, $encoding, $language, \$text);

    if ($text !~ /\w/) {
        if ($self->{'verbosity'}) {
            gsprintf($outhandle, "$plugin_name: {ReadTextFile.file_has_no_text}\n",
                     $file);
        }

        my $failhandle = $self->{'failhandle'};
        print $failhandle "$file: " . $plugin_name . ": file contains no text\n";
        $self->{'num_not_processed'} ++;

        # An undefined record is how read() learns that no text was found.
        $self->{'textcat_store'}->{$file} = undef;

        return 0;
    }

    # Split the text into several smaller segments, dropping empty ones
    my $split_exp = $self->{'split_exp'};
    my @segments  = grep { $_ ne "" } split(/$split_exp/i, $text);

    if ($self->{'verbosity'}) {
        print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n";
    }

    $self->{'split_segments'}->{$file} = \@segments;

    return scalar(@segments);
}
184
185
186
# The read function turns the segments that metadata_read() found in a file
# into documents.  Each segment gets its own document object, which is
# handed to this plugin's process function and then to $processor.
#
# Arguments: $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs,
# $total_count, $gli.  All of them are forwarded to read_block(); $maxdocs
# and $total_count are not otherwise used here.
#
# Returns:
#   - whatever read_block() reported, when that is undef or 0
#   - 0 when no language/encoding record is stored for the file (no text
#     was found, so there is no point in passing the file on)
#   - otherwise the sum of the statuses returned by process, i.e. the
#     number of document objects created
#
# The per-file entries in textcat_store, split_segments and metapass_srcdoc
# are removed once they have been consumed, so these caches do not grow
# with the number of files handled.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    #check process and block exps, smart block, etc
    my ($block_status, $filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status == 0));

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # Take the stored state out of the caches.  delete (rather than
    # assigning undef) removes the key itself.
    my $le_rec   = delete $self->{'textcat_store'}->{$file};
    my $segments = delete $self->{'split_segments'}->{$file};

    if (!defined $le_rec) {
        # means no text was found;
        delete $self->{'metapass_srcdoc'}->{$file};
        return 0; # not processed but no point in passing it on
    }

    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
    print $outhandle "$self->{'plugin_type'} processing $file\n"
        if $verbosity > 1;

    my $language = $le_rec->{'language'};
    my $encoding = $le_rec->{'encoding'};

    # Process each segment in turn
    my $segment = 0;   # 1-based number of the current segment
    my $count   = 0;   # running total of the statuses returned by process
    my $id;            # "base" document ID shared by all segments of the file

    foreach my $segtext (@{ $segments || [] }) {
        $segment++;

        if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
            # metadata is attached to a srcdoc
            next;
        }

        # create a new document
        my $doc_obj = doc->new($filename, "indexed_doc");
        my $top_section = $doc_obj->get_top_section();
        $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
        my ($filemeta) = $file =~ /([^\\\/]+)$/;
        $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
        $doc_obj->add_utf8_metadata($top_section, "SourceSegment", "$segment");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
        #$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");

        # Calculate a "base" document ID, once, from the first document
        # that is actually built for this file.
        if (!defined $id) {
            $doc_obj->set_OID();
            $id = $doc_obj->get_OID();
        }

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

        # do plugin specific processing of doc_obj
        print $outhandle "segment $segment\n" if ($verbosity);
        my $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
        if (!defined $status) {
            print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
                if ($verbosity >= 2);
            next;
        }
        # If the plugin returned 0, it threw away this part
        next if $status == 0;

        $count += $status;

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # add an OID built from the base ID and the segment number
        $self->set_OID($doc_obj, $id, $segment);

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'} ++;
    }

    delete $self->{'metapass_srcdoc'}->{$file};

    # Return number of document objects produced
    return $count;
}
288
# Give a segment's document its final OID: the base ID, the letter "s",
# then the segment number.
#
# Arguments: $doc_obj (the document to update), $id (the base ID),
# $segment_number.
# Returns whatever the document's own set_OID returns.
sub set_OID {
    my ($self, $doc_obj, $id, $segment_number) = @_;

    my $segment_oid = $id . "s" . $segment_number;
    return $doc_obj->set_OID($segment_oid);
}
295
2961;
Note: See TracBrowser for help on using the repository browser.