source: gsdl/trunk/perllib/plugins/SplitTextFile.pm@ 16104

Last change on this file since 16104 was 16104, checked in by kjdon, 16 years ago

tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process

  • Property svn:keywords set to Author Date Id Revision
File size: 8.9 KB
RevLine 
[1676]1###########################################################################
2#
[15871]3# SplitTextFile.pm - a plugin for splitting input files into segments that
[1676]4# will then be individually processed.
5#
6#
7# Copyright 2000 Gordon W. Paynter ([email protected])
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
[15871]31# SplitTextFile is a plugin for splitting input files into segments that will
[1676]32# then be individually processed.
33
34# This plugin should not be called directly. Instead, if you need to
35# process input files that contain several documents, you should write a
36# plugin with a process function that will handle one of those documents
[15871]37# and have it inherit from SplitTextFile. See ReferPlug for an example.
[1676]38
39
[15871]40package SplitTextFile;
[1676]41
[15871]42use ReadTextFile;
[7830]43use gsprintf 'gsprintf';
[1676]44use util;
45
[10254]46use strict;
47no strict 'refs'; # allow filehandles to be variables and viceversa
48
[15871]49# SplitTextFile is a sub-class of BasPlug.
[8716]50sub BEGIN {
[15871]51 @SplitTextFile::ISA = ('ReadTextFile');
[8716]52}
[1676]53
54
[4744]55my $arguments =
56 [ { 'name' => "split_exp",
[15871]57 'desc' => "{SplitTextFile.split_exp}",
[6408]58 'type' => "regexp",
[10218]59 #'deft' => &get_default_split_exp(),
60 'deft' => "",
[4873]61 'reqd' => "no" } ];
[4744]62
[15871]63my $options = { 'name' => "SplitTextFile",
64 'desc' => "{SplitTextFile.desc}",
[6408]65 'abstract' => "yes",
[4744]66 'inherits' => "yes",
67 'args' => $arguments };
[3540]68
69
[1676]70sub new {
[10218]71 my ($class) = shift (@_);
72 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
73 push(@$pluginlist, $class);
[1676]74
[15871]75 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
76 push(@{$hashArgOptLists->{"OptList"}},$options);
[1676]77
[15871]78 my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
[10218]79
[9357]80 $self->{'textcat_store'} = {};
81 $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
[1676]82 return bless $self, $class;
83}
84
85sub init {
86 my $self = shift (@_);
[3094]87 my ($verbosity, $outhandle, $failhandle) = @_;
[1676]88
[15871]89 $self->ReadTextFile::init($verbosity, $outhandle, $failhandle);
[1676]90
[15871]91 # why is this is init and not in new??
[2007]92 if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
93
94 $self->{'process_exp'} = $self->get_default_process_exp ();
95 if ($self->{'process_exp'} eq "") {
[11090]96 warn ref($self) . " Warning: plugin has no process_exp\n";
[2007]97 }
98 }
99
100
[1676]101 # set split_exp to default unless explicitly set
102 if (!$self->{'split_exp'}) {
103 $self->{'split_exp'} = $self->get_default_split_exp ();
104 }
105
106}
107
108# This plugin recurs over the segments it finds
109sub is_recursive {
110 return 1;
111}
112
113# By default, we split the input text at blank lines
114sub get_default_split_exp {
115 return q^\n\s*\n^;
116}
117
[9357]118sub metadata_read {
119 my $self = shift (@_);
120 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
[1676]121
[15871]122 # returns 1 if matches process_exp, and has done blocking in the meantime
[9357]123 my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,
124 $metadata, $extrametakeys,
125 $extrametadata, $processor,
126 $maxdocs, $gli);
[10254]127 my $split_matched = undef;
[9357]128
129 if ($matched) {
130
131 my $outhandle = $self->{'outhandle'};
132 my $filename = &util::filename_cat($base_dir, $file);
133
134 my $plugin_name = ref ($self);
135 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
136
137 $self->{'metapass_srcdoc'}->{$file} = {};
138
139 # Do encoding stuff
140 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
141 my $le_rec = { 'language' => $language, 'encoding' => $encoding };
142 $self->{'textcat_store'}->{$file} = $le_rec;
143
144 # Read in file ($text will be in utf8)
145 my $text = "";
146 $self->read_file ($filename, $encoding, $language, \$text);
147
[13197]148
[9357]149 if ($text !~ /\w/) {
[15871]150 gsprintf($outhandle, "$plugin_name: {ReadTextFile.file_has_no_text}\n",
[9357]151 $file)
152 if $self->{'verbosity'};
153
154 my $failhandle = $self->{'failhandle'};
155 print $failhandle "$file: " . ref($self) . ": file contains no text\n";
156 $self->{'num_not_processed'} ++;
157
158 $self->{'textcat_store'}->{$file} = undef;
159
160 return 0;
161 }
162
163
164 # Split the text into several smaller segments
165 my $split_exp = $self->{'split_exp'};
[13197]166 my @tmp = split(/$split_exp/i, $text);
167 my @segments =();
168 ## get rid of empty segments
169 foreach my $seg (@tmp){
170 if ($seg ne ""){
171 push @segments, $seg;
172 }
173 }
174
[15871]175 print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n"
[9357]176 if $self->{'verbosity'};
177
[9493]178 $self->{'split_segments'}->{$file} = \@segments;
[9357]179 $split_matched = scalar(@segments);
180 }
181
182 return $split_matched;
183}
184
185
186
[1676]187# The read function opens a file and splits it into parts.
188# Each part is sent to the process function
189#
190# Returns: Number of document objects created (or undef if it fails)
191
192sub read {
193 my $self = shift (@_);
[9853]194 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[1676]195 my $outhandle = $self->{'outhandle'};
196 my $verbosity = $self->{'verbosity'};
197
[11090]198 #check process and block exps, smart block, etc
199 my ($block_status,$filename) = $self->read_block(@_);
200 return $block_status if ((!defined $block_status) || ($block_status==0));
201
[1676]202 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
[1894]203
[9357]204 my $le_rec = $self->{'textcat_store'}->{$file};
205 if (!defined $le_rec) {
206 # means no text was found;
207 return 0; # not processed but no point in passing it on
208 }
[2845]209
[16104]210 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
211 print $outhandle "$self->{'plugin_type'} processing $file\n"
212 if $self->{'verbosity'} > 1;
213
[9357]214 my $language = $le_rec->{'language'};
215 my $encoding = $le_rec->{'encoding'};
216 $self->{'textcat_store'}->{$file} = undef;
[1676]217
[9493]218 my $segments = $self->{'split_segments'}->{$file};
219 $self->{'split_segments'}->{$file} = undef;
[2845]220
[1676]221 # Process each segment in turn
222 my ($count, $segment, $segtext, $status, $id);
223 $segment = 0;
224 $count = 0;
[9357]225 foreach $segtext (@$segments) {
[13197]226 $segment++;
[2845]227
[9357]228 if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
229 # metadata is attached to a srcdoc
230 next;
231 }
232
[1676]233 # create a new document
234 my $doc_obj = new doc ($filename, "indexed_doc");
[12270]235 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
[1894]236 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
237 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
[2845]238 my ($filemeta) = $file =~ /([^\\\/]+)$/;
[15871]239 $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
[6137]240 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
[2845]241 if ($self->{'cover_image'}) {
[11090]242 $self->associate_cover_image($doc_obj, $filename);
[2845]243 }
[7508]244 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[8121]245 #$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");
246
[1676]247 # Calculate a "base" document ID.
248 if (!defined $id) {
249 $doc_obj->set_OID();
250 $id = $doc_obj->get_OID();
251 }
252
253 # include any metadata passed in from previous plugins
254 # note that this metadata is associated with the top level section
255 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
256
257 # do plugin specific processing of doc_obj
[16104]258 print $outhandle "segment $segment\n" if ($self->{'verbosity'});
[11335]259 $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
[1676]260 if (!defined $status) {
261 print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
262 if ($verbosity >= 2);
263 next;
264 }
[3537]265 # If the plugin returned 0, it threw away this part
266 if ($status == 0) {
267 next;
268 }
[1676]269 $count += $status;
270
271 # do any automatic metadata extraction
272 $self->auto_extract_metadata ($doc_obj);
273
274 # add an OID
[2484]275 $self->set_OID($doc_obj, $id, $segment);
276
[1676]277 # process the document
278 $processor->process($doc_obj);
[2845]279
280 $self->{'num_processed'} ++;
[1676]281 }
282
[9357]283 delete $self->{'metapass_srcdoc'}->{$file};
284
[1676]285 # Return number of document objects produced
286 return $count;
287}
288
[2484]289sub set_OID {
290 my $self = shift (@_);
291 my ($doc_obj, $id, $segment_number) = @_;
292
[2492]293 $doc_obj->set_OID($id . "s" . $segment_number);
[2484]294}
295
[1676]2961;
Note: See TracBrowser for help on using the repository browser.