source: main/trunk/greenstone2/perllib/plugins/SplitTextFile.pm@ 25742

Last change on this file since 25742 was 25742, checked in by kjdon, 12 years ago

change to use can_process_this_file instead of metadata_read to test whether we can process this file or not. Also added a few segment ouptput messages for gli

  • Property svn:keywords set to Author Date Id Revision
File size: 9.5 KB
RevLine 
[1676]1###########################################################################
2#
[15871]3# SplitTextFile.pm - a plugin for splitting input files into segments that
[1676]4# will then be individually processed.
5#
6#
7# Copyright 2000 Gordon W. Paynter ([email protected])
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
[15871]31# SplitTextFile is a plugin for splitting input files into segments that will
[1676]32# then be individually processed.
33
34# This plugin should not be called directly. Instead, if you need to
35# process input files that contain several documents, you should write a
36# plugin with a process function that will handle one of those documents
[15871]37# and have it inherit from SplitTextFile. See ReferPlug for an example.
[1676]38
39
[15871]40package SplitTextFile;
[1676]41
[15871]42use ReadTextFile;
[7830]43use gsprintf 'gsprintf';
[1676]44use util;
45
[10254]46use strict;
47no strict 'refs'; # allow filehandles to be variables and viceversa
48
[16700]49# SplitTextFile is a sub-class of ReadTextFile
[8716]50sub BEGIN {
[15871]51 @SplitTextFile::ISA = ('ReadTextFile');
[8716]52}
[1676]53
54
[4744]55my $arguments =
56 [ { 'name' => "split_exp",
[15871]57 'desc' => "{SplitTextFile.split_exp}",
[6408]58 'type' => "regexp",
[10218]59 #'deft' => &get_default_split_exp(),
60 'deft' => "",
[4873]61 'reqd' => "no" } ];
[4744]62
[15871]63my $options = { 'name' => "SplitTextFile",
64 'desc' => "{SplitTextFile.desc}",
[6408]65 'abstract' => "yes",
[4744]66 'inherits' => "yes",
67 'args' => $arguments };
[3540]68
69
[1676]70sub new {
[10218]71 my ($class) = shift (@_);
72 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
73 push(@$pluginlist, $class);
[1676]74
[15871]75 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
76 push(@{$hashArgOptLists->{"OptList"}},$options);
[1676]77
[15871]78 my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
[10218]79
[9357]80 $self->{'textcat_store'} = {};
81 $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
[1676]82 return bless $self, $class;
83}
84
85sub init {
86 my $self = shift (@_);
[3094]87 my ($verbosity, $outhandle, $failhandle) = @_;
[1676]88
[15871]89 $self->ReadTextFile::init($verbosity, $outhandle, $failhandle);
[1676]90
[15871]91 # why is this is init and not in new??
[2007]92 if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
93
94 $self->{'process_exp'} = $self->get_default_process_exp ();
95 if ($self->{'process_exp'} eq "") {
[11090]96 warn ref($self) . " Warning: plugin has no process_exp\n";
[2007]97 }
98 }
99
100
[1676]101 # set split_exp to default unless explicitly set
102 if (!$self->{'split_exp'}) {
103 $self->{'split_exp'} = $self->get_default_split_exp ();
104 }
105
106}
107
108# This plugin recurs over the segments it finds
109sub is_recursive {
110 return 1;
111}
112
113# By default, we split the input text at blank lines
114sub get_default_split_exp {
115 return q^\n\s*\n^;
116}
117
[9357]118sub metadata_read {
119 my $self = shift (@_);
[19493]120 my ($pluginfo, $base_dir, $file, $block_hash,
121 $extrametakeys, $extrametadata, $extrametafile,
[23212]122 $processor, $gli, $aux) = @_;
[25742]123
124 # can we process this file??
125 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
126 return undef unless $self->can_process_this_file($filename_full_path);
[1676]127
[25742]128 my $outhandle = $self->{'outhandle'};
129 my $filename = &util::filename_cat($base_dir, $file);
[9357]130
131 my $plugin_name = ref ($self);
132 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
133
134 $self->{'metapass_srcdoc'}->{$file} = {};
135
136 # Do encoding stuff
137 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
138 my $le_rec = { 'language' => $language, 'encoding' => $encoding };
139 $self->{'textcat_store'}->{$file} = $le_rec;
140
141 # Read in file ($text will be in utf8)
142 my $text = "";
143 $self->read_file ($filename, $encoding, $language, \$text);
144
[13197]145
[9357]146 if ($text !~ /\w/) {
[15871]147 gsprintf($outhandle, "$plugin_name: {ReadTextFile.file_has_no_text}\n",
[9357]148 $file)
149 if $self->{'verbosity'};
150
151 my $failhandle = $self->{'failhandle'};
152 print $failhandle "$file: " . ref($self) . ": file contains no text\n";
153 $self->{'num_not_processed'} ++;
154
155 $self->{'textcat_store'}->{$file} = undef;
156
157 return 0;
158 }
159
160
161 # Split the text into several smaller segments
162 my $split_exp = $self->{'split_exp'};
[13197]163 my @tmp = split(/$split_exp/i, $text);
164 my @segments =();
165 ## get rid of empty segments
166 foreach my $seg (@tmp){
167 if ($seg ne ""){
168 push @segments, $seg;
169 }
170 }
171
[15871]172 print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n"
[9357]173 if $self->{'verbosity'};
174
[9493]175 $self->{'split_segments'}->{$file} = \@segments;
[9357]176
[25742]177 return scalar(@segments);
[9357]178}
179
180
181
[1676]182# The read function opens a file and splits it into parts.
183# Each part is sent to the process function
184#
185# Returns: Number of document objects created (or undef if it fails)
186
187sub read {
188 my $self = shift (@_);
[16392]189 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[1676]190 my $outhandle = $self->{'outhandle'};
191 my $verbosity = $self->{'verbosity'};
192
[16392]193 # can we process this file??
194 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
195 return undef unless $self->can_process_this_file($filename_full_path);
[11090]196
[1676]197 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
[1894]198
[9357]199 my $le_rec = $self->{'textcat_store'}->{$file};
200 if (!defined $le_rec) {
201 # means no text was found;
202 return 0; # not processed but no point in passing it on
203 }
[2845]204
[16104]205 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
206 print $outhandle "$self->{'plugin_type'} processing $file\n"
207 if $self->{'verbosity'} > 1;
208
[9357]209 my $language = $le_rec->{'language'};
210 my $encoding = $le_rec->{'encoding'};
211 $self->{'textcat_store'}->{$file} = undef;
[1676]212
[9493]213 my $segments = $self->{'split_segments'}->{$file};
214 $self->{'split_segments'}->{$file} = undef;
[2845]215
[1676]216 # Process each segment in turn
217 my ($count, $segment, $segtext, $status, $id);
218 $segment = 0;
219 $count = 0;
[9357]220 foreach $segtext (@$segments) {
[13197]221 $segment++;
[2845]222
[9357]223 if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
224 # metadata is attached to a srcdoc
225 next;
226 }
227
[1676]228 # create a new document
[18327]229 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
[1894]230 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
231 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
[23349]232
[2845]233 my ($filemeta) = $file =~ /([^\\\/]+)$/;
[23349]234 my $plugin_filename_encoding = $self->{'filename_encoding'};
235 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
[23352]236 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
[23349]237
[6137]238 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
[2845]239 if ($self->{'cover_image'}) {
[16392]240 $self->associate_cover_image($doc_obj, $filename_full_path);
[2845]241 }
[7508]242 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[8121]243 #$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");
244
[1676]245 # Calculate a "base" document ID.
246 if (!defined $id) {
[17026]247 $id = $self->get_base_OID($doc_obj);
[1676]248 }
249
250 # include any metadata passed in from previous plugins
251 # note that this metadata is associated with the top level section
252 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
253
254 # do plugin specific processing of doc_obj
[16104]255 print $outhandle "segment $segment\n" if ($self->{'verbosity'});
[25742]256 print STDERR "<Processing s='$segment' n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
[11335]257 $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
[1676]258 if (!defined $status) {
[25742]259 print $outhandle "WARNING: no plugin could process segment $segment of $file\n"
[1676]260 if ($verbosity >= 2);
[25742]261 print STDERR "<ProcessingError s='$segment' n='$file'>\n" if $gli;
[1676]262 next;
263 }
[3537]264 # If the plugin returned 0, it threw away this part
265 if ($status == 0) {
266 next;
267 }
[1676]268 $count += $status;
269
270 # do any automatic metadata extraction
271 $self->auto_extract_metadata ($doc_obj);
272
273 # add an OID
[17026]274 $self->add_OID($doc_obj, $id, $segment);
[2484]275
[1676]276 # process the document
277 $processor->process($doc_obj);
[2845]278
279 $self->{'num_processed'} ++;
[1676]280 }
281
[9357]282 delete $self->{'metapass_srcdoc'}->{$file};
283
[1676]284 # Return number of document objects produced
285 return $count;
286}
287
[17026]288sub get_base_OID {
289 my $self = shift(@_);
290 my ($doc_obj) = @_;
291
292 $self->SUPER::add_OID($doc_obj);
293 return $doc_obj->get_OID();
294}
295
296sub add_OID {
[2484]297 my $self = shift (@_);
[17026]298 my ($doc_obj, $id, $segment) = @_;
299
300 my $full_id = $id . "s" . $segment;
301 if ($self->{'OIDtype'} eq "assigned") {
302 my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
303 if (defined $identifier && $identifier ne "") {
304 $full_id = $identifier;
[19617]305 $full_id = &util::tidy_up_oid($full_id);
[17026]306 }
307 }
308 $doc_obj->set_OID($full_id);
[2484]309}
310
[17026]311
[1676]3121;
Note: See TracBrowser for help on using the repository browser.