source: main/tags/2.41/gsdl/perllib/plugins/SplitPlug.pm@ 29031

Last change on this file since 29031 was 6137, checked in by kjdon, 21 years ago

Added a new metadata field, SourceSegment, which is set when the source document has been split into parts; it indicates which part of the original Source this document is.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.0 KB
Line 
1###########################################################################
2#
3# SplitPlug.pm - a plugin for splitting input files into segments that
4# will then be individually processed.
5#
6#
7# Copyright 2000 Gordon W. Paynter ([email protected])
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
31# SplitPlug is a plugin for splitting input files into segments that will
32# then be individually processed.
33
34# This plugin should not be called directly. Instead, if you need to
35# process input files that contain several documents, you should write a
36# plugin with a process function that will handle one of those documents
37# and have it inherit from SplitPlug. See ReferPlug for an example.
38
39
40package SplitPlug;
41
42use BasPlug;
43use util;
44
45
# Inheritance: SplitPlug derives from BasPlug. The parent list is set up at
# compile time so it is in place before any other code in the file runs.
BEGIN {
    @ISA = qw(BasPlug);
}
50
# Description of the single argument this plugin adds: split_exp, the
# expression used to cut an input file into segments. Its default comes
# from get_default_split_exp().
my $split_exp_argument = {
    'name' => "split_exp",
    'desc' => "{SplitPlug.split_exp}",
    'type' => "string",
    'deft' => get_default_split_exp(),
    'reqd' => "no",
};

my $arguments = [ $split_exp_argument ];

# Option record for this plugin; new() appends it to the option list
# of the object it builds.
my $options = {
    'name'     => "SplitPlug",
    'desc'     => "{SplitPlug.desc}",
    'inherits' => "yes",
    'args'     => $arguments,
};
62
63
# Constructor.
#
# Builds the underlying BasPlug object, appends SplitPlug's option record
# to that object's option list, and parses the split_exp option out of the
# argument list.
#
# Arguments: the class name, followed by the plugin's options.
# Returns:   the plugin object, blessed into $class.
# Dies:      if parsargv::parse rejects the options.
sub new {
    my ($class) = @_;

    # $self must be lexical: assigning to an undeclared $self writes to the
    # package variable $SplitPlug::self, which every constructor call would
    # then share and overwrite.
    #
    # NOTE(review): @_ still begins with $class at this point, so the class
    # name is handed to BasPlug::new twice. The argument list is passed
    # through exactly as before; confirm against BasPlug::new before
    # changing it.
    my $self = BasPlug->new($class, @_);

    $self->{'plugin_type'} = "SplitPlug";

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push( @{$option_list}, $options );

    # Options other than split_exp are tolerated ("allow_extra_options").
    if (!parsargv::parse(\@_,
                         q^split_exp/.*/^, \$self->{'split_exp'},
                         "allow_extra_options")) {
        print STDERR "\nIncorrect options passed to $class.";
        print STDERR "\nCheck your collect.cfg configuration file\n";
        die "\n";
    }

    return bless $self, $class;
}
84
# Initialise the plugin after construction.
#
# Delegates to BasPlug::init, then fills in process_exp and split_exp from
# the plugin's defaults when they have not been set.
#
# Arguments: $verbosity, $outhandle, $failhandle (passed straight through
#            to BasPlug::init).
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle, $failhandle);

    # Fall back to the default process_exp when none was supplied, and
    # warn if even the default is empty.
    my $process_exp = $self->{'process_exp'};
    unless (defined $process_exp && $process_exp ne "") {
        $self->{'process_exp'} = $self->get_default_process_exp ();
        warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n"
            if $self->{'process_exp'} eq "";
    }

    # set split_exp to default unless explicitly set
    if (!$self->{'split_exp'}) {
        $self->{'split_exp'} = $self->get_default_split_exp ();
    }
}
106
# SplitPlug passes every segment it finds on for further processing, so it
# always reports itself as recursive.
sub is_recursive {
    1;
}
111
# Default split expression: a newline, optional whitespace, and another
# newline - i.e. the input is cut at blank lines. The value is the pattern
# text itself (backslash sequences are not interpreted here).
sub get_default_split_exp {
    my $blank_line = '\n\s*\n';
    return $blank_line;
}
116
117
# The read function opens a file and splits it into parts.
# Each part is sent to the process function
#
# Arguments:
#   $pluginfo, $base_dir, $file, $metadata - passed on to process() for
#                each segment; $base_dir and $file also give the file path
#   $processor - receives each finished document via $processor->process()
#                and supplies the OIDtype
#   $maxdocs   - accepted for interface compatibility; not used here
#
# Returns: Number of document objects created (or undef if it fails)
#   undef - the file does not match process_exp or is not a plain file
#   0     - the file matches block_exp, or it contains no word characters
#   n     - the sum of the values returned by process() for the segments
#           that were kept

sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    my $outhandle  = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};
    my $verbosity  = $self->{'verbosity'};

    # Figure out the exact filename of this file (and maybe block it)
    my $filename = &util::filename_cat($base_dir, $file);
    my $block_exp = $self->{'block_exp'};
    # An undefined or empty block_exp blocks nothing.
    return 0 if defined $block_exp && $block_exp ne "" && $filename =~ /$block_exp/;
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    my $plugin_name = ref ($self);
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    # Read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, $encoding, $language, \$text);

    if ($text !~ /\w/) {
        print $outhandle "$plugin_name: ERROR: $file contains no text\n"
            if $verbosity;

        print $failhandle "$file: " . $plugin_name . ": file contains no text\n";
        $self->{'num_not_processed'} ++;

        return 0;
    }

    # Split the text into several smaller segments
    my $split_exp = $self->{'split_exp'};
    my @segments = split(/$split_exp/, $text);
    print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
        if $verbosity;

    # The Source metadata value is the last path component of $file; it is
    # the same for every segment, so extract it once.
    my ($filemeta) = $file =~ /([^\\\/]+)$/;

    # Process each segment in turn
    my $count   = 0;   # running total of process() return values
    my $segment = 0;   # 1-based position of the current segment in the file
    my $id;            # "base" document ID, taken from the first segment
    foreach my $segtext (@segments) {
        $segment++;

        # create a new document
        my $doc_obj = doc->new($filename, "indexed_doc");
        $doc_obj->set_OIDtype ($processor->{'OIDtype'});
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "$self->{'plugin_type'}", "1");

        # Calculate a "base" document ID.
        if (!defined $id) {
            $doc_obj->set_OID();
            $id = $doc_obj->get_OID();
        }

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

        # do plugin specific processing of doc_obj
        print $outhandle "segment $segment - " if $verbosity;
        my $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
        if (!defined $status) {
            print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
                if ($verbosity >= 2);
            next;
        }
        # If the plugin returned 0, it threw away this part
        next if $status == 0;
        $count += $status;

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # add an OID (base ID plus segment number; see set_OID)
        $self->set_OID($doc_obj, $id, $segment);

        # process the document
        $processor->process($doc_obj);

        $self->{'num_processed'} ++;
    }

    # Return number of document objects produced
    return $count;
}
223
# Give a segment's document its final OID: the base ID, the letter "s",
# then the segment number (for example base "HASH01" and segment 3 give
# "HASH01s3").
#
# Arguments: $doc_obj        - document whose OID is set
#            $id             - base document ID
#            $segment_number - number of the segment within the source file
sub set_OID {
    my ($self, $doc_obj, $id, $segment_number) = @_;

    my $segment_oid = join("s", $id, $segment_number);
    $doc_obj->set_OID($segment_oid);
}
230
2311;
Note: See TracBrowser for help on using the repository browser.