source: trunk/gsdl/perllib/plugins/SplitPlug.pm@ 7243

Last change on this file since 7243 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the catagory regexp for any regular expression (which can then be hidden under lower detail modes in GLI)

  • Property svn:keywords set to Author Date Id Revision
File size: 7.0 KB
Line 
1###########################################################################
2#
3# SplitPlug.pm - a plugin for splitting input files into segments that
4# will then be individually processed.
5#
6#
7# Copyright 2000 Gordon W. Paynter ([email protected])
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
31# SplitPlug is a plugin for splitting input files into segments that will
32# then be individually processed.
33
34# This plugin should not be called directly. Instead, if you need to
35# process input files that contain several documents, you should write a
36# plugin with a process function that will handle one of those documents
37# and have it inherit from SplitPlug. See ReferPlug for an example.
38
39
40package SplitPlug;
41
42use BasPlug;
43use util;
44
45
46# SplitPlug is a sub-class of BasPlug.
47sub BEGIN {
48 @ISA = ('BasPlug');
49}
50
51my $arguments =
52 [ { 'name' => "split_exp",
53 'desc' => "{SplitPlug.split_exp}",
54 'type' => "regexp",
55 'deft' => &get_default_split_exp(),
56 'reqd' => "no" } ];
57
58my $options = { 'name' => "SplitPlug",
59 'desc' => "{SplitPlug.desc}",
60 'abstract' => "yes",
61 'inherits' => "yes",
62 'args' => $arguments };
63
64
65sub new {
66 my ($class) = @_;
67 $self = new BasPlug($class, @_);
68
69 $self->{'plugin_type'} = "SplitPlug";
70
71 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
72 my $option_list = $self->{'option_list'};
73 push( @{$option_list}, $options );
74
75 if (!parsargv::parse(\@_,
76 q^split_exp/.*/^, \$self->{'split_exp'},
77 "allow_extra_options")) {
78 print STDERR "\nIncorrect options passed to $class.";
79 print STDERR "\nCheck your collect.cfg configuration file\n";
80 die "\n";
81 }
82
83 return bless $self, $class;
84}
85
86sub init {
87 my $self = shift (@_);
88 my ($verbosity, $outhandle, $failhandle) = @_;
89
90 $self->BasPlug::init($verbosity, $outhandle, $failhandle);
91
92 if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
93
94 $self->{'process_exp'} = $self->get_default_process_exp ();
95 if ($self->{'process_exp'} eq "") {
96 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
97 }
98 }
99
100
101 # set split_exp to default unless explicitly set
102 if (!$self->{'split_exp'}) {
103 $self->{'split_exp'} = $self->get_default_split_exp ();
104 }
105
106}
107
108# This plugin recurs over the segments it finds
109sub is_recursive {
110 return 1;
111}
112
113# By default, we split the input text at blank lines
114sub get_default_split_exp {
115 return q^\n\s*\n^;
116}
117
118
119# The read function opens a file and splits it into parts.
120# Each part is sent to the process function
121#
122# Returns: Number of document objects created (or undef if it fails)
123
124sub read {
125 my $self = shift (@_);
126 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
127 my $outhandle = $self->{'outhandle'};
128 my $verbosity = $self->{'verbosity'};
129
130 # Figure out the exact filename of this file (and maybe block it)
131 my $filename = &util::filename_cat($base_dir, $file);
132 my $block_exp = $self->{'block_exp'};
133 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
134 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
135 return undef;
136 }
137 my $plugin_name = ref ($self);
138 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
139
140 # Do encoding stuff
141 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
142
143 # Read in file ($text will be in utf8)
144 my $text = "";
145 $self->read_file ($filename, $encoding, $language, \$text);
146
147 if ($text !~ /\w/) {
148 my $outhandle = $self->{'outhandle'};
149 print $outhandle "$plugin_name: ERROR: $file contains no text\n"
150 if $self->{'verbosity'};
151
152 my $failhandle = $self->{'failhandle'};
153 print $failhandle "$file: " . ref($self) . ": file contains no text\n";
154 $self->{'num_not_processed'} ++;
155
156 return 0;
157 }
158
159
160 # Split the text into several smaller segments
161 my $split_exp = $self->{'split_exp'};
162 my @segments = split(/$split_exp/, $text);
163 print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
164 if $self->{'verbosity'};
165
166 # Process each segment in turn
167 my ($count, $segment, $segtext, $status, $id);
168 $segment = 0;
169 $count = 0;
170 foreach $segtext (@segments) {
171 $segment++;
172
173 # create a new document
174 my $doc_obj = new doc ($filename, "indexed_doc");
175 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
176 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
177 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
178 my ($filemeta) = $file =~ /([^\\\/]+)$/;
179 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
180 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
181 if ($self->{'cover_image'}) {
182 $self->associate_cover_image($doc_obj, $filename);
183 }
184 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "$self->{'plugin_type'}", "1");
185 # Calculate a "base" document ID.
186 if (!defined $id) {
187 $doc_obj->set_OID();
188 $id = $doc_obj->get_OID();
189 }
190
191 # include any metadata passed in from previous plugins
192 # note that this metadata is associated with the top level section
193 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
194
195 # do plugin specific processing of doc_obj
196 print $outhandle "segment $segment - " if ($self->{'verbosity'});
197 $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
198 if (!defined $status) {
199 print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
200 if ($verbosity >= 2);
201 next;
202 }
203 # If the plugin returned 0, it threw away this part
204 if ($status == 0) {
205 next;
206 }
207 $count += $status;
208
209 # do any automatic metadata extraction
210 $self->auto_extract_metadata ($doc_obj);
211
212 # add an OID
213 $self->set_OID($doc_obj, $id, $segment);
214
215 # process the document
216 $processor->process($doc_obj);
217
218 $self->{'num_processed'} ++;
219 }
220
221 # Return number of document objects produced
222 return $count;
223}
224
225sub set_OID {
226 my $self = shift (@_);
227 my ($doc_obj, $id, $segment_number) = @_;
228
229 $doc_obj->set_OID($id . "s" . $segment_number);
230}
231
2321;
Note: See TracBrowser for help on using the repository browser.