source: trunk/gsdl/perllib/plugins/SplitPlug.pm@ 2484

Last change on this file since 2484 was 2484, checked in by say1, 23 years ago

Changed SplitPlug to allow control over the OID. Changed BibTexPlug to be more permissive in the Bibtex format it accepts. Changed BibTexPlug to use the BibTex key as the OID.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.4 KB
Line 
1###########################################################################
2#
3# SplitPlug.pm - a plugin for splitting input files into segments that
4# will then be individually processed.
5#
6#
7# Copyright 2000 Gordon W. Paynter ([email protected])
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
31# SplitPlug is a plugin for splitting input files into segments that will
32# then be individually processed.
33
34# This plugin should not be called directly. Instead, if you need to
35# process input files that contain several documents, you should write a
36# plugin with a process function that will handle one of those documents
37# and have it inherit from SplitPlug. See ReferPlug for an example.
38
39
40package SplitPlug;
41
42use BasPlug;
43use util;
44
45
46# SplitPlug is a sub-class of BasPlug.
47sub BEGIN {
48 @ISA = ('BasPlug');
49}
50
51sub new {
52 my ($class) = @_;
53 $self = new BasPlug($class, @_);
54
55 if (!parsargv::parse(\@_,
56 q^split_exp/.*/^, \$self->{'split_exp'},
57 "allow_extra_options")) {
58 print STDERR "\nIncorrect options passed to $class.";
59 print STDERR "\nCheck your collect.cfg configuration file\n";
60 die "\n";
61 }
62
63 return bless $self, $class;
64}
65
66sub init {
67 my $self = shift (@_);
68 my ($verbosity, $outhandle) = @_;
69
70 $self->BasPlug::init($verbosity, $outhandle);
71
72 if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
73
74 $self->{'process_exp'} = $self->get_default_process_exp ();
75 if ($self->{'process_exp'} eq "") {
76 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
77 }
78 }
79
80
81 # set split_exp to default unless explicitly set
82 if (!$self->{'split_exp'}) {
83 $self->{'split_exp'} = $self->get_default_split_exp ();
84 }
85
86}
87
88# This plugin recurs over the segments it finds
89sub is_recursive {
90 return 1;
91}
92
93# By default, we split the input text at blank lines
94sub get_default_split_exp {
95 return q^\n\s*\n^;
96}
97
98
99# The read function opens a file and splits it into parts.
100# Each part is sent to the process function
101#
102# Returns: Number of document objects created (or undef if it fails)
103
104sub read {
105 my $self = shift (@_);
106 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
107 my $outhandle = $self->{'outhandle'};
108 my $verbosity = $self->{'verbosity'};
109
110 # Figure out the exact filename of this file (and maybe block it)
111 my $filename = &util::filename_cat($base_dir, $file);
112 my $block_exp = $self->{'block_exp'};
113 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
114 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
115 return undef;
116 }
117 my $plugin_name = ref ($self);
118 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
119
120 my ($language, $encoding);
121 if ($self->{'input_encoding'} eq "auto") {
122 # use textcat to automatically work out the input encoding and language
123 ($language, $encoding) = $self->get_language_encoding ($filename);
124
125 } elsif ($self->{'extract_language'}) {
126 # use textcat to get language metadata
127 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
128 $encoding = $self->{'input_encoding'};
129
130 if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
131 print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
132 print $outhandle "appears to be encoded as $extracted_encoding.\n";
133 }
134
135 } else {
136 $language = $self->{'default_language'};
137 $encoding = $self->{'input_encoding'};
138 }
139
140 # Read in file ($text will be in utf8)
141 my $text = "";
142 $self->read_file ($filename, $encoding, \$text);
143
144 if ($text !~ /\w/) {
145 my $outhandle = $self->{'outhandle'};
146 print $outhandle "$plugin_name: ERROR: $file contains no text\n"
147 if $self->{'verbosity'};
148 return 0;
149 }
150
151
152 # Split the text into several smaller segments
153 my $split_exp = $self->{'split_exp'};
154 my @segments = split(/$split_exp/, $text);
155 print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
156 if $self->{'verbosity'};
157
158 # Process each segment in turn
159 my ($count, $segment, $segtext, $status, $id);
160 $segment = 0;
161 $count = 0;
162 foreach $segtext (@segments) {
163 $segment++;
164
165 # create a new document
166 my $doc_obj = new doc ($filename, "indexed_doc");
167 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
168 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
169 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
170
171 # Calculate a "base" document ID.
172 if (!defined $id) {
173 $doc_obj->set_OID();
174 $id = $doc_obj->get_OID();
175 }
176
177 # include any metadata passed in from previous plugins
178 # note that this metadata is associated with the top level section
179 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
180
181 # do plugin specific processing of doc_obj
182 print $outhandle "segment $segment - ";
183 $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
184 if (!defined $status) {
185 print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
186 if ($verbosity >= 2);
187 next;
188 }
189 $count += $status;
190
191 # do any automatic metadata extraction
192 $self->auto_extract_metadata ($doc_obj);
193
194 # add an OID
195 $self->set_OID($doc_obj, $id, $segment);
196
197
198 # process the document
199 $processor->process($doc_obj);
200 }
201
202 # Return number of document objects produced
203 return $count;
204}
205
206sub set_OID {
207 my $self = shift (@_);
208 my ($doc_obj, $id, $segment_number) = @_;
209
210 $doc_obj->set_OID($id . "s" . $segment);
211
212}
213
2141;
Note: See TracBrowser for help on using the repository browser.