source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 1244

Last change on this file since 1244 was 1244, checked in by sjboddie, 24 years ago

Caught up most general plugins (that's the ones in gsdlhome/perllib/plugins)
with changes to BasPlug so that they can all now use the new general plugin
options. Those I didn't do were FoxPlug (as it's not actually used anywhere
and I don't know what it does) and WebPlug (as it's kind of a work in
progress and doesn't really work anyway). All plugins will still work
(including all the collection specific ones that are laying around), some
of them just won't have access to the general options.
I also wrote a short perl script (pluginfo.pl) that prints out all the
options available to a given plugin.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.0 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28use parsargv;
29use multiread;
30use cnseg;
31use acronym;
32use strict;
33use doc;
34
35sub print_general_usage {
36 my ($plugin_name) = @_;
37
38 print STDERR "\n usage: plugin $plugin_name [options]\n\n";
39 print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
40 print STDERR " converted from these encodings and stored internally as\n";
41 print STDERR " utf8. The default input_encoding is Latin1. Accepted values\n";
42 print STDERR " are:\n";
43 print STDERR " iso_8859_1 (extended ascii)\n";
44 print STDERR " Latin1 (the same as iso-8859-1)\n";
45 print STDERR " ascii (7 bit ascii -- may be faster than Latin1 as no\n";
46 print STDERR " conversion is neccessary)\n";
47 print STDERR " gb (GB or GBK simplified Chinese)\n";
48 print STDERR " iso_8859_6 (8 bit Arabic)\n";
49 print STDERR " windows_1256 (Windows codepage 1256 (Arabic))\n";
50 print STDERR " Arabic (the same as windows_1256)\n";
51 print STDERR " utf8 (either utf8 or unicode -- automatically detected)\n";
52 print STDERR " unicode (just unicode -- doesn't currently do endian\n";
53 print STDERR " detection)\n";
54 print STDERR " -process_exp A perl regular expression to match against filenames.\n";
55 print STDERR " Matching filenames will be processed by this plugin.\n";
56 print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n";
57 print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
58 print STDERR " .htm or .html (case-insensitive).\n";
59 print STDERR " -block_exp Files matching this regular expression will be blocked from\n";
60 print STDERR " being passed to any further plugins in the list. This has no\n";
61 print STDERR " real effect other than to prevent lots of warning messages\n";
62 print STDERR " about input files you don't care about. Each plugin may or may\n";
63 print STDERR " not have a default block_exp. e.g. by default HTMLPlug blocks\n";
64 print STDERR " any files with .gif, .jpg, .jpeg, .png, .pdf, .rtf or .css\n";
65 print STDERR " file extensions.\n";
66 print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
67}
68
69# print_usage should be overridden for any sub-classes having
70# their own plugin specific options
71sub print_usage {
72 print STDERR "\nThis plugin has no plugin specific options\n\n";
73
74}
75
76sub new {
77 my $class = shift (@_);
78 my $plugin_name = shift (@_);
79
80 my $self = {};
81 my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode)\$";
82
83 # general options available to all plugins
84 if (!parsargv::parse(\@_,
85 qq^input_encoding/$encodings/Latin1^, \$self->{'input_encoding'},
86 q^process_exp/.*/^, \$self->{'process_exp'},
87 q^block_exp/.*/^, \$self->{'block_exp'},
88 q^extract_acronyms^, \$self->{'extract_acronyms'},
89 "allow_extra_options")) {
90
91 print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
92 print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
93 &print_general_usage($plugin_name);
94 die "\n";
95 }
96
97 return bless $self, $class;
98}
99
100# initialize BasPlug options
101# if init() is overridden in a sub-class, remember to call BasPlug::init()
102sub init {
103 my $self = shift (@_);
104 my ($verbosity) = @_;
105
106 # verbosity is passed through from the processor
107 $self->{'verbosity'} = $verbosity;
108
109 # set process_exp and block_exp to defaults unless they were
110 # explicitly set
111
112 if ((!$self->is_recursive()) and
113 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
114
115 $self->{'process_exp'} = $self->get_default_process_exp ();
116 if ($self->{'process_exp'} eq "") {
117 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
118 }
119 }
120
121 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
122 $self->{'block_exp'} = $self->get_default_block_exp ();
123 }
124
125 # handle input_encoding aliases
126 $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";
127 $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";
128}
129
130sub begin {
131 my $self = shift (@_);
132 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
133}
134
135sub end {
136 my ($self) = @_;
137}
138
139# this function should be overridden to return 1
140# in recursive plugins
141sub is_recursive {
142 my $self = shift (@_);
143
144 return 0;
145}
146
147sub get_default_block_exp {
148 my $self = shift (@_);
149
150 return "";
151}
152
153sub get_default_process_exp {
154 my $self = shift (@_);
155
156 return "";
157}
158
159# The BasPlug read() function. This function does all the right things
160# to make general options work for a given plugin. It calls the process()
161# function which does all the work specific to a plugin (like the old
162# read functions used to do). Most plugins should define their own
163# process() function and let this read() function keep control.
164#
165# recursive plugins (e.g. RecPlug) and specialized plugins like those
166# capable of processing many documents within a single file (e.g.
167# GMLPlug) should normally implement their own version of read()
168#
169# Return number of files processed, undef if can't process
170# Note that $base_dir might be "" and that $file might
171# include directories
172
173sub read {
174 my $self = shift (@_);
175 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
176
177 if ($self->is_recursive()) {
178 die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
179 }
180
181 my $filename = &util::filename_cat($base_dir, $file);
182 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
183 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
184 return undef;
185 }
186 my $plugin_name = ref ($self);
187 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
188
189 # create a new document
190 my $doc_obj = new doc ($file, "indexed_doc");
191
192 # read in file ($text will be in utf8)
193 my $text = "";
194 $self->read_file ($filename, \$text);
195
196 if ($text !~ /\w/) {
197 print STDERR "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
198 return 0;
199 }
200
201 # include any metadata passed in from previous plugins
202 # note that this metadata is associated with the top level section
203 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
204
205 # do plugin specific processing of doc_obj
206 return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
207
208 # do any automatic metadata extraction
209 $self->auto_extract_metadata ($doc_obj);
210
211 # add an OID
212 $doc_obj->set_OID();
213
214 # process the document
215 $processor->process($doc_obj);
216
217 return 1; # processed the file
218}
219
220# returns undef if file is rejected by the plugin
221sub process {
222 my $self = shift (@_);
223 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
224
225 die "Basplug::process function must be implemented in sub-class\n";
226
227 return undef; # never gets here
228}
229
230# uses the multiread package to read in the entire file pointed to
231# by filename and loads the resulting text into $$textref. Input text
232# may be in any of the encodings handled by multiread, output text
233# will be in utf8
234sub read_file {
235 my $self = shift (@_);
236 my ($filename, $textref) = @_;
237
238 $$textref = "";
239
240 open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
241
242 if ($self->{'input_encoding'} eq "ascii") {
243 undef $/;
244 $$textref = <FILE>;
245 $/ = "\n";
246 } else {
247 my $reader = new multiread();
248 $reader->set_handle ('BasPlug::FILE');
249 $reader->set_encoding ($self->{'input_encoding'});
250 $reader->read_file ($textref);
251
252 if ($self->{'input_encoding'} eq "gb") {
253 # segment the Chinese words
254 $$textref = &cnseg::segment($$textref);
255 }
256 }
257
258 close FILE;
259}
260
261# add any extra metadata that's been passed around from one
262# plugin to another.
263# extra_metadata uses add_utf8_metadata so it expects metadata values
264# to already be in utf8
265sub extra_metadata {
266 my $self = shift (@_);
267 my ($doc_obj, $cursection, $metadata) = @_;
268
269 foreach my $field (keys(%$metadata)) {
270 # $metadata->{$field} may be an array reference
271 if (ref ($metadata->{$field}) eq "ARRAY") {
272 map {
273 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
274 } @{$metadata->{$field}};
275 } else {
276 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
277 }
278 }
279}
280
281# extract acronyms (and hopefully other stuff soon too).
282sub auto_extract_metadata {
283 my $self = shift (@_);
284 my ($doc_obj) = @_;
285
286 if ($self->{'extract_acronyms'}) {
287 my $thissection = $doc_obj->get_top_section();
288 while (defined $thissection) {
289 my $text = $doc_obj->get_text($thissection);
290 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
291 $thissection = $doc_obj->get_next_section ($thissection);
292 }
293 }
294}
295
296sub extract_acronyms {
297 my $self = shift (@_);
298 my ($textref, $doc_obj, $thissection) = @_;
299
300 my $acro_array = &acronym::acronyms($textref);
301
302 foreach my $acro (@$acro_array) {
303
304 #do the normal acronym
305 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
306 print "found " . $acro->to_string() . "\n";
307
308 # do the KWIC (Key Word In Context) acronym
309 my @kwic = $acro->to_string_kwic();
310 foreach my $kwic (@kwic) {
311 $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC", $kwic);
312 print "found (KWIC)" . $kwic . "\n";
313 }
314 }
315}
316
3171;
Note: See TracBrowser for help on using the repository browser.