source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 1379

Last change on this file since 1379 was 1379, checked in by paynter, 24 years ago

Fixed bug that gave gsdlsourcedocument metadata relative path instead
of absolute, and then didn't test if it existed, causing NULL hash values.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.9 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28use parsargv;
29use multiread;
30use cnseg;
31use acronym;
32use textcat;
33use strict;
34use doc;
35use diagnostics;
36
37sub print_general_usage {
38 my ($plugin_name) = @_;
39
40 print STDERR "\n usage: plugin $plugin_name [options]\n\n";
41 print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
42 print STDERR " converted from these encodings and stored internally as\n";
43 print STDERR " utf8. The default input_encoding is ascii. Accepted values\n";
44 print STDERR " are:\n";
45 print STDERR " iso_8859_1 (extended ascii)\n";
46 print STDERR " Latin1 (the same as iso-8859-1)\n";
47 print STDERR " ascii (7 bit ascii -- may be faster than Latin1 as no\n";
48 print STDERR " conversion is neccessary)\n";
49 print STDERR " gb (GB or GBK simplified Chinese)\n";
50 print STDERR " iso_8859_6 (8 bit Arabic)\n";
51 print STDERR " windows_1256 (Windows codepage 1256 (Arabic))\n";
52 print STDERR " Arabic (the same as windows_1256)\n";
53 print STDERR " utf8 (either utf8 or unicode -- automatically detected)\n";
54 print STDERR " unicode (just unicode -- doesn't currently do endian\n";
55 print STDERR " detection)\n";
56 print STDERR " -process_exp A perl regular expression to match against filenames.\n";
57 print STDERR " Matching filenames will be processed by this plugin.\n";
58 print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n";
59 print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
60 print STDERR " .htm or .html (case-insensitive).\n";
61 print STDERR " -block_exp Files matching this regular expression will be blocked from\n";
62 print STDERR " being passed to any further plugins in the list. This has no\n";
63 print STDERR " real effect other than to prevent lots of warning messages\n";
64 print STDERR " about input files you don't care about. Each plugin may or may\n";
65 print STDERR " not have a default block_exp. e.g. by default HTMLPlug blocks\n";
66 print STDERR " any files with .gif, .jpg, .jpeg, .png, .pdf, .rtf or .css\n";
67 print STDERR " file extensions.\n";
68 print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
69 print STDERR " -extract_langauge Identify the language of the text and set as metadata\n\n";
70}
71
72# print_usage should be overridden for any sub-classes having
73# their own plugin specific options
74sub print_usage {
75 print STDERR "\nThis plugin has no plugin specific options\n\n";
76
77}
78
79sub new {
80 my $class = shift (@_);
81 my $plugin_name = shift (@_);
82
83 my $self = {};
84 my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode)\$";
85
86 # general options available to all plugins
87 if (!parsargv::parse(\@_,
88 qq^input_encoding/$encodings/ascii^, \$self->{'input_encoding'},
89 q^process_exp/.*/^, \$self->{'process_exp'},
90 q^block_exp/.*/^, \$self->{'block_exp'},
91 q^extract_acronyms^, \$self->{'extract_acronyms'},
92 q^extract_language^, \$self->{'extract_language'},
93 "allow_extra_options")) {
94
95 print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
96 print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
97 &print_general_usage($plugin_name);
98 die "\n";
99 }
100
101 return bless $self, $class;
102}
103
104# initialize BasPlug options
105# if init() is overridden in a sub-class, remember to call BasPlug::init()
106sub init {
107 my $self = shift (@_);
108 my ($verbosity) = @_;
109
110 # verbosity is passed through from the processor
111 $self->{'verbosity'} = $verbosity;
112
113 # set process_exp and block_exp to defaults unless they were
114 # explicitly set
115
116 if ((!$self->is_recursive()) and
117 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
118
119 $self->{'process_exp'} = $self->get_default_process_exp ();
120 if ($self->{'process_exp'} eq "") {
121 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
122 }
123 }
124
125 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
126 $self->{'block_exp'} = $self->get_default_block_exp ();
127 }
128
129 # handle input_encoding aliases
130 $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";
131 $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";
132}
133
134sub begin {
135 my $self = shift (@_);
136 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
137}
138
139sub end {
140 my ($self) = @_;
141}
142
143# this function should be overridden to return 1
144# in recursive plugins
145sub is_recursive {
146 my $self = shift (@_);
147
148 return 0;
149}
150
151sub get_default_block_exp {
152 my $self = shift (@_);
153
154 return "";
155}
156
157sub get_default_process_exp {
158 my $self = shift (@_);
159
160 return "";
161}
162
163# The BasPlug read() function. This function does all the right things
164# to make general options work for a given plugin. It calls the process()
165# function which does all the work specific to a plugin (like the old
166# read functions used to do). Most plugins should define their own
167# process() function and let this read() function keep control.
168#
169# recursive plugins (e.g. RecPlug) and specialized plugins like those
170# capable of processing many documents within a single file (e.g.
171# GMLPlug) should normally implement their own version of read()
172#
173# Return number of files processed, undef if can't process
174# Note that $base_dir might be "" and that $file might
175# include directories
176
177sub read {
178 my $self = shift (@_);
179 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
180
181 if ($self->is_recursive()) {
182 die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
183 }
184
185 my $filename = &util::filename_cat($base_dir, $file);
186 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
187 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
188 return undef;
189 }
190 my $plugin_name = ref ($self);
191 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
192
193 # create a new document
194 my $doc_obj = new doc ($filename, "indexed_doc");
195
196 # read in file ($text will be in utf8)
197 my $text = "";
198 $self->read_file ($filename, \$text);
199
200 if ($text !~ /\w/) {
201 print STDERR "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
202 return 0;
203 }
204
205 # include any metadata passed in from previous plugins
206 # note that this metadata is associated with the top level section
207 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
208
209 # do plugin specific processing of doc_obj
210 return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
211
212 # do any automatic metadata extraction
213 $self->auto_extract_metadata ($doc_obj);
214
215 # add an OID
216 $doc_obj->set_OID();
217
218 # process the document
219 $processor->process($doc_obj);
220
221 return 1; # processed the file
222}
223
224# returns undef if file is rejected by the plugin
225sub process {
226 my $self = shift (@_);
227 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
228
229 die "Basplug::process function must be implemented in sub-class\n";
230
231 return undef; # never gets here
232}
233
234# uses the multiread package to read in the entire file pointed to
235# by filename and loads the resulting text into $$textref. Input text
236# may be in any of the encodings handled by multiread, output text
237# will be in utf8
238sub read_file {
239 my $self = shift (@_);
240 my ($filename, $textref) = @_;
241
242 $$textref = "";
243
244 open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
245
246 if ($self->{'input_encoding'} eq "ascii") {
247 undef $/;
248 $$textref = <FILE>;
249 $/ = "\n";
250 } else {
251 my $reader = new multiread();
252 $reader->set_handle ('BasPlug::FILE');
253 $reader->set_encoding ($self->{'input_encoding'});
254 $reader->read_file ($textref);
255
256 if ($self->{'input_encoding'} eq "gb") {
257 # segment the Chinese words
258 $$textref = &cnseg::segment($$textref);
259 }
260 }
261
262 close FILE;
263}
264
265# add any extra metadata that's been passed around from one
266# plugin to another.
267# extra_metadata uses add_utf8_metadata so it expects metadata values
268# to already be in utf8
269sub extra_metadata {
270 my $self = shift (@_);
271 my ($doc_obj, $cursection, $metadata) = @_;
272
273 foreach my $field (keys(%$metadata)) {
274 # $metadata->{$field} may be an array reference
275 if (ref ($metadata->{$field}) eq "ARRAY") {
276 map {
277 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
278 } @{$metadata->{$field}};
279 } else {
280 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
281 }
282 }
283}
284
285# extract acronyms (and hopefully other stuff soon too).
286sub auto_extract_metadata {
287 my $self = shift (@_);
288 my ($doc_obj) = @_;
289
290 if ($self->{'extract_acronyms'}) {
291 my $thissection = $doc_obj->get_top_section();
292 while (defined $thissection) {
293 my $text = $doc_obj->get_text($thissection);
294 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
295 $thissection = $doc_obj->get_next_section ($thissection);
296 }
297 }
298
299 if ($self->{'extract_language'}) {
300 my $thissection = $doc_obj->get_top_section();
301 while (defined $thissection) {
302 my $text = $doc_obj->get_text($thissection);
303 $self->extract_language (\$text, $doc_obj, $thissection) if $text =~ /./;
304 $thissection = $doc_obj->get_next_section ($thissection);
305 }
306 }
307
308}
309
310
311# Identify the language of a section and add it to the metadata
312sub extract_language {
313 my $self = shift (@_);
314 my ($textref, $doc_obj, $thissection) = @_;
315
316 # remove all HTML tags
317 my $text = $$textref;
318 $text =~ s/<P[^>]*>/\n/sgi;
319 $text =~ s/<H[^>]*>/\n/sgi;
320 $text =~ s/<[^>]*>//sgi;
321 $text =~ tr/\n/\n/s;
322
323 # get the language
324 my @results = textcat::classify($text);
325 @results = ("unknown") if ($#results > 2);
326
327 my $language = join(" or ", @results);
328 $doc_obj->add_utf8_metadata($thissection, "Language", $language);
329 print "Language: $language\n";
330
331}
332
333# extract acronyms from a section in a document. progress is
334# reported to STDERR based on the verbosity. both the Acronym
335# and the AcronymKWIC metadata items are created.
336
337sub extract_acronyms {
338 my $self = shift (@_);
339 my ($textref, $doc_obj, $thissection) = @_;
340
341 print STDERR " checking for acronyms ...\n"
342 if ($self->{'verbosity'} >= 2);
343
344 my $acro_array = &acronym::acronyms($textref);
345
346 foreach my $acro (@$acro_array) {
347
348 #check that this is the first time ...
349 my $seen_before = "false";
350 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
351 foreach my $thisAcro (@$previous_data) {
352 if ($thisAcro eq $acro->to_string())
353 {
354 $seen_before = "true";
355 print STDERR " already seen ". $acro->to_string() . "\n"
356 if ($self->{'verbosity'} >= 2);
357 }
358 }
359
360 if ($seen_before eq "false")
361 {
362 #do the normal acronym
363 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
364 print STDERR " adding ". $acro->to_string() . "\n"
365 if ($self->{'verbosity'} >= 1);
366
367 # do the KWIC (Key Word In Context) acronym
368 my @kwic = $acro->to_string_kwic();
369 foreach my $kwic (@kwic) {
370 $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC", $kwic);
371 print STDERR " adding ". $kwic . "\n"
372 if ($self->{'verbosity'} >= 2);
373 }
374 }
375 }
376 print STDERR " done with acronyms. \n"
377 if ($self->{'verbosity'} >= 2);
378}
379
3801;
Note: See TracBrowser for help on using the repository browser.