root/trunk/gsdl/perllib/plugins/BasPlug.pm @ 1379

Revision 1379, 12.9 KB (checked in by paynter, 19 years ago)

Fixed bug that gave gsdlsourcedocument metadata relative path instead
of absolute, and then didn't test if it existed, causing NULL hash values.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28use parsargv;
29use multiread;
30use cnseg;
31use acronym;
32use textcat;
33use strict;
34use doc;
35use diagnostics;
36
37sub print_general_usage {
38    my ($plugin_name) = @_;
39
40    print STDERR "\n  usage: plugin $plugin_name [options]\n\n";
41    print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
42    print STDERR "                     converted from these encodings and stored internally as\n";
43    print STDERR "                     utf8. The default input_encoding is ascii. Accepted values\n";
44    print STDERR "                     are:\n";
45    print STDERR "                        iso_8859_1 (extended ascii)\n";
46    print STDERR "                        Latin1 (the same as iso-8859-1)\n";
47    print STDERR "                        ascii (7 bit ascii -- may be faster than Latin1 as no\n";
48    print STDERR "                               conversion is neccessary)\n";
49    print STDERR "                        gb (GB or GBK simplified Chinese)\n";
50    print STDERR "                        iso_8859_6 (8 bit Arabic)\n";
51    print STDERR "                        windows_1256 (Windows codepage 1256 (Arabic))\n";
52    print STDERR "                        Arabic (the same as windows_1256)\n";
53    print STDERR "                        utf8 (either utf8 or unicode -- automatically detected)\n";
54    print STDERR "                        unicode (just unicode -- doesn't currently do endian\n";
55    print STDERR "                                 detection)\n";
56    print STDERR "   -process_exp      A perl regular expression to match against filenames.\n";
57    print STDERR "                     Matching filenames will be processed by this plugin.\n";
58    print STDERR "                     Each plugin has its own default process_exp. e.g HTMLPlug\n";
59    print STDERR "                     defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
60    print STDERR "                     .htm or .html (case-insensitive).\n";
61    print STDERR "   -block_exp        Files matching this regular expression will be blocked from\n";
62    print STDERR "                     being passed to any further plugins in the list. This has no\n";
63    print STDERR "                     real effect other than to prevent lots of warning messages\n";
64    print STDERR "                     about input files you don't care about. Each plugin may or may\n";
65    print STDERR "                     not have a default block_exp. e.g. by default HTMLPlug blocks\n";
66    print STDERR "                     any files with .gif, .jpg, .jpeg, .png, .pdf, .rtf or .css\n";
67    print STDERR "                     file extensions.\n";
68    print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
69    print STDERR "   -extract_langauge Identify the language of the text and set as metadata\n\n";
70}
71
72# print_usage should be overridden for any sub-classes having
73# their own plugin specific options
74sub print_usage {
75    print STDERR "\nThis plugin has no plugin specific options\n\n";
76
77}
78
79sub new {
80    my $class = shift (@_);
81    my $plugin_name = shift (@_);
82
83    my $self = {};
84    my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode)\$";
85
86    # general options available to all plugins
87    if (!parsargv::parse(\@_,
88             qq^input_encoding/$encodings/ascii^, \$self->{'input_encoding'},
89             q^process_exp/.*/^, \$self->{'process_exp'},
90             q^block_exp/.*/^, \$self->{'block_exp'},
91             q^extract_acronyms^, \$self->{'extract_acronyms'},
92             q^extract_language^, \$self->{'extract_language'},
93             "allow_extra_options")) {
94
95    print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
96    print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
97        &print_general_usage($plugin_name);
98    die "\n";
99    }
100
101    return bless $self, $class;
102}
103
104# initialize BasPlug options
105# if init() is overridden in a sub-class, remember to call BasPlug::init()
106sub init {
107    my $self = shift (@_);
108    my ($verbosity) = @_;
109
110    # verbosity is passed through from the processor
111    $self->{'verbosity'} = $verbosity;
112
113    # set process_exp and block_exp to defaults unless they were
114    # explicitly set
115
116    if ((!$self->is_recursive()) and
117    (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
118
119    $self->{'process_exp'} = $self->get_default_process_exp ();
120    if ($self->{'process_exp'} eq "") {
121        warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
122    }
123    }
124
125    if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
126    $self->{'block_exp'} = $self->get_default_block_exp ();
127    }
128   
129    # handle input_encoding aliases
130    $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";
131    $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";
132}
133
134sub begin {
135    my $self = shift (@_);
136    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
137}
138
139sub end {
140    my ($self) = @_;
141}
142
143# this function should be overridden to return 1
144# in recursive plugins
145sub is_recursive {
146    my $self = shift (@_);
147
148    return 0;
149}
150
151sub get_default_block_exp {
152    my $self = shift (@_);
153
154    return "";
155}
156
157sub get_default_process_exp {
158    my $self = shift (@_);
159
160    return "";
161}
162
163# The BasPlug read() function. This function does all the right things
164# to make general options work for a given plugin. It calls the process()
165# function which does all the work specific to a plugin (like the old
166# read functions used to do). Most plugins should define their own
167# process() function and let this read() function keep control.
168#
169# recursive plugins (e.g. RecPlug) and specialized plugins like those
170# capable of processing many documents within a single file (e.g.
171# GMLPlug) should normally implement their own version of read()
172#
173# Return number of files processed, undef if can't process
174# Note that $base_dir might be "" and that $file might
175# include directories
176
177sub read {
178    my $self = shift (@_);
179    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
180
181    if ($self->is_recursive()) {
182    die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
183    }
184
185    my $filename = &util::filename_cat($base_dir, $file);
186    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
187    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
188    return undef;
189    }
190    my $plugin_name = ref ($self);
191    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
192   
193    # create a new document
194    my $doc_obj = new doc ($filename, "indexed_doc");
195   
196    # read in file ($text will be in utf8)
197    my $text = "";
198    $self->read_file ($filename, \$text);
199
200    if ($text !~ /\w/) {
201    print STDERR "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
202    return 0;
203    }
204
205    # include any metadata passed in from previous plugins
206    # note that this metadata is associated with the top level section
207    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
208
209    # do plugin specific processing of doc_obj
210    return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
211
212    # do any automatic metadata extraction
213    $self->auto_extract_metadata ($doc_obj);
214
215    # add an OID
216    $doc_obj->set_OID();
217
218    # process the document
219    $processor->process($doc_obj);
220
221    return 1; # processed the file
222}
223
224# returns undef if file is rejected by the plugin
225sub process {
226    my $self = shift (@_);
227    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
228
229    die "Basplug::process function must be implemented in sub-class\n";
230
231    return undef; # never gets here
232}
233
234# uses the multiread package to read in the entire file pointed to
235# by filename and loads the resulting text into $$textref. Input text
236# may be in any of the encodings handled by multiread, output text
237# will be in utf8
238sub read_file {
239    my $self = shift (@_);
240    my ($filename, $textref) = @_;
241
242    $$textref = "";
243
244    open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
245
246    if ($self->{'input_encoding'} eq "ascii") {
247    undef $/;
248    $$textref = <FILE>;
249    $/ = "\n";
250    } else {
251    my $reader = new multiread();
252    $reader->set_handle ('BasPlug::FILE');
253    $reader->set_encoding ($self->{'input_encoding'});
254    $reader->read_file ($textref);
255
256    if ($self->{'input_encoding'} eq "gb") {
257        # segment the Chinese words
258        $$textref = &cnseg::segment($$textref);
259    }
260    }
261
262    close FILE;
263}
264
265# add any extra metadata that's been passed around from one
266# plugin to another.
267# extra_metadata uses add_utf8_metadata so it expects metadata values
268# to already be in utf8
269sub extra_metadata {
270    my $self = shift (@_);
271    my ($doc_obj, $cursection, $metadata) = @_;
272
273    foreach my $field (keys(%$metadata)) {
274    # $metadata->{$field} may be an array reference
275    if (ref ($metadata->{$field}) eq "ARRAY") {
276        map {
277        $doc_obj->add_utf8_metadata ($cursection, $field, $_);
278        } @{$metadata->{$field}};
279    } else {
280        $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
281    }
282    }
283}
284
285# extract acronyms (and hopefully other stuff soon too).
286sub auto_extract_metadata {
287    my $self = shift (@_);
288    my ($doc_obj) = @_;
289
290    if ($self->{'extract_acronyms'}) {
291    my $thissection = $doc_obj->get_top_section();
292    while (defined $thissection) {
293        my $text = $doc_obj->get_text($thissection);
294        $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
295        $thissection = $doc_obj->get_next_section ($thissection);
296    }
297    }
298
299    if ($self->{'extract_language'}) {
300    my $thissection = $doc_obj->get_top_section();
301    while (defined $thissection) {
302        my $text = $doc_obj->get_text($thissection);
303        $self->extract_language (\$text, $doc_obj, $thissection) if $text =~ /./;
304        $thissection = $doc_obj->get_next_section ($thissection);
305    }
306    }
307
308}
309
310
311# Identify the language of a section and add it to the metadata
312sub extract_language {
313    my $self = shift (@_);
314    my ($textref, $doc_obj, $thissection) = @_;
315
316    # remove all HTML tags
317    my $text = $$textref;
318    $text =~ s/<P[^>]*>/\n/sgi;
319    $text =~ s/<H[^>]*>/\n/sgi;
320    $text =~ s/<[^>]*>//sgi;
321    $text =~ tr/\n/\n/s;
322
323    # get the language
324    my @results = textcat::classify($text);
325    @results = ("unknown") if ($#results > 2);
326
327    my $language = join(" or ", @results);
328    $doc_obj->add_utf8_metadata($thissection, "Language",  $language);
329    print "Language: $language\n";
330
331}
332
333# extract acronyms from a section in a document. progress is
334# reported to STDERR based on the verbosity. both the Acronym
335# and the AcronymKWIC metadata items are created.
336
337sub extract_acronyms {
338    my $self = shift (@_);
339    my ($textref, $doc_obj, $thissection) = @_;
340
341    print STDERR " checking for acronyms ...\n"
342    if ($self->{'verbosity'} >= 2);
343
344    my $acro_array =  &acronym::acronyms($textref);
345   
346    foreach my $acro (@$acro_array) {
347
348    #check that this is the first time ...
349    my $seen_before = "false";
350    my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
351    foreach my $thisAcro (@$previous_data) {
352        if ($thisAcro eq $acro->to_string())
353        {
354        $seen_before = "true";
355        print STDERR "  already seen ". $acro->to_string() . "\n"
356            if ($self->{'verbosity'} >= 2);
357        }       
358    }
359
360    if ($seen_before eq "false")
361    {
362        #do the normal acronym
363        $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string());
364        print STDERR "  adding ". $acro->to_string() . "\n"
365            if ($self->{'verbosity'} >= 1);
366       
367        # do the KWIC (Key Word In Context) acronym
368        my @kwic = $acro->to_string_kwic();
369        foreach my $kwic (@kwic) {
370        $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC",  $kwic);
371        print STDERR "   adding ".  $kwic . "\n"
372            if ($self->{'verbosity'} >= 2);
373        }
374    }
375    }
376    print STDERR " done with acronyms. \n"
377    if ($self->{'verbosity'} >= 2);
378}
379
3801;
Note: See TracBrowser for help on using the browser.