Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 1379

Last change on this file since 1379 was 1379, checked in by paynter, 24 years ago
Fixed bug that gave gsdlsourcedocument metadata relative path instead of absolute, and then didn't test if it existed, causing NULL hash values.
Property svn:keywords set to `Author Date Id Revision`
File size: 12.9 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	use parsargv;
29	use multiread;
30	use cnseg;
31	use acronym;
32	use textcat;
33	use strict;
34	use doc;
35	use diagnostics;
36
37	sub print_general_usage {
38	my ($plugin_name) = @_;
39
40	print STDERR "\n usage: plugin $plugin_name [options]\n\n";
41	print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
42	print STDERR " converted from these encodings and stored internally as\n";
43	print STDERR " utf8. The default input_encoding is ascii. Accepted values\n";
44	print STDERR " are:\n";
45	print STDERR " iso_8859_1 (extended ascii)\n";
46	print STDERR " Latin1 (the same as iso-8859-1)\n";
47	print STDERR " ascii (7 bit ascii -- may be faster than Latin1 as no\n";
48	print STDERR " conversion is neccessary)\n";
49	print STDERR " gb (GB or GBK simplified Chinese)\n";
50	print STDERR " iso_8859_6 (8 bit Arabic)\n";
51	print STDERR " windows_1256 (Windows codepage 1256 (Arabic))\n";
52	print STDERR " Arabic (the same as windows_1256)\n";
53	print STDERR " utf8 (either utf8 or unicode -- automatically detected)\n";
54	print STDERR " unicode (just unicode -- doesn't currently do endian\n";
55	print STDERR " detection)\n";
56	print STDERR " -process_exp A perl regular expression to match against filenames.\n";
57	print STDERR " Matching filenames will be processed by this plugin.\n";
58	print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n";
59	print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
60	print STDERR " .htm or .html (case-insensitive).\n";
61	print STDERR " -block_exp Files matching this regular expression will be blocked from\n";
62	print STDERR " being passed to any further plugins in the list. This has no\n";
63	print STDERR " real effect other than to prevent lots of warning messages\n";
64	print STDERR " about input files you don't care about. Each plugin may or may\n";
65	print STDERR " not have a default block_exp. e.g. by default HTMLPlug blocks\n";
66	print STDERR " any files with .gif, .jpg, .jpeg, .png, .pdf, .rtf or .css\n";
67	print STDERR " file extensions.\n";
68	print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
69	print STDERR " -extract_langauge Identify the language of the text and set as metadata\n\n";
70	}
71
72	# print_usage should be overridden for any sub-classes having
73	# their own plugin specific options
74	sub print_usage {
75	print STDERR "\nThis plugin has no plugin specific options\n\n";
76
77	}
78
79	sub new {
80	my $class = shift (@_);
81	my $plugin_name = shift (@_);
82
83	my $self = {};
84	my $encodings = "^(iso_8859_1\|Latin1\|ascii\|gb\|iso_8859_6\|windows_1256\|Arabic\|utf8\|unicode)\$";
85
86	# general options available to all plugins
87	if (!parsargv::parse(\@_,
88	qq^input_encoding/$encodings/ascii^, \$self->{'input_encoding'},
89	q^process_exp/.*/^, \$self->{'process_exp'},
90	q^block_exp/.*/^, \$self->{'block_exp'},
91	q^extract_acronyms^, \$self->{'extract_acronyms'},
92	q^extract_language^, \$self->{'extract_language'},
93	"allow_extra_options")) {
94
95	print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
96	print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
97	&print_general_usage($plugin_name);
98	die "\n";
99	}
100
101	return bless $self, $class;
102	}
103
104	# initialize BasPlug options
105	# if init() is overridden in a sub-class, remember to call BasPlug::init()
106	sub init {
107	my $self = shift (@_);
108	my ($verbosity) = @_;
109
110	# verbosity is passed through from the processor
111	$self->{'verbosity'} = $verbosity;
112
113	# set process_exp and block_exp to defaults unless they were
114	# explicitly set
115
116	if ((!$self->is_recursive()) and
117	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
118
119	$self->{'process_exp'} = $self->get_default_process_exp ();
120	if ($self->{'process_exp'} eq "") {
121	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
122	}
123	}
124
125	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
126	$self->{'block_exp'} = $self->get_default_block_exp ();
127	}
128
129	# handle input_encoding aliases
130	$self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";
131	$self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";
132	}
133
134	sub begin {
135	my $self = shift (@_);
136	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
137	}
138
139	sub end {
140	my ($self) = @_;
141	}
142
143	# this function should be overridden to return 1
144	# in recursive plugins
145	sub is_recursive {
146	my $self = shift (@_);
147
148	return 0;
149	}
150
151	sub get_default_block_exp {
152	my $self = shift (@_);
153
154	return "";
155	}
156
157	sub get_default_process_exp {
158	my $self = shift (@_);
159
160	return "";
161	}
162
163	# The BasPlug read() function. This function does all the right things
164	# to make general options work for a given plugin. It calls the process()
165	# function which does all the work specific to a plugin (like the old
166	# read functions used to do). Most plugins should define their own
167	# process() function and let this read() function keep control.
168	#
169	# recursive plugins (e.g. RecPlug) and specialized plugins like those
170	# capable of processing many documents within a single file (e.g.
171	# GMLPlug) should normally implement their own version of read()
172	#
173	# Return number of files processed, undef if can't process
174	# Note that $base_dir might be "" and that $file might
175	# include directories
176
177	sub read {
178	my $self = shift (@_);
179	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
180
181	if ($self->is_recursive()) {
182	die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
183	}
184
185	my $filename = &util::filename_cat($base_dir, $file);
186	return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
187	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
188	return undef;
189	}
190	my $plugin_name = ref ($self);
191	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
192
193	# create a new document
194	my $doc_obj = new doc ($filename, "indexed_doc");
195
196	# read in file ($text will be in utf8)
197	my $text = "";
198	$self->read_file ($filename, \$text);
199
200	if ($text !~ /\w/) {
201	print STDERR "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
202	return 0;
203	}
204
205	# include any metadata passed in from previous plugins
206	# note that this metadata is associated with the top level section
207	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
208
209	# do plugin specific processing of doc_obj
210	return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
211
212	# do any automatic metadata extraction
213	$self->auto_extract_metadata ($doc_obj);
214
215	# add an OID
216	$doc_obj->set_OID();
217
218	# process the document
219	$processor->process($doc_obj);
220
221	return 1; # processed the file
222	}
223
224	# returns undef if file is rejected by the plugin
225	sub process {
226	my $self = shift (@_);
227	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
228
229	die "Basplug::process function must be implemented in sub-class\n";
230
231	return undef; # never gets here
232	}
233
234	# uses the multiread package to read in the entire file pointed to
235	# by filename and loads the resulting text into $$textref. Input text
236	# may be in any of the encodings handled by multiread, output text
237	# will be in utf8
238	sub read_file {
239	my $self = shift (@_);
240	my ($filename, $textref) = @_;
241
242	$$textref = "";
243
244	open (FILE, $filename) \|\| die "BasPlug::read_file could not open $filename for reading ($!)\n";
245
246	if ($self->{'input_encoding'} eq "ascii") {
247	undef $/;
248	$$textref = <FILE>;
249	$/ = "\n";
250	} else {
251	my $reader = new multiread();
252	$reader->set_handle ('BasPlug::FILE');
253	$reader->set_encoding ($self->{'input_encoding'});
254	$reader->read_file ($textref);
255
256	if ($self->{'input_encoding'} eq "gb") {
257	# segment the Chinese words
258	$$textref = &cnseg::segment($$textref);
259	}
260	}
261
262	close FILE;
263	}
264
265	# add any extra metadata that's been passed around from one
266	# plugin to another.
267	# extra_metadata uses add_utf8_metadata so it expects metadata values
268	# to already be in utf8
269	sub extra_metadata {
270	my $self = shift (@_);
271	my ($doc_obj, $cursection, $metadata) = @_;
272
273	foreach my $field (keys(%$metadata)) {
274	# $metadata->{$field} may be an array reference
275	if (ref ($metadata->{$field}) eq "ARRAY") {
276	map {
277	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
278	} @{$metadata->{$field}};
279	} else {
280	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
281	}
282	}
283	}
284
285	# extract acronyms (and hopefully other stuff soon too).
286	sub auto_extract_metadata {
287	my $self = shift (@_);
288	my ($doc_obj) = @_;
289
290	if ($self->{'extract_acronyms'}) {
291	my $thissection = $doc_obj->get_top_section();
292	while (defined $thissection) {
293	my $text = $doc_obj->get_text($thissection);
294	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
295	$thissection = $doc_obj->get_next_section ($thissection);
296	}
297	}
298
299	if ($self->{'extract_language'}) {
300	my $thissection = $doc_obj->get_top_section();
301	while (defined $thissection) {
302	my $text = $doc_obj->get_text($thissection);
303	$self->extract_language (\$text, $doc_obj, $thissection) if $text =~ /./;
304	$thissection = $doc_obj->get_next_section ($thissection);
305	}
306	}
307
308	}
309
310
311	# Identify the language of a section and add it to the metadata
312	sub extract_language {
313	my $self = shift (@_);
314	my ($textref, $doc_obj, $thissection) = @_;
315
316	# remove all HTML tags
317	my $text = $$textref;
318	$text =~ s/<P[^>]*>/\n/sgi;
319	$text =~ s/<H[^>]*>/\n/sgi;
320	$text =~ s/<[^>]*>//sgi;
321	$text =~ tr/\n/\n/s;
322
323	# get the language
324	my @results = textcat::classify($text);
325	@results = ("unknown") if ($#results > 2);
326
327	my $language = join(" or ", @results);
328	$doc_obj->add_utf8_metadata($thissection, "Language", $language);
329	print "Language: $language\n";
330
331	}
332
333	# extract acronyms from a section in a document. progress is
334	# reported to STDERR based on the verbosity. both the Acronym
335	# and the AcronymKWIC metadata items are created.
336
337	sub extract_acronyms {
338	my $self = shift (@_);
339	my ($textref, $doc_obj, $thissection) = @_;
340
341	print STDERR " checking for acronyms ...\n"
342	if ($self->{'verbosity'} >= 2);
343
344	my $acro_array = &acronym::acronyms($textref);
345
346	foreach my $acro (@$acro_array) {
347
348	#check that this is the first time ...
349	my $seen_before = "false";
350	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
351	foreach my $thisAcro (@$previous_data) {
352	if ($thisAcro eq $acro->to_string())
353	{
354	$seen_before = "true";
355	print STDERR " already seen ". $acro->to_string() . "\n"
356	if ($self->{'verbosity'} >= 2);
357	}
358	}
359
360	if ($seen_before eq "false")
361	{
362	#do the normal acronym
363	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
364	print STDERR " adding ". $acro->to_string() . "\n"
365	if ($self->{'verbosity'} >= 1);
366
367	# do the KWIC (Key Word In Context) acronym
368	my @kwic = $acro->to_string_kwic();
369	foreach my $kwic (@kwic) {
370	$doc_obj->add_utf8_metadata($thissection, "AcronymKWIC", $kwic);
371	print STDERR " adding ". $kwic . "\n"
372	if ($self->{'verbosity'} >= 2);
373	}
374	}
375	}
376	print STDERR " done with acronyms. \n"
377	if ($self->{'verbosity'} >= 2);
378	}
379
380	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: