Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 1838

Last change on this file since 1838 was 1838, checked in by sjboddie, 23 years ago
Added support for Cyrillic languages (windows codepage 1251) - yet to be tested by anyone who understands any such language though ;)
Property svn:keywords set to `Author Date Id Revision`
File size: 17.6 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# BasPlug.pm -- base class for all the import plugins
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
[4]	25
	26	package BasPlug;
	27
[1219]	28	use parsargv;
	29	use multiread;
	30	use cnseg;
[1242]	31	use acronym;
[1317]	32	use textcat;
[1242]	33	use doc;
[1360]	34	use diagnostics;
[1411]	35	use DateExtract;
[4]	36
# Print to STDERR the usage text for the general options that every
# plugin accepts.
#
#   $plugin_name - name of the plugin, shown in the "usage:" line
#
# NOTE(review): the column alignment inside these strings could not be
# confirmed from the reviewed copy -- check spacing against the repository.
sub print_general_usage {
    my ($plugin_name) = @_;

    # join() keeps the output independent of the $, output field separator
    print STDERR join ('',
        "\n usage: plugin $plugin_name [options]\n\n",
        " -input_encoding The encoding of the source documents. Documents will be\n",
        " converted from these encodings and stored internally as\n",
        " utf8. The default input_encoding is ascii. Accepted values\n",
        " are:\n",
        " iso_8859_1 (extended ascii)\n",
        " Latin1 (the same as iso-8859-1)\n",
        " ascii (7 bit ascii -- may be faster than Latin1 as no\n",
        " conversion is necessary)\n",
        " gb (GB or GBK simplified Chinese)\n",
        " iso_8859_6 (8 bit Arabic)\n",
        " windows_1256 (Windows codepage 1256 (Arabic))\n",
        " Arabic (the same as windows_1256)\n",
        " utf8 (either utf8 or unicode -- automatically detected)\n",
        " unicode (just unicode -- doesn't currently do endian\n",
        " detection)\n",
        " windows_1251 (Windows codepage 1251 (Cyrillic))\n",
        " -process_exp A perl regular expression to match against filenames.\n",
        " Matching filenames will be processed by this plugin.\n",
        " Each plugin has its own default process_exp. e.g HTMLPlug\n",
        # "\\." so that the backslash of the example regex is really printed
        " defaults to '(?i)\\.html?\$' i.e. all documents ending in\n",
        " .htm or .html (case-insensitive).\n",
        " -block_exp Files matching this regular expression will be blocked from\n",
        " being passed to any further plugins in the list. This has no\n",
        " real effect other than to prevent lots of warning messages\n",
        " about input files you don't care about. Each plugin may or may\n",
        " not have a default block_exp. e.g. by default HTMLPlug blocks\n",
        " any files with .gif, .jpg, .jpeg, .png, .rtf or .css\n",
        " file extensions.\n",
        " -extract_acronyms Extract acronyms from within text and set as metadata\n\n",
        " -markup_acronyms Added acronym metadata into document text\n\n",
        # the option parsed by new() is spelled "extract_language"
        " -extract_language Identify the language of the text and set as metadata\n\n",
        " -first Comma separated list of first sizes to extract from the text \n",
        " into a metadata field. The fields are called 'FirstNNN'.\n",
        " -extract_email Extract email addresses as metadata\n\n",
        " -extract_date Extract dates pertaining to the content of documents about history\n\n");
}
	77
# Default plugin-specific usage message, written to STDERR.
# Sub-classes that have their own plugin specific options should
# override print_usage to describe them.
sub print_usage {
    my $message = "\nThis plugin has no plugin specific options\n\n";
    print STDERR $message;
}
	84
# Constructor.  Takes the class name and the plugin name; whatever
# arguments remain are handed to parsargv::parse as the plugin's option
# list.  Parsed general options are stored in the new object's hash.
# Dies (after printing the general usage) when parsargv::parse reports
# failure.
sub new {
    my $class       = shift;
    my $plugin_name = shift;

    my $self = {};

    # anchored alternation of every accepted -input_encoding value
    my @encoding_names = qw(iso_8859_1 Latin1 ascii gb iso_8859_6
                            windows_1256 Arabic utf8 unicode windows_1251);
    my $encodings = '^(' . join ('|', @encoding_names) . ')$';

    $self->{'outhandle'} = STDERR;

    # the current year is the default for -maximum_date
    my @now  = localtime (time);
    my $year = $now[5] + 1900;

    # general options available to all plugins: a spec string followed by
    # a reference to the slot that receives the parsed value
    my @option_specs = (
        "input_encoding/$encodings/ascii", \$self->{'input_encoding'},
        'process_exp/.*/',                 \$self->{'process_exp'},
        'block_exp/.*/',                   \$self->{'block_exp'},
        'extract_acronyms',                \$self->{'extract_acronyms'},
        'extract_email',                   \$self->{'extract_email'},
        'markup_acronyms',                 \$self->{'markup_acronyms'},
        'extract_language',                \$self->{'extract_language'},
        'first/.*/',                       \$self->{'first'},
        'extract_date',                    \$self->{'date_extract'},
        'maximum_date/\d{4}/' . $year,     \$self->{'max_year'},
        'no_bibliography',                 \$self->{'no_biblio'},
        'maximum_century/-?\d{1,2}( ?B\.C\.E\.)?/-1',
                                           \$self->{'max_century'},
        'allow_extra_options',
    );

    unless (parsargv::parse (\@_, @option_specs)) {
        print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
        print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
        print_general_usage ($plugin_name);
        die "\n";
    }

    return bless $self, $class;
}
	119
# Initialise the BasPlug options of an already constructed plugin.
# If init() is overridden in a sub-class, remember to call BasPlug::init().
#
#   $verbosity - passed through from the processor and stored as is
#   $outhandle - replaces the stored output handle when defined
sub init {
    my ($self, $verbosity, $outhandle) = @_;

    $self->{'verbosity'} = $verbosity;
    if (defined $outhandle) {
        $self->{'outhandle'} = $outhandle;
    }

    # process_exp falls back to the plugin's default unless it was set
    # explicitly; recursive plugins are left alone
    unless ($self->is_recursive()) {
        my $current = $self->{'process_exp'};
        if (!defined $current || $current eq "") {
            $self->{'process_exp'} = $self->get_default_process_exp ();
            warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n"
                if $self->{'process_exp'} eq "";
        }
    }

    # block_exp falls back to its default the same way, for every plugin
    my $blocker = $self->{'block_exp'};
    unless (defined $blocker && $blocker ne "") {
        $self->{'block_exp'} = $self->get_default_block_exp ();
    }

    # handle input_encoding aliases
    my %canonical_name = ('Latin1' => 'iso_8859_1',
                          'Arabic' => 'windows_1256');
    my $encoding = $self->{'input_encoding'};
    $self->{'input_encoding'} = $canonical_name{$encoding}
        if exists $canonical_name{$encoding};
}
	152
# Start-of-run hook: sets up the metadata extractors through
# initialise_extractors().  The remaining arguments are accepted but
# not used here.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    return $self->initialise_extractors();
}
	158
	159	sub end {
	160	my ($self) = @_;
[1396]	161	$self->finalise_extractors();
[839]	162	}
	163
# Tells whether this plugin is recursive.  The base class answers 0;
# recursive plugins should override this to return 1.
sub is_recursive {
    my ($self) = @_;

    return 0;
}
	171
# Default block_exp used by init() when none was given explicitly.
# The base class has none, so the empty string is returned.
sub get_default_block_exp {
    my ($self) = @_;

    return "";
}
	177
	178	sub get_default_process_exp {
	179	my $self = shift (@_);
	180
	181	return "";
	182	}
	183
	184	# The BasPlug read() function. This function does all the right things
	185	# to make general options work for a given plugin. It calls the process()
	186	# function which does all the work specific to a plugin (like the old
	187	# read functions used to do). Most plugins should define their own
	188	# process() function and let this read() function keep control.
	189	#
[1244]	190	# recursive plugins (e.g. RecPlug) and specialized plugins like those
	191	# capable of processing many documents within a single file (e.g.
	192	# GMLPlug) should normally implement their own version of read()
	193	#
[1242]	194	# Return number of files processed, undef if can't process
[4]	195	# Note that $base_dir might be "" and that $file might
	196	# include directories
[1242]	197
[4]	198	sub read {
	199	my $self = shift (@_);
[317]	200	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
[4]	201
[1242]	202	if ($self->is_recursive()) {
	203	die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
	204	}
[4]	205
[1242]	206	my $filename = &util::filename_cat($base_dir, $file);
[1244]	207	return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
[1242]	208	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
	209	return undef;
	210	}
	211	my $plugin_name = ref ($self);
	212	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
	213
	214	# create a new document
[1379]	215	my $doc_obj = new doc ($filename, "indexed_doc");
[1242]	216
	217	# read in file ($text will be in utf8)
	218	my $text = "";
	219	$self->read_file ($filename, \$text);
	220
	221	if ($text !~ /\w/) {
[1424]	222	my $outhandle = $self->{'outhandle'};
	223	print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
[1242]	224	return 0;
	225	}
	226
	227	# include any metadata passed in from previous plugins
	228	# note that this metadata is associated with the top level section
	229	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
	230
	231	# do plugin specific processing of doc_obj
[1244]	232	return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
[1242]	233
	234	# do any automatic metadata extraction
	235	$self->auto_extract_metadata ($doc_obj);
	236
	237	# add an OID
	238	$doc_obj->set_OID();
	239
	240	# process the document
	241	$processor->process($doc_obj);
	242
	243	return 1; # processed the file
[4]	244	}
	245
# Plugin specific processing of one document; an implementation returns
# undef if the file is rejected by the plugin.  The base class version
# only dies, so every sub-class relying on BasPlug's read() must
# provide its own.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    die "Basplug::process function must be implemented in sub-class\n";
}
	255
# Uses the multiread package to read in the entire file pointed to
# by filename and loads the resulting text into $$textref. Input text
# may be in any of the encodings handled by multiread, output text
# will be in utf8.
#
#   $filename - path of the file to read
#   $textref  - reference to the scalar that receives the text
#
# When the file fails the -r test a message goes to STDERR and the sub
# returns with $$textref untouched. Dies if the file cannot be opened.
sub read_file {
    my $self = shift (@_);
    my ($filename, $textref) = @_;

    if (!-r $filename) {
        print STDERR "Read permission denied for $filename\n";
        return;
    }

    $$textref = "";

    # three-argument open: characters such as '>' or '|' in the file name
    # are taken literally instead of being read as an open mode
    open (FILE, '<', $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";

    if ($self->{'input_encoding'} eq "ascii") {
        # slurp the whole file; local restores the caller's $/ on scope
        # exit instead of forcing it back to "\n"
        local $/;
        $$textref = <FILE>;
    } else {
        my $reader = multiread->new();
        # the handle is passed to multiread by name
        $reader->set_handle ('BasPlug::FILE');
        $reader->set_encoding ($self->{'input_encoding'});
        $reader->read_file ($textref);

        if ($self->{'input_encoding'} eq "gb") {
            # segment the Chinese words
            $$textref = &cnseg::segment($$textref);
        }
    }

    close FILE;
}
	292
	293	# add any extra metadata that's been passed around from one
	294	# plugin to another.
	295	# extra_metadata uses add_utf8_metadata so it expects metadata values
	296	# to already be in utf8
	297	sub extra_metadata {
	298	my $self = shift (@_);
	299	my ($doc_obj, $cursection, $metadata) = @_;
	300
	301	foreach my $field (keys(%$metadata)) {
[839]	302	# $metadata->{$field} may be an array reference
	303	if (ref ($metadata->{$field}) eq "ARRAY") {
	304	map {
[1219]	305	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
[839]	306	} @{$metadata->{$field}};
	307	} else {
[1219]	308	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
[839]	309	}
	310	}
	311	}
	312
# Initialise the metadata extractors.  Only the acronym package needs
# setting up here, and only when acronyms are to be extracted or
# marked up.
sub initialise_extractors {
    my ($self) = @_;

    acronym::initialise_acronyms()
        if $self->{'extract_acronyms'} || $self->{'markup_acronyms'};
}
	321
	322	# finalise metadata extractors
	323	sub finalise_extractors {
	324	my $self = shift (@_);
	325
	326	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
	327	&acronym::finalise_acronyms();
	328	}
	329	}
	330
# FIRSTNNN: extract the first NNN characters as metadata.
#
# For every size in the comma separated list $self->{'first'} a metadata
# field "First<size>" is added to $thissection. The text is trimmed and
# its whitespace runs are collapsed to single spaces first. Text that is
# longer than the requested size is cut at that size, then cut back to
# the last whitespace and finished with an ellipsis. Text that already
# fits is stored unchanged.
#
#   $textref     - reference to the section text
#   $doc_obj     - document object receiving the metadata
#   $thissection - section the metadata is attached to
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    # the normalised text is the same for every size, so build it once
    my $cleantext = $$textref;
    $cleantext =~ s/^\s+//;
    $cleantext =~ s/\s+$//;
    $cleantext =~ s/\s+/ /gs;

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = $cleantext;

        # only shorten (and mark with an ellipsis) text that really is
        # longer than the requested size
        if (length ($tmptext) > $size) {
            $tmptext = substr ($tmptext, 0, $size);
            # NOTE(review): the ellipsis may be the HTML entity for
            # U+2026 in the repository copy -- confirm before merging
            $tmptext =~ s/\s\S*$/…/;
        }

        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
	346
	347	sub extract_email {
	348	my $self = shift (@_);
	349	my ($textref, $doc_obj, $thissection) = @_;
	350	my $outhandle = $self->{'outhandle'};
	351
	352	print $outhandle " extracting email addresses ...\n"
	353	if ($self->{'verbosity'} >= 2);
	354
	355	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|[a-z][a-z]))/g);
	356	@email = sort @email;
	357
	358	my @email2 = ();
	359	foreach my $address (@email) {
	360	if (!(join(" ",@email2) =~ m/$address/ )) {
	361	push @email2, $address;
	362	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
	363	print $outhandle " extracting $address\n"
	364	if ($self->{'verbosity'} >= 3);
	365	}
	366	}
	367	print $outhandle " done extracting email addresses.\n"
	368	if ($self->{'verbosity'} >= 2);
	369
	370	}
	371
	372	# extract metadata
[1242]	373	sub auto_extract_metadata {
	374	my $self = shift (@_);
	375	my ($doc_obj) = @_;
[1602]	376
	377	if ($self->{'extract_email'}) {
	378	my $thissection = $doc_obj->get_top_section();
	379	while (defined $thissection) {
	380	my $text = $doc_obj->get_text($thissection);
	381	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
	382	$thissection = $doc_obj->get_next_section ($thissection);
	383	}
	384	}
	385	if ($self->{'first'}) {
	386	my $thissection = $doc_obj->get_top_section();
	387	while (defined $thissection) {
	388	my $text = $doc_obj->get_text($thissection);
	389	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
	390	$thissection = $doc_obj->get_next_section ($thissection);
	391	}
	392	}
	393
[1242]	394	if ($self->{'extract_acronyms'}) {
	395	my $thissection = $doc_obj->get_top_section();
	396	while (defined $thissection) {
	397	my $text = $doc_obj->get_text($thissection);
	398	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
	399	$thissection = $doc_obj->get_next_section ($thissection);
	400	}
	401	}
[1602]	402
[1393]	403	if ($self->{'markup_acronyms'}) {
	404	my $thissection = $doc_obj->get_top_section();
	405	while (defined $thissection) {
	406	my $text = $doc_obj->get_text($thissection);
	407	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
	408	$doc_obj->delete_text($thissection);
	409	$doc_obj->add_text($thissection, $text);
	410	$thissection = $doc_obj->get_next_section ($thissection);
	411	}
	412	}
	413
[1411]	414	if($self->{'date_extract'}) {
	415	my $thissection = $doc_obj->get_top_section();
	416	while (defined $thissection) {
	417
	418	my $text = $doc_obj->get_text($thissection);
	419	&DateExtract::get_date_metadata($text, $doc_obj,
	420	$thissection,
	421	$self->{'no_biblio'},
	422	$self->{'max_year'},
	423	$self->{'max_century'});
	424	$thissection = $doc_obj->get_next_section ($thissection);
	425	}
	426	}
	427
[1317]	428	if ($self->{'extract_language'}) {
	429	my $thissection = $doc_obj->get_top_section();
	430	while (defined $thissection) {
	431	my $text = $doc_obj->get_text($thissection);
	432	$self->extract_language (\$text, $doc_obj, $thissection) if $text =~ /./;
	433	$thissection = $doc_obj->get_next_section ($thissection);
	434	}
	435	}
	436
[1242]	437	}
	438
[1317]	439
	440	# Identify the language of a section and add it to the metadata
	441	sub extract_language {
	442	my $self = shift (@_);
	443	my ($textref, $doc_obj, $thissection) = @_;
	444
	445	# remove all HTML tags
	446	my $text = $$textref;
	447	$text =~ s/<P[^>]*>/\n/sgi;
	448	$text =~ s/<H[^>]*>/\n/sgi;
	449	$text =~ s/<[^>]*>//sgi;
	450	$text =~ tr/\n/\n/s;
	451
	452	# get the language
	453	my @results = textcat::classify($text);
	454	@results = ("unknown") if ($#results > 2);
	455
[1384]	456	# create language string and remove encoding information
[1317]	457	my $language = join(" or ", @results);
[1384]	458	$language =~ s/\-\w+//g;
[1317]	459	$doc_obj->add_utf8_metadata($thissection, "Language", $language);
[1384]	460	# print "Language: ", time, "-> $language\n";
[1317]	461
	462	}
	463
# Extract acronyms from a section in a document. Progress is reported
# to outhandle based on the verbosity. Every acronym returned by
# acronym::acronyms that is not yet among the section's "Acronym"
# metadata is written out through its write_to_file method and added as
# "Acronym" metadata.
sub extract_acronyms {
    my ($self, $textref, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} >= 2) {
        print $outhandle " extracting acronyms ...\n";
    }

    my $acro_array = &acronym::acronyms($textref);

    for my $acro (@$acro_array) {

        # check that this is the first time ...
        my $is_new = 1;
        my $recorded = $doc_obj->get_metadata($thissection, "Acronym");
        for my $known (@$recorded) {
            next unless $known eq $acro->to_string();

            $is_new = 0;
            print $outhandle " already seen ". $acro->to_string() . "\n"
                if $self->{'verbosity'} >= 4;
        }
        next unless $is_new;

        # write it to the file ...
        $acro->write_to_file();

        # do the normal acronym
        $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
        print $outhandle " adding ". $acro->to_string() . "\n"
            if $self->{'verbosity'} >= 3;
    }

    if ($self->{'verbosity'} >= 2) {
        print $outhandle " done extracting acronyms. \n";
    }
}
	505
# Return $text as marked up by acronym::markup_acronyms. Progress is
# reported to outhandle based on the verbosity. $doc_obj and
# $thissection are accepted but not used here.
sub markup_acronyms {
    my ($self, $text, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} >= 2) {
        print $outhandle " marking up acronyms ...\n";
    }

    # self is passed in to check for verbosity ...
    my $marked_up = &acronym::markup_acronyms($text, $self);

    if ($self->{'verbosity'} >= 2) {
        print $outhandle " done marking up acronyms. \n";
    }

    return $marked_up;
}
	522
[4]	523	1;
[1602]	524
	525
	526

Note: See TracBrowser for help on using the repository browser.

Download in other formats: