Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 1844

Last change on this file since 1844 was 1844, checked in by sjboddie, 23 years ago
Added an 'auto' argument to BasPlug's '-input_encoding' option ('auto' is now the default instead of 'ascii'). When -input_encoding is 'auto', textcat is used to work out the language and encoding of each document prior to processing it. This allows for documents within the same collection to be in different encodings and all be imported correctly (as long as they're in an encoding that's supported - notable exceptions at the moment are Big5 Chinese and any kind of Japanese). Doing things this way means each document is read in twice at import time, no doubt slowing things down considerably. You can therefore still set -input_encoding explicitly if you know that all your documents are in a particular encoding.
Property svn:keywords set to `Author Date Id Revision`
File size: 23.3 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# BasPlug.pm -- base class for all the import plugins
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
[4]	25
	26	package BasPlug;
	27
[1219]	28	use parsargv;
	29	use multiread;
	30	use cnseg;
[1242]	31	use acronym;
[1317]	32	use textcat;
[1242]	33	use doc;
[1360]	34	use diagnostics;
[1411]	35	use DateExtract;
[1844]	36	use iso639;
[4]	37
# Encodings that can be named by textcat and handled by this plugin.
# If textcat reports an encoding that is not a key of this hash,
# get_language_encoding prints a warning and uses the default encoding
# instead. Only the keys are significant; every value is the empty string.
our %supported_encodings = map { $_ => "" } qw(
    ascii
    iso_8859_1 windows_1252
    iso_8859_2 windows_1250
    iso_8859_3
    iso_8859_4
    iso_8859_5 windows_1251 koi8_r koi8_u
    iso_8859_6 windows_1256
    iso_8859_7 windows_1253
    iso_8859_8 windows_1255
    iso_8859_9 windows_1254
    gb
);
	62
# Prints to STDERR the usage text for the general options that every
# plugin accepts. $plugin_name is interpolated into the first line.
sub print_general_usage {
    my ($plugin_name) = @_;

    # The message is kept as a table of chunks; each chunk is written
    # to STDERR unchanged and in order.
    my @usage_chunks = (
        "\n usage: plugin $plugin_name [options]\n\n",

        " -process_exp A perl regular expression to match against filenames.\n",
        " Matching filenames will be processed by this plugin.\n",
        " Each plugin has its own default process_exp. e.g HTMLPlug\n",
        " defaults to '(?i)\.html?\$' i.e. all documents ending in\n",
        " .htm or .html (case-insensitive).\n\n",

        " -block_exp Files matching this regular expression will be blocked from\n",
        " being passed to any further plugins in the list. This has no\n",
        " real effect other than to prevent lots of warning messages\n",
        " about input files you don't care about. Each plugin may or may\n",
        " not have a default block_exp. e.g. by default HTMLPlug blocks\n",
        " any files with .gif, .jpg, .jpeg, .png, .rtf or .css\n",
        " file extensions.\n\n",

        " -input_encoding The encoding of the source documents. Documents will be\n",
        " converted from these encodings and stored internally as\n",
        " utf8. The default input_encoding is 'auto'. Accepted values\n",
        " are:\n",

        " auto: Use text categorization algorithm to automatically\n",
        " identify the encoding of each source document. This\n",
        " will be slower than explicitly setting the encoding\n",
        " but will work where more than one encoding is used\n",
        " within the same collection.\n",

        " ascii: Plain 7 bit ascii. This may be a little faster than\n",
        " using iso_8859_1. Beware of using 'ascii' on a collection\n",
        " of documents that may contain characters outside of plain\n",
        " 7 bit ascii though (e.g. German or French documents\n",
        " containing accents), use iso_8859_1 instead.\n",

        " utf8: either utf8 or unicode -- automatically detected\n",
        " unicode: just unicode\n",

        " iso_8859_1: Latin1 (western european languages)\n",
        " windows_1252: Windows codepage 1252 (WinLatin1)\n",

        " iso_8859_2: Latin2 (central and eastern european languages)\n",
        " windows_1250: Windows codepage 1250 (WinLatin2)\n",

        " iso_8859_3: Latin3\n",

        " iso_8859_4: Latin4\n",

        " iso_8859_5: Cyrillic\n",
        " windows_1251: Windows codepage 1251 (WinCyrillic)\n",
        " koi8_r: Cyrillic - Russian\n",
        " koi8_u: Cyrillic - Ukrainian\n",

        " iso_8859_6: Arabic\n",
        " windows_1256: Windows codepage 1256 (WinArabic)\n",

        " iso_8859_7: Greek\n",
        " windows_1253: Windows codepage 1253 (WinGreek)\n",

        " iso_8859_8: Hebrew\n",
        " windows_1255: Windows codepage 1255 (WinHebrew)\n",

        " iso_8859_9: Latin5\n",
        " windows_1254: Windows codepage 1254 (WinTurkish)\n",

        " gb: GB or GBK simplified Chinese\n\n",

        " -default_encoding If -input_encoding is set to 'auto' and the text categorization\n",
        " algorithm fails to extract the encoding or extracts an encoding\n",
        " that is not supported by Greenstone, this encoding will be used\n",
        " instead. The default is iso_8859_1\n\n",

        " -extract_language Identify the language of each document and set 'Language' metadata. Note\n",
        " that this will be done automatically if -input_encoding is 'auto'.\n",
        " -default_language If Greenstone fails to work out what language a document is the\n",
        " 'Language' metadata element will be set to this value. The default\n",
        " is 'en' (ISO 639 language symbols should be used - en = English).\n",
        " Note that if -input_encoding is not set to 'auto' and -extract_language\n",
        " is not set, all documents will have their 'Language' metadata set to\n",
        " this value.\n\n",

        " -extract_acronyms Extract acronyms from within text and set as metadata\n\n",

        " -markup_acronyms Add acronym metadata into document text\n\n",

        " -first Comma seperated list of first sizes to extract from the text\n",
        " into a metadata field. The fields are called 'FirstNNN'.\n\n",

        " -extract_email Extract email addresses as metadata\n\n",

        " -extract_date Extract dates pertaining to the content of documents about history\n\n",
    );

    foreach my $chunk (@usage_chunks) {
        print STDERR $chunk;
    }
}
	157
# Default plugin-specific usage message, written to STDERR.
# Sub-classes that have their own plugin specific options should
# override print_usage to describe them.
sub print_usage {
    my $no_options_message = "\nThis plugin has no plugin specific options\n\n";
    print STDERR $no_options_message;
}
	164
# Constructor. Called as CLASS->new ($plugin_name, @options).
#
# Parses the general options (those shared by every plugin) out of the
# remaining arguments with parsargv::parse and stores them in the new
# object's hash. If parsing fails, a message and the general usage text
# are printed to STDERR and the constructor dies.
#
# Returns the blessed plugin object.
sub new {
    my $class       = shift (@_);
    my $plugin_name = shift (@_);
    my $self        = {};

    # Validation patterns for the two encoding options. The alternation
    # is built with join so that it contains no empty alternative (an
    # empty alternative would let an empty encoding name through), and
    # the keys are sorted so the pattern is the same on every run.
    my $known_encodings = join ("|", sort keys %supported_encodings);
    my $denc = "^(" . $known_encodings . "|utf8|unicode)\$";
    my $enc  = "^(" . $known_encodings . "|utf8|unicode|auto)\$";

    # Default output handle, stored by name; init() may replace it.
    $self->{'outhandle'} = 'STDERR';
    my $year = (localtime)[5]+1900;

    # general options available to all plugins
    # NOTE(review): each spec appears to be "name/regex/default" -- confirm
    # against parsargv.
    if (!parsargv::parse(\@_,
                         q^process_exp/.*/^, \$self->{'process_exp'},
                         q^block_exp/.*/^, \$self->{'block_exp'},
                         qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
                         qq^default_encoding/$denc/iso_8859_1^, \$self->{'default_encoding'},
                         q^extract_acronyms^, \$self->{'extract_acronyms'},
                         q^extract_email^, \$self->{'extract_email'},
                         q^markup_acronyms^, \$self->{'markup_acronyms'},
                         q^extract_language^, \$self->{'extract_language'},
                         q^default_language/.{2}/en^, \$self->{'default_language'},
                         q^first/.*/^, \$self->{'first'},
                         q^extract_date^, \$self->{'date_extract'},
                         qq^maximum_date/\\d{4}/$year^, \$self->{'max_year'},
                         q^no_bibliography^, \$self->{'no_biblio'},
                         qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
                         "allow_extra_options")) {

        print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
        print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
        print_general_usage ($plugin_name);
        die "\n";
    }

    return bless $self, $class;
}
	204
# initialize BasPlug options
# if init() is overridden in a sub-class, remember to call BasPlug::init()
#
# $verbosity is stored on the object unconditionally; $outhandle replaces
# the stored output handle only when it is defined. process_exp and
# block_exp are then filled in from the plugin's defaults unless they were
# explicitly set to a non-empty value.
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle) = @_;

    # verbosity is passed through from the processor
    $self->{'verbosity'} = $verbosity;

    # as is the outhandle ...
    $self->{'outhandle'} = $outhandle if defined $outhandle;

    # Only non-recursive plugins get a default process_exp. The grouping
    # below is explicit; the original relied on || binding tighter than
    # "and" to get the same result.
    my $process_exp_unset =
        (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "");

    if ((!$self->is_recursive()) && $process_exp_unset) {
        $self->{'process_exp'} = $self->get_default_process_exp ();
        if ($self->{'process_exp'} eq "") {
            warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
        }
    }

    if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
        $self->{'block_exp'} = $self->get_default_block_exp ();
    }
}
	233
# Hook invoked with ($pluginfo, $base_dir, $processor, $maxdocs). The
# base class ignores those arguments and only initialises the metadata
# extractors.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    return $self->initialise_extractors();
}
	239
	240	sub end {
	241	my ($self) = @_;
[1396]	242	$self->finalise_extractors();
[839]	243	}
	244
# Tells callers whether this plugin is recursive. The base class
# answers 0; recursive plugins should override this to return 1.
sub is_recursive {
    my ($self) = @_;

    my $recursive = 0;
    return $recursive;
}
	252
# Default block_exp used by init() when none was set explicitly.
# The base class has no default and returns the empty string.
sub get_default_block_exp {
    my ($self) = @_;

    my $no_default = "";
    return $no_default;
}
	258
	259	sub get_default_process_exp {
	260	my $self = shift (@_);
	261
	262	return "";
	263	}
	264
	265	# The BasPlug read() function. This function does all the right things
	266	# to make general options work for a given plugin. It calls the process()
	267	# function which does all the work specific to a plugin (like the old
	268	# read functions used to do). Most plugins should define their own
	269	# process() function and let this read() function keep control.
	270	#
[1244]	271	# recursive plugins (e.g. RecPlug) and specialized plugins like those
	272	# capable of processing many documents within a single file (e.g.
	273	# GMLPlug) should normally implement their own version of read()
	274	#
[1242]	275	# Return number of files processed, undef if can't process
[4]	276	# Note that $base_dir might be "" and that $file might
	277	# include directories
[1242]	278
[4]	279	sub read {
	280	my $self = shift (@_);
[317]	281	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
[4]	282
[1242]	283	if ($self->is_recursive()) {
	284	die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
	285	}
[4]	286
[1844]	287	my $outhandle = $self->{'outhandle'};
	288
[1242]	289	my $filename = &util::filename_cat($base_dir, $file);
[1244]	290	return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
[1242]	291	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
	292	return undef;
	293	}
	294	my $plugin_name = ref ($self);
	295	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
[1844]	296
	297	my ($language, $encoding);
	298	if ($self->{'input_encoding'} eq "auto") {
	299	# use textcat to automatically work out the input encoding and language
	300	($language, $encoding) = $self->get_language_encoding ($filename);
	301
	302	} elsif ($self->{'extract_language'}) {
	303	# use textcat to get language metadata
	304	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
	305	$encoding = $self->{'input_encoding'};
	306
	307	if ($extracted_encoding != $encoding && $self->{'verbosity'}) {
	308	print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
	309	print $outhandle "appears to be encoded as $extracted_encoding.";
	310	}
	311
	312	} else {
	313	$language = $self->{'default_language'};
	314	$encoding = $self->{'input_encoding'};
	315	}
	316
[1242]	317	# create a new document
[1379]	318	my $doc_obj = new doc ($filename, "indexed_doc");
[1844]	319	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
	320	$doc_obj->set_source_encoding ($encoding);
[1242]	321
[1844]	322
[1242]	323	# read in file ($text will be in utf8)
	324	my $text = "";
[1844]	325	$self->read_file ($filename, $encoding, \$text);
[1242]	326
[1844]	327	if (!length ($text)) {
[1424]	328	print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
[1242]	329	return 0;
	330	}
	331
	332	# include any metadata passed in from previous plugins
	333	# note that this metadata is associated with the top level section
	334	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
	335
	336	# do plugin specific processing of doc_obj
[1244]	337	return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
[1242]	338
	339	# do any automatic metadata extraction
	340	$self->auto_extract_metadata ($doc_obj);
	341
	342	# add an OID
	343	$doc_obj->set_OID();
	344
	345	# process the document
	346	$processor->process($doc_obj);
	347
	348	return 1; # processed the file
[4]	349	}
	350
# Plugin-specific processing hook called by read() with
# ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj).
# An implementation returns undef if the file is rejected by the plugin.
# The base class has no implementation and always dies.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    die "Basplug::process function must be implemented in sub-class\n";

    return undef; # never gets here
}
	360
# uses the multiread package to read in the entire file pointed to
# by filename and loads the resulting text into $$textref. Input text
# may be in any of the encodings handled by multiread, output text
# will be in utf8
#
# Arguments: ($filename, $encoding, $textref)
#  - "ascii" files are slurped directly without going through multiread
#  - "gb" text is additionally passed through cnseg::segment
#
# If $filename is not readable a message is printed (when verbosity is
# set) and $$textref is left untouched. Dies if the file cannot be opened.
sub read_file {
    my $self = shift (@_);
    my ($filename, $encoding, $textref) = @_;

    if (!-r $filename) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
        return;
    }

    $$textref = "";

    # The package filehandle is kept because multiread is given the handle
    # by name below. The three-argument open stops characters in $filename
    # from being interpreted as an open mode.
    open (FILE, '<', $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";

    if ($encoding eq "ascii") {
        # slurp the whole file; local restores the caller's record
        # separator automatically instead of forcing it to "\n"
        local $/ = undef;
        $$textref = <FILE>;
    } else {
        my $reader = multiread->new ();
        $reader->set_handle ('BasPlug::FILE');
        $reader->set_encoding ($encoding);
        $reader->read_file ($textref);

        if ($encoding eq "gb") {
            # segment the Chinese words
            $$textref = cnseg::segment ($$textref);
        }
    }

    close FILE;
}
	398
# Uses textcat to work out the encoding and language of the text in
# $filename. All html tags are removed before processing.
# returns an array containing "language" and "encoding"
#
# When textcat does not return exactly one result the defaults are
# returned: (default_language, input_encoding) if -input_encoding is not
# 'auto', otherwise (default_language, default_encoding).
# Dies if the file cannot be opened or if the language reported by
# textcat has no entry in %iso639::toiso639.
sub get_language_encoding {
    my $self = shift (@_);
    my ($filename) = @_;
    my $outhandle = $self->{'outhandle'};

    # read in file (slurped; local keeps the caller's record separator intact)
    open (my $fh, '<', $filename)
        or die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
    my $text = do { local $/ = undef; <$fh> };
    close ($fh);

    # remove all HTML tags
    $text =~ s/<[^>]*>//sg;

    # get the language/encoding
    my @results = textcat::classify($text);

    if (scalar @results != 1) {
        if ($self->{'input_encoding'} ne 'auto') {
            if ($self->{'extract_language'} && $self->{'verbosity'}) {
                print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
                print $outhandle "defaulting to $self->{'default_language'}\n";
            }
            return ($self->{'default_language'}, $self->{'input_encoding'});

        } else {
            if ($self->{'verbosity'}) {
                print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
                print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
            }
            return ($self->{'default_language'}, $self->{'default_encoding'});
        }
    }

    # format language/encoding
    # The result is split at the first '-' into "language" and an optional
    # "encoding". Both captures need quantifiers; without them only
    # one-character names could ever match.
    my ($language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/;
    $language = $iso639::toiso639{lc($language)};
    die "Invalid language\n" if !defined $language;

    if (!defined $encoding) {
        # if textcat returned no encoding info it is assumed to be iso_8859_1
        $encoding = "iso_8859_1";
    } else {
        # convert to the format we expect
        $encoding =~ s/windows/windows_/;
        $encoding =~ s/iso8859/iso_8859/;
        $encoding =~ s/^gb.*$/gb/;
    }

    if (!defined $supported_encodings{$encoding}) {
        if ($self->{'verbosity'}) {
            print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
            print $outhandle "using $self->{'default_encoding'}\n";
        }
        $encoding = $self->{'default_encoding'};
    }

    return ($language, $encoding);
}
	466
# Attach to $cursection of $doc_obj the extra metadata that has been
# passed around from one plugin to another.
# $metadata is a hash reference; a value may be a single scalar or an
# array reference holding several values for the same field.
# Values are added with add_utf8_metadata, so they are expected to
# already be in utf8.
sub extra_metadata {
    my $self = shift (@_);
    my ($doc_obj, $cursection, $metadata) = @_;

    foreach my $field (keys(%$metadata)) {
        my $value = $metadata->{$field};

        # a plain value is stored directly
        if (ref ($value) ne "ARRAY") {
            $doc_obj->add_utf8_metadata ($cursection, $field, $value);
            next;
        }

        # an array reference contributes one entry per element
        foreach my $element (@$value) {
            $doc_obj->add_utf8_metadata ($cursection, $field, $element);
        }
    }
}
	486
# initialise metadata extractors
# The acronym extractor is initialised only when at least one of the
# -extract_acronyms / -markup_acronyms options is set.
sub initialise_extractors {
    my $self = shift (@_);

    if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
        acronym::initialise_acronyms ();
    }
}
	495
	496	# finalise metadata extractors
	497	sub finalise_extractors {
	498	my $self = shift (@_);
	499
	500	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
	501	&acronym::finalise_acronyms();
	502	}
	503	}
	504
# FIRSTNNN: extract the first NNN characters as metadata
#
# For every size in the comma separated -first option a "First<size>"
# metadata item is added to $thissection. The text is trimmed and runs
# of whitespace are collapsed to one space before it is cut. When the
# text had to be cut, the trailing partial word is replaced by an
# ellipsis; text that already fits is stored unchanged.
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    # the normalised text does not depend on the size, so build it once
    my $normalised = $$textref;
    $normalised =~ s/^\s+//;
    $normalised =~ s/\s+$//;
    $normalised =~ s/\s+/ /gs;

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = substr ($normalised, 0, $size);

        # only mark a cut when something was actually cut off
        # NOTE(review): this listing shows a literal ellipsis character as
        # the replacement -- confirm against the repository copy that it
        # is not an HTML entity rendered by the source browser.
        if (length ($tmptext) < length ($normalised)) {
            $tmptext =~ s/\s\S*$/…/;
        }

        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
	520
	521	sub extract_email {
	522	my $self = shift (@_);
	523	my ($textref, $doc_obj, $thissection) = @_;
	524	my $outhandle = $self->{'outhandle'};
	525
	526	print $outhandle " extracting email addresses ...\n"
[1844]	527	if ($self->{'verbosity'} > 2);
[1602]	528
	529	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|[a-z][a-z]))/g);
	530	@email = sort @email;
	531
	532	my @email2 = ();
	533	foreach my $address (@email) {
	534	if (!(join(" ",@email2) =~ m/$address/ )) {
	535	push @email2, $address;
	536	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
	537	print $outhandle " extracting $address\n"
[1844]	538	if ($self->{'verbosity'} > 3);
[1602]	539	}
	540	}
	541	print $outhandle " done extracting email addresses.\n"
[1844]	542	if ($self->{'verbosity'} > 2);
[1602]	543
	544	}
	545
	546	# extract metadata
[1242]	547	sub auto_extract_metadata {
	548	my $self = shift (@_);
	549	my ($doc_obj) = @_;
[1602]	550
	551	if ($self->{'extract_email'}) {
	552	my $thissection = $doc_obj->get_top_section();
	553	while (defined $thissection) {
	554	my $text = $doc_obj->get_text($thissection);
	555	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
	556	$thissection = $doc_obj->get_next_section ($thissection);
	557	}
	558	}
	559	if ($self->{'first'}) {
	560	my $thissection = $doc_obj->get_top_section();
	561	while (defined $thissection) {
	562	my $text = $doc_obj->get_text($thissection);
	563	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
	564	$thissection = $doc_obj->get_next_section ($thissection);
	565	}
	566	}
	567
[1242]	568	if ($self->{'extract_acronyms'}) {
	569	my $thissection = $doc_obj->get_top_section();
	570	while (defined $thissection) {
	571	my $text = $doc_obj->get_text($thissection);
	572	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
	573	$thissection = $doc_obj->get_next_section ($thissection);
	574	}
	575	}
[1602]	576
[1393]	577	if ($self->{'markup_acronyms'}) {
	578	my $thissection = $doc_obj->get_top_section();
	579	while (defined $thissection) {
	580	my $text = $doc_obj->get_text($thissection);
	581	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
	582	$doc_obj->delete_text($thissection);
	583	$doc_obj->add_text($thissection, $text);
	584	$thissection = $doc_obj->get_next_section ($thissection);
	585	}
	586	}
	587
[1411]	588	if($self->{'date_extract'}) {
	589	my $thissection = $doc_obj->get_top_section();
	590	while (defined $thissection) {
	591
	592	my $text = $doc_obj->get_text($thissection);
	593	&DateExtract::get_date_metadata($text, $doc_obj,
	594	$thissection,
	595	$self->{'no_biblio'},
	596	$self->{'max_year'},
	597	$self->{'max_century'});
	598	$thissection = $doc_obj->get_next_section ($thissection);
	599	}
	600	}
	601
[1317]	602	if ($self->{'extract_language'}) {
	603	my $thissection = $doc_obj->get_top_section();
	604	while (defined $thissection) {
	605	my $text = $doc_obj->get_text($thissection);
	606	$self->extract_language (\$text, $doc_obj, $thissection) if $text =~ /./;
	607	$thissection = $doc_obj->get_next_section ($thissection);
	608	}
	609	}
	610
[1242]	611	}
	612
# Extract acronyms from one section of a document. Each acronym found
# by acronym::acronyms that is not already present in the section's
# "Acronym" metadata is written to file and added as "Acronym"
# metadata. Progress is reported to the outhandle according to the
# verbosity.
sub extract_acronyms {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} > 2) {
        print $outhandle " extracting acronyms ...\n";
    }

    my $found = &acronym::acronyms($textref);

    foreach my $acro (@$found) {

        # has this acronym been recorded for the section already?
        my $already_recorded = 0;
        my $existing = $doc_obj->get_metadata($thissection, "Acronym");
        foreach my $recorded (@$existing) {
            next unless $recorded eq $acro->to_string();
            $already_recorded = 1;
            if ($self->{'verbosity'} >= 4) {
                print $outhandle " already seen ". $acro->to_string() . "\n";
            }
        }
        next if $already_recorded;

        # write it to the file ...
        $acro->write_to_file();

        # ... and record it as ordinary metadata
        $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
        if ($self->{'verbosity'} > 3) {
            print $outhandle " adding ". $acro->to_string() . "\n";
        }
    }

    if ($self->{'verbosity'} > 2) {
        print $outhandle " done extracting acronyms. \n";
    }
}
	654
# Passes $text through acronym::markup_acronyms and returns the result.
# $doc_obj and $thissection are accepted but not used here. Start and
# finish messages go to the outhandle when verbosity is above 2.
sub markup_acronyms {
    my ($self, $text, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} > 2) {
        print $outhandle " marking up acronyms ...\n";
    }

    # self is passed in to check for verbosity ...
    my $marked_up = &acronym::markup_acronyms($text, $self);

    if ($self->{'verbosity'} > 2) {
        print $outhandle " done marking up acronyms. \n";
    }

    return $marked_up;
}
	671
[4]	672	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: