Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 2785

Last change on this file since 2785 was 2785, checked in by sjboddie, 23 years ago

The build process now creates a summary of how many files were included,
which were rejected, etc. A link to a page containing this summary is
provided from the final page of the collector (once the collection is built
successfully) and from the default "about this collection" text for
collections built by the collector.

Also did a little bit of tidying in a couple of places

Property svn:keywords set to Author Date Id Revision

File size: 24.4 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	# suppress the annoying "subroutine redefined" warning that various
29	# plugins cause under perl 5.6
30	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
31
32	use Kea;
33	use parsargv;
34	use multiread;
35	use encodings;
36	use cnseg;
37	use acronym;
38	use textcat;
39	use doc;
40	use diagnostics;
41	use DateExtract;
42	use ghtml;
43
44	sub print_general_usage {
45	my ($plugin_name) = @_;
46
47	print STDERR "\n usage: plugin $plugin_name [options]\n\n";
48
49	print STDERR " -process_exp A perl regular expression to match against filenames.\n";
50	print STDERR " Matching filenames will be processed by this plugin.\n";
51	print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n";
52	print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
53	print STDERR " .htm or .html (case-insensitive).\n\n";
54
55	print STDERR " -block_exp Files matching this regular expression will be blocked from\n";
56	print STDERR " being passed to any later plugins in the list. This has no\n";
57	print STDERR " real effect other than to prevent lots of warning messages\n";
58	print STDERR " about input files you don't care about. Each plugin might\n";
59	print STDERR " have a default block_exp. e.g. by default HTMLPlug blocks\n";
60	print STDERR " any files with .gif, .jpg, .jpeg, .png or .css\n";
61	print STDERR " file extensions.\n\n";
62
63
64	print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
65	print STDERR " converted from these encodings and stored internally as\n";
66	print STDERR " utf8. The default input_encoding is 'auto'. Accepted values\n";
67	print STDERR " are:\n";
68
69	print STDERR " auto: Use text categorization algorithm to automatically\n";
70	print STDERR " identify the encoding of each source document. This\n";
71	print STDERR " will be slower than explicitly setting the encoding\n";
72	print STDERR " but will work where more than one encoding is used\n";
73	print STDERR " within the same collection.\n";
74
75	print STDERR " ascii: Plain 7 bit ascii. This may be a bit faster than\n";
76	print STDERR " using iso_8859_1. Beware of using this on a collection\n";
77	print STDERR " of documents that may contain characters outside the\n";
78	print STDERR " plain 7 bit ascii set though (e.g. German or French\n";
79	print STDERR " documents containing accents), use iso_8859_1 instead.\n";
80
81	print STDERR " utf8: either utf8 or unicode -- automatically detected\n";
82	print STDERR " unicode: just unicode\n";
83
84	my $e = $encodings::encodings;
85	foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
86	print STDERR " $enc: $e->{$enc}->{'name'}\n";
87	}
88	print STDERR "\n";
89	print STDERR " -default_encoding Use this encoding if -input_encoding is set to 'auto' and\n";
90	print STDERR " the text categorization algorithm fails to extract the\n";
91	print STDERR " encoding or extracts an encoding unsupported by Greenstone.\n";
92	print STDERR " The default is iso_8859_1.\n\n";
93
94	print STDERR " -extract_language Identify the language of each document and set 'Language'\n";
95	print STDERR " metadata. Note that this will be done automatically if\n";
96	print STDERR " -input_encoding is 'auto'.\n\n";
97	print STDERR " -default_language If Greenstone fails to work out what language a document is\n";
98	print STDERR " the 'Language' metadata element will be set to this value.\n";
99	print STDERR " The default is 'en' (ISO 639 language symbols are used:\n";
100	print STDERR " en = English). Note that if -input_encoding is not set to\n";
101	print STDERR " 'auto' and -extract_language is not set, all documents will\n";
102	print STDERR " have their 'Language' metadata set to this value.\n\n";
103
104	print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n";
105
106	print STDERR " -markup_acronyms Add acronym metadata into document text\n\n";
107
108	print STDERR " -first Comma separated list of first sizes to extract from the\n";
109	print STDERR " text into a metadata field. The field is called 'FirstNNN'.\n\n";
110
111	print STDERR " -extract_email Extract email addresses as metadata\n\n";
112
113	print STDERR " -extract_date Extract dates pertaining to the content of documents about\n";
114	print STDERR " history\n";
115	print STDERR " -maximum_date The maximum historical date to be used as metadata (in a\n";
116	print STDERR " Common Era date, such as 1950)\n";
117	print STDERR " -maximum_century The maximum named century to be extracted as historical\n";
118	print STDERR " metadata (e.g. 14 will extract all references up to the\n";
119	print STDERR " 14th century)\n";
120	print STDERR " -no_bibliography Do not try and block bibliographic dates when extracting\n";
121	print STDERR " historical dates.\n\n";
122	}
123
124	# print_usage should be overridden for any sub-classes having
125	# their own plugin specific options
126	sub print_usage {
127	print STDERR "\nThis plugin has no plugin specific options\n\n";
128	}
129
130	sub new {
131	my $class = shift (@_);
132	my $plugin_name = shift (@_);
133	my $self = {};
134
135	my $enc = "^(";
136	map {$enc .= "$_\|";} keys %$encodings::encodings;
137	my $denc = $enc . "ascii\|utf8\|unicode)\$";
138	$enc .= "ascii\|utf8\|unicode\|auto)\$";
139
140	$self->{'outhandle'} = STDERR;
141	my $year = (localtime)[5]+1900;
142
143	$self->{'textcat'} = new textcat();
144
145	$self->{'num_processed'} = 0;
146	$self->{'num_not_processed'} = 0;
147	$self->{'num_blocked'} = 0;
148	$self->{'num_archives'} = 0;
149
150	# general options available to all plugins
151	if (!parsargv::parse(\@_,
152	q^process_exp/.*/^, \$self->{'process_exp'},
153	q^block_exp/.*/^, \$self->{'block_exp'},
154	q^extract_acronyms^, \$self->{'extract_acronyms'},
155	q^extract_keyphrases^, \$self->{'kea'}, #with extra options
156	q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options
157	qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
158	qq^default_encoding/$denc/iso_8859_1^, \$self->{'default_encoding'},
159	q^extract_email^, \$self->{'extract_email'},
160	q^markup_acronyms^, \$self->{'markup_acronyms'},
161	q^default_language/.{2}/en^, \$self->{'default_language'},
162	q^first/.*/^, \$self->{'first'},
163	q^extract_date^, \$self->{'date_extract'},
164	qq^maximum_date/\\d{4}/$year^, \$self->{'max_year'},
165	q^no_bibliography^, \$self->{'no_biblio'},
166	qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
167	"allow_extra_options")) {
168
169	print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
170	print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
171	&print_general_usage($plugin_name);
172	die "\n";
173	}
174
175	return bless $self, $class;
176	}
177
178	# initialize BasPlug options
179	# if init() is overridden in a sub-class, remember to call BasPlug::init()
180	sub init {
181	my $self = shift (@_);
182	my ($verbosity, $outhandle, $failhandle) = @_;
183
184	# verbosity is passed through from the processor
185	$self->{'verbosity'} = $verbosity;
186
187	# as are the outhandle and failhandle
188	$self->{'outhandle'} = $outhandle if defined $outhandle;
189	$self->{'failhandle'} = $failhandle;
190
191	# set process_exp and block_exp to defaults unless they were
192	# explicitly set
193
194	if ((!$self->is_recursive()) and
195	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
196
197	$self->{'process_exp'} = $self->get_default_process_exp ();
198	if ($self->{'process_exp'} eq "") {
199	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
200	}
201	}
202
203	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
204	$self->{'block_exp'} = $self->get_default_block_exp ();
205	}
206	}
207
208	sub begin {
209	my $self = shift (@_);
210	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
211	$self->initialise_extractors();
212	}
213
214	sub end {
215	my ($self) = @_;
216	$self->finalise_extractors();
217	}
218
219	# this function should be overridden to return 1
220	# in recursive plugins
221	sub is_recursive {
222	my $self = shift (@_);
223
224	return 0;
225	}
226
227	sub get_default_block_exp {
228	my $self = shift (@_);
229
230	return "";
231	}
232
233	sub get_default_process_exp {
234	my $self = shift (@_);
235
236	return "";
237	}
238
239	# The BasPlug read() function. This function does all the right things
240	# to make general options work for a given plugin. It calls the process()
241	# function which does all the work specific to a plugin (like the old
242	# read functions used to do). Most plugins should define their own
243	# process() function and let this read() function keep control.
244	#
245	# recursive plugins (e.g. RecPlug) and specialized plugins like those
246	# capable of processing many documents within a single file (e.g.
247	# GMLPlug) should normally implement their own version of read()
248	#
249	# Return number of files processed, undef if can't process
250	# Note that $base_dir might be "" and that $file might
251	# include directories
252
253	sub read {
254	my $self = shift (@_);
255
256	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
257
258	if ($self->is_recursive()) {
259	die "BasPlug::read function must be implemented in sub-class for recursive plugins\n";
260	}
261
262	my $outhandle = $self->{'outhandle'};
263
264	my $filename = &util::filename_cat($base_dir, $file);
265	if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
266	$self->{'num_blocked'} ++;
267	return 0;
268	}
269	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
270	return undef;
271	}
272	my $plugin_name = ref ($self);
273	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
274
275	my ($language, $encoding);
276	if ($self->{'input_encoding'} eq "auto") {
277	# use textcat to automatically work out the input encoding and language
278	($language, $encoding) = $self->get_language_encoding ($filename);
279
280	} elsif ($self->{'extract_language'}) {
281	# use textcat to get language metadata
282	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
283	$encoding = $self->{'input_encoding'};
284
285	if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
286	print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
287	print $outhandle "appears to be encoded as $extracted_encoding.\n";
288	}
289
290	} else {
291	$language = $self->{'default_language'};
292	$encoding = $self->{'input_encoding'};
293	}
294
295	# create a new document
296	my $doc_obj = new doc ($filename, "indexed_doc");
297	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
298	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
299	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
300	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($file));
301
302	# read in file ($text will be in utf8)
303	my $text = "";
304	$self->read_file ($filename, $encoding, $language, \$text);
305
306	if (!length ($text)) {
307	print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
308
309	my $failhandle = $self->{'failhandle'};
310	print $failhandle "$file: " . ref($self) . ": file contains no text\n";
311	$self->{'num_not_processed'} ++;
312
313	return 0;
314	}
315
316	# include any metadata passed in from previous plugins
317	# note that this metadata is associated with the top level section
318	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
319
320	# do plugin specific processing of doc_obj
321	return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
322
323	# do any automatic metadata extraction
324	$self->auto_extract_metadata ($doc_obj);
325
326	# add an OID
327	$doc_obj->set_OID();
328
329	# process the document
330	$processor->process($doc_obj);
331
332	$self->{'num_processed'} ++;
333
334	return 1; # processed the file
335	}
336
337	# returns undef if file is rejected by the plugin
338	sub process {
339	my $self = shift (@_);
340	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
341
342	die "Basplug::process function must be implemented in sub-class\n";
343
344	return undef; # never gets here
345	}
346
347	# uses the multiread package to read in the entire file pointed to
348	# by filename and loads the resulting text into $$textref. Input text
349	# may be in any of the encodings handled by multiread, output text
350	# will be in utf8
351	sub read_file {
352	my $self = shift (@_);
353	my ($filename, $encoding, $language, $textref) = @_;
354
355	if (!-r $filename)
356	{
357	my $outhandle = $self->{'outhandle'};
358	print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
359	return;
360	}
361
362	$$textref = "";
363
364	open (FILE, $filename) \|\| die "BasPlug::read_file could not open $filename for reading ($!)\n";
365
366	if ($encoding eq "ascii") {
367	undef $/;
368	$$textref = <FILE>;
369	$/ = "\n";
370	} else {
371	my $reader = new multiread();
372	$reader->set_handle ('BasPlug::FILE');
373	$reader->set_encoding ($encoding);
374	$reader->read_file ($textref);
375
376	if ($language eq "zh") {
377	# segment the Chinese words
378	$$textref = &cnseg::segment($$textref);
379	}
380	}
381
382	close FILE;
383	}
384
385	# Uses textcat to work out the encoding and language of the text in
386	# $filename. All html tags are removed before processing.
387	# returns an array containing "language" and "encoding"
388	sub get_language_encoding {
389	my $self = shift (@_);
390	my ($filename) = @_;
391	my $outhandle = $self->{'outhandle'};
392
393	# read in file
394	open (FILE, $filename) \|\| die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
395	undef $/;
396	my $text = <FILE>;
397	$/ = "\n";
398	close FILE;
399
400	# remove <title>stuff</title> -- as titles tend often to be in English
401	# for foreign language documents
402	$text =~ s/<title>.*?<\/title>//i;
403
404	# remove all HTML tags
405	$text =~ s/<[^>]*>//sg;
406
407	# get the language/encoding
408	my $results = $self->{'textcat'}->classify(\$text);
409
410	# if textcat returns 3 or less possibilities we'll use the
411	# first one in the list - otherwise use the defaults
412	if (scalar @$results > 3) {
413
414	if ($self->{'input_encoding'} ne 'auto') {
415	if ($self->{'extract_language'} && $self->{'verbosity'}) {
416	print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
417	print $outhandle "defaulting to $self->{'default_language'}\n";
418	}
419	return ($self->{'default_language'}, $self->{'input_encoding'});
420
421	} else {
422	if ($self->{'verbosity'}) {
423	print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - ";
424	print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n";
425	}
426	return ($self->{'default_language'}, $self->{'default_encoding'});
427	}
428	}
429
430	# format language/encoding
431	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
432	if (!defined $language) {
433	if ($self->{'verbosity'}) {
434	print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
435	print $outhandle "defaulting to $self->{'default_language'}\n";
436	}
437	$language = $self->{'default_language'};
438	}
439	if (!defined $encoding) {
440	if ($self->{'verbosity'}) {
441	print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
442	print $outhandle "defaulting to $self->{'default_encoding'}\n";
443	}
444	$encoding = $self->{'default_encoding'};
445	}
446
447	if ($encoding !~ /^(ascii\|utf8\|unicode)$/ &&
448	!defined $encodings::encodings->{$encoding}) {
449	if ($self->{'verbosity'}) {
450	print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
451	print $outhandle "using $self->{'default_encoding'}\n";
452	}
453	$encoding = $self->{'default_encoding'};
454	}
455
456	return ($language, $encoding);
457	}
458
459	# add any extra metadata that's been passed around from one
460	# plugin to another.
461	# extra_metadata uses add_utf8_metadata so it expects metadata values
462	# to already be in utf8
463	sub extra_metadata {
464	my $self = shift (@_);
465	my ($doc_obj, $cursection, $metadata) = @_;
466
467	foreach my $field (keys(%$metadata)) {
468	# $metadata->{$field} may be an array reference
469	if (ref ($metadata->{$field}) eq "ARRAY") {
470	map {
471	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
472	} @{$metadata->{$field}};
473	} else {
474	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
475	}
476	}
477	}
478
479	# initialise metadata extractors
480	sub initialise_extractors {
481	my $self = shift (@_);
482
483	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
484	&acronym::initialise_acronyms();
485	}
486	}
487
488	# finalise metadata extractors
489	sub finalise_extractors {
490	my $self = shift (@_);
491
492	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
493	&acronym::finalise_acronyms();
494	}
495	}
496
497	# FIRSTNNN: extract the first NNN characters as metadata
498	sub extract_first_NNNN_characters {
499	my $self = shift (@_);
500	my ($textref, $doc_obj, $thissection) = @_;
501
502	foreach my $size (split /,/, $self->{'first'}) {
503	my $tmptext = $$textref;
504	$tmptext =~ s/^\s+//;
505	$tmptext =~ s/\s+$//;
506	$tmptext =~ s/\s+/ /gs;
507	$tmptext = substr ($tmptext, 0, $size);
508	$tmptext =~ s/\s\S*$/…/;
509	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
510	}
511	}
512
513	sub extract_email {
514	my $self = shift (@_);
515	my ($textref, $doc_obj, $thissection) = @_;
516	my $outhandle = $self->{'outhandle'};
517
518	print $outhandle " extracting email addresses ...\n"
519	if ($self->{'verbosity'} > 2);
520
521	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
522	@email = sort @email;
523
524	my @email2 = ();
525	foreach my $address (@email) {
526	if (!(join(" ",@email2) =~ m/$address/ )) {
527	push @email2, $address;
528	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
529	print $outhandle " extracting $address\n"
530	if ($self->{'verbosity'} > 3);
531	}
532	}
533	print $outhandle " done extracting email addresses.\n"
534	if ($self->{'verbosity'} > 2);
535
536	}
537
538	# extract metadata
539	sub auto_extract_metadata {
540
541
542	my $self = shift (@_);
543	my ($doc_obj) = @_;
544
545	if ($self->{'extract_email'}) {
546	my $thissection = $doc_obj->get_top_section();
547	while (defined $thissection) {
548	my $text = $doc_obj->get_text($thissection);
549	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
550	$thissection = $doc_obj->get_next_section ($thissection);
551	}
552	}
553
554
555	#adding kea keyphrases
556	if ($self->{'kea'}) {
557
558	my $thissection = $doc_obj->get_top_section();
559	my $text = "";
560	my @list;
561
562	while (defined $thissection) { #loop through sections to gather whole doc
563	my $sectiontext = $doc_obj->get_text($thissection);
564	$text = $text.$sectiontext;
565	$thissection = $doc_obj->get_next_section ($thissection);
566	}
567
568	if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
569	@list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
570	} else { #otherwise call Kea with no options
571	@list = &Kea::extract_KeyPhrases ($text);
572	}
573
574	if(@list){ #if a list of kea keyphrases was returned (ie not empty)
575	my $keyphrases = $list[0]; #first arg is keyphrase list
576	my $stems = $list[1]; #second arg is stemmed keyphrase list
577	print STDERR "keyphrases: $keyphrases\n";
578	print STDERR "stems: $stems\n";
579	$thissection = $doc_obj->get_top_section(); #add metadata to top section
580	$doc_obj->add_metadata($thissection, "kea", $keyphrases);
581	$doc_obj->add_metadata($thissection, "stems", $stems);
582	}
583	} #end of kea
584
585	if ($self->{'first'}) {
586	my $thissection = $doc_obj->get_top_section();
587	while (defined $thissection) {
588	my $text = $doc_obj->get_text($thissection);
589	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
590	$thissection = $doc_obj->get_next_section ($thissection);
591	}
592	}
593
594	if ($self->{'extract_acronyms'}) {
595	my $thissection = $doc_obj->get_top_section();
596	while (defined $thissection) {
597	my $text = $doc_obj->get_text($thissection);
598	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
599	$thissection = $doc_obj->get_next_section ($thissection);
600	}
601	}
602
603	if ($self->{'markup_acronyms'}) {
604	my $thissection = $doc_obj->get_top_section();
605	while (defined $thissection) {
606	my $text = $doc_obj->get_text($thissection);
607	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
608	$doc_obj->delete_text($thissection);
609	$doc_obj->add_text($thissection, $text);
610	$thissection = $doc_obj->get_next_section ($thissection);
611	}
612	}
613
614	if($self->{'date_extract'}) {
615	my $thissection = $doc_obj->get_top_section();
616	while (defined $thissection) {
617
618	my $text = $doc_obj->get_text($thissection);
619	&DateExtract::get_date_metadata($text, $doc_obj,
620	$thissection,
621	$self->{'no_biblio'},
622	$self->{'max_year'},
623	$self->{'max_century'});
624	$thissection = $doc_obj->get_next_section ($thissection);
625	}
626	}
627	}
628
629	# extract acronyms from a section in a document. progress is
630	# reported to outhandle based on the verbosity. both the Acronym
631	# and the AcronymKWIC metadata items are created.
632
633	sub extract_acronyms {
634	my $self = shift (@_);
635	my ($textref, $doc_obj, $thissection) = @_;
636	my $outhandle = $self->{'outhandle'};
637
638	print $outhandle " extracting acronyms ...\n"
639	if ($self->{'verbosity'} > 2);
640
641	my $acro_array = &acronym::acronyms($textref);
642
643	foreach my $acro (@$acro_array) {
644
645	#check that this is the first time ...
646	my $seen_before = "false";
647	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
648	foreach my $thisAcro (@$previous_data) {
649	if ($thisAcro eq $acro->to_string()) {
650	$seen_before = "true";
651	print $outhandle " already seen ". $acro->to_string() . "\n"
652	if ($self->{'verbosity'} >= 4);
653	}
654	}
655
656	if ($seen_before eq "false") {
657	#write it to the file ...
658	$acro->write_to_file();
659
660	#do the normal acronym
661	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
662	print $outhandle " adding ". $acro->to_string() . "\n"
663	if ($self->{'verbosity'} > 3);
664
665	}
666	}
667	print $outhandle " done extracting acronyms. \n"
668	if ($self->{'verbosity'} > 2);
669	}
670
671	sub markup_acronyms {
672	my $self = shift (@_);
673	my ($text, $doc_obj, $thissection) = @_;
674	my $outhandle = $self->{'outhandle'};
675
676	print $outhandle " marking up acronyms ...\n"
677	if ($self->{'verbosity'} > 2);
678
679	#self is passed in to check for verbosity ...
680	$text = &acronym::markup_acronyms($text, $self);
681
682	print $outhandle " done marking up acronyms. \n"
683	if ($self->{'verbosity'} > 2);
684
685	return $text;
686	}
687
688	sub compile_stats {
689	my $self = shift(@_);
690	my ($stats) = @_;
691
692	$stats->{'num_processed'} += $self->{'num_processed'};
693	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
694
695	}
696
697
698	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: