Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 10537

Last change on this file since 10537 was 10478, checked in by kjdon, 19 years ago
arcPlug now knows about keepold, and if it's not set, it won't try to do incremental building
Property svn:keywords set to `Author Date Id Revision`
File size: 42.0 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	BEGIN {
29	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30	}
31
32	eval {require bytes};
33
34	# suppress the annoying "subroutine redefined" warning that various
35	# plugins cause under perl 5.6
36	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38	use strict;
39	no strict 'subs';
40	no strict 'refs'; # allow filehandles to be variables and viceversa
41
42	use File::Basename;
43
44	use Kea;
45	use multiread;
46	use encodings;
47	use cnseg;
48	use acronym;
49	use textcat;
50	use doc;
51	eval "require diagnostics"; # some perl distros (eg mac) don't have this
52	use DateExtract;
53	use ghtml;
54	use gsprintf 'gsprintf';
55	use printusage;
56	use parse2;
57
58
59	use GISBasPlug;
60
61	@BasPlug::ISA = ( GISBasPlug );
62
# Encodings offered by the "default_encoding" option.
# new() appends one entry per key of $encodings::encodings to this list.
my $unicode_list = [
    { 'name' => "ascii",
      'desc' => "{BasPlug.input_encoding.ascii}" },
    { 'name' => "utf8",
      'desc' => "{BasPlug.input_encoding.utf8}" },
    { 'name' => "unicode",
      'desc' => "{BasPlug.input_encoding.unicode}" },
];

# Encodings offered by the "input_encoding" option: "auto" first;
# new() appends the contents of $unicode_list after it.
my $auto_unicode_list = [
    { 'name' => "auto",
      'desc' => "{BasPlug.input_encoding.auto}" },
];
74
# Option specifications understood by every plugin derived from BasPlug.
# Each entry is a hash with 'name', 'desc' (a resource-bundle key), 'type'
# and 'reqd', plus optional 'deft', 'list', 'char_length' and 'hiddengli'.
#
# NOTE(review): "extract_keyphrases", "extract_keyphrase_options" and
# "smart_block" are each declared twice below (the second time with
# 'hiddengli' set). Both declarations are kept exactly as found -- confirm
# which one is intended before removing either.
my $arguments = [
    { 'name' => "process_exp",
      'desc' => "{BasPlug.process_exp}",
      'type' => "regexp",
      'deft' => "",
      'reqd' => "no" },
    { 'name' => "block_exp",
      'desc' => "{BasPlug.block_exp}",
      'type' => "regexp",
      'deft' => "",
      'reqd' => "no" },
    { 'name' => "smart_block",
      'desc' => "{BasPlug.smart_block}",
      'type' => "flag",
      'reqd' => "no" },
    { 'name' => "associate_ext",
      'desc' => "{BasPlug.associate_ext}",
      'type' => "string",
      'reqd' => "no" },
    { 'name' => "input_encoding",
      'desc' => "{BasPlug.input_encoding}",
      'type' => "enum",
      'list' => $auto_unicode_list,
      'reqd' => "no",
      'deft' => "auto" },
    { 'name' => "default_encoding",
      'desc' => "{BasPlug.default_encoding}",
      'type' => "enum",
      'list' => $unicode_list,
      'reqd' => "no",
      'deft' => "utf8" },
    { 'name' => "extract_language",
      'desc' => "{BasPlug.extract_language}",
      'type' => "flag",
      'reqd' => "no" },
    { 'name' => "default_language",
      'desc' => "{BasPlug.default_language}",
      'type' => "string",
      'deft' => "en",
      'reqd' => "no" },
    { 'name' => "extract_acronyms",
      'desc' => "{BasPlug.extract_acronyms}",
      'type' => "flag",
      'reqd' => "no" },
    { 'name' => "markup_acronyms",
      'desc' => "{BasPlug.markup_acronyms}",
      'type' => "flag",
      'reqd' => "no" },
    { 'name' => "extract_keyphrases",
      'desc' => "{BasPlug.extract_keyphrases}",
      'type' => "flag",
      'reqd' => "no" },
    { 'name' => "extract_keyphrase_options",
      'desc' => "{BasPlug.extract_keyphrase_options}",
      'type' => "string",
      'deft' => "",
      'reqd' => "no" },
    { 'name' => "first",
      'desc' => "{BasPlug.first}",
      'type' => "string",
      'reqd' => "no" },
    { 'name' => "extract_email",
      'desc' => "{BasPlug.extract_email}",
      'type' => "flag",
      'reqd' => "no" },
    { 'name' => "extract_historical_years",
      'desc' => "{BasPlug.extract_historical_years}",
      'type' => "flag",
      'reqd' => "no" },
    # the default is the current calendar year, computed when the module loads
    { 'name' => "maximum_year",
      'desc' => "{BasPlug.maximum_year}",
      'type' => "int",
      'deft' => (localtime)[5]+1900,
      'char_length' => "4",
      #'range' => "2,100",
      'reqd' => "no" },
    { 'name' => "maximum_century",
      'desc' => "{BasPlug.maximum_century}",
      'type' => "string",
      'deft' => "-1",
      'reqd' => "no" },
    { 'name' => "no_bibliography",
      'desc' => "{BasPlug.no_bibliography}",
      'type' => "flag",
      'reqd' => "no" },
    { 'name' => "no_cover_image",
      'desc' => "{BasPlug.no_cover_image}",
      'type' => "flag",
      'reqd' => "no" },
    { 'name' => "extract_keyphrases",
      'desc' => "{BasPlug.extract_keyphrases}",
      'type' => "flag",
      'reqd' => "no",
      'hiddengli' => "yes" },
    { 'name' => "extract_keyphrase_options",
      'desc' => "{BasPlug.extract_keyphrase_options}",
      'type' => "string",
      'reqd' => "no",
      'hiddengli' => "yes" },
    { 'name' => "separate_cjk",
      'desc' => "{BasPlug.separate_cjk}",
      'type' => "flag",
      'reqd' => "no",
      'hiddengli' => "yes" },
    { 'name' => "smart_block",
      'desc' => "{BasPlug.smart_block}",
      'type' => "flag",
      'reqd' => "no",
      'hiddengli' => "yes" },
    { 'name' => "new_extract_email",
      'desc' => "",
      'type' => "flag",
      'reqd' => "no",
      'hiddengli' => "yes" },
];
189
# Additional option specifications for placename extraction.
# new() adds these to $arguments when GISBasPlug::has_mapdata() is true.
my $gis_arguments = [
    { 'name' => "extract_placenames",
      'desc' => "{GISBasPlug.extract_placenames}",
      'type' => "flag",
      'reqd' => "no" },
    { 'name' => "gazetteer",
      'desc' => "{GISBasPlug.gazetteer}",
      'type' => "string",
      'reqd' => "no" },
    { 'name' => "place_list",
      'desc' => "{GISBasPlug.place_list}",
      'type' => "flag",
      'reqd' => "no" },
];
203
204
# Description of the BasPlug class itself; new() pushes this record onto
# the caller's "OptList". 'args' shares the $arguments array reference.
my $options = {
    'name'     => "BasPlug",
    'desc'     => "{BasPlug.desc}",
    'abstract' => "yes",
    'inherits' => "no",
    'args'     => $arguments,
};
210
211
# Store the caller-supplied keepold value on the plugin object
# (under the 'keepold' key).
sub set_keepold {
    my ($plugin, $keepold) = @_;

    $plugin->{'keepold'} = $keepold;
}
218
# Return the 'args' array reference of the last record in the plugin's
# option list, or undef when the list is empty.
#
# The list is only inspected. The earlier implementation pop()ed the
# record off the live list, so each call silently shortened
# $self->{'option_list'} and a second call returned a different record.
sub get_arguments
{
    my $self = shift(@_);

    my $optionlistref = $self->{'option_list'};
    return undef unless (defined $optionlistref && scalar(@$optionlistref));

    my $pluginoptions = $optionlistref->[-1];
    return undef unless (defined $pluginoptions);

    return $pluginoptions->{'args'};
}
228
229
# Print the plugin's usage description as XML: switches gsprintf output
# to UTF-8, emits the XML header, then the <PlugInfo> tree via print_xml().
sub print_xml_usage
{
    my ($plugin) = @_;

    # XML output is always in UTF-8
    gsprintf::output_strings_in_UTF8();

    PrintUsage::print_xml_header();
    $plugin->print_xml();
}
240
241
# Print one <PlugInfo> element for the first record of the plugin's option
# list on STDERR, then recurse for the remaining records. Each nested
# <PlugInfo> is written inside the <Arguments> element of its predecessor.
#
# The option list is consumed while recursing and restored afterwards, as
# determine_description_offset() and print_plugin_usage() do; previously
# the list was left empty after the first call.
sub print_xml
{
    my $self = shift(@_);

    my $optionlistref = $self->{'option_list'};
    my @optionlist = @$optionlistref;   # snapshot used to restore the list
    my $pluginoptions = shift(@$optionlistref);
    return if (!defined($pluginoptions));

    gsprintf(STDERR, "<PlugInfo>\n");
    gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
    my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
    $desc =~ s/</&lt;/g; # doubly escaped
    $desc =~ s/>/&gt;/g;

    gsprintf(STDERR, " <Desc>$desc</Desc>\n");
    gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
    gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
    # 'explodes' is optional in the option record; report "no" when absent
    gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
    gsprintf(STDERR, " <Arguments>\n");
    if (defined($pluginoptions->{'args'})) {
        &PrintUsage::print_options_xml($pluginoptions->{'args'});
    }

    # Recurse up the plugin hierarchy, then put the option list back
    $self->print_xml();
    $self->{'option_list'} = \@optionlist;

    gsprintf(STDERR, " </Arguments>\n");
    gsprintf(STDERR, "</PlugInfo>\n");
}
272
273
# Print the plain-text usage message for a plugin (recursively): first work
# out the widest option string over the whole option list, then print the
# usage with this plugin flagged as the leaf class.
sub print_txt_usage
{
    my $plugin = shift(@_);

    my $column = $plugin->determine_description_offset(0);
    $plugin->print_plugin_usage($column, 1);
}
281
282
# Return the larger of $widest and the longest option string found in any
# record of the plugin's option list.
#
# Works recursively: each call removes the first record from the list,
# measures it, recurses for the rest and finally points
# $self->{'option_list'} at a copy of the list taken before the removal.
sub determine_description_offset
{
    my ($plugin, $widest) = @_;

    my $pending = $plugin->{'option_list'};
    my @snapshot = @$pending;
    my $current = shift(@$pending);
    if (!defined($current)) {
        # nothing left to measure
        return $widest;
    }

    # Find the length of the longest option string of this plugin
    my $current_args = $current->{'args'};
    if (defined($current_args)) {
        my $width = &PrintUsage::find_longest_option_string($current_args);
        $widest = $width if ($width > $widest);
    }

    # Recurse up the plugin hierarchy
    my $result = $plugin->determine_description_offset($widest);
    $plugin->{'option_list'} = \@snapshot;
    return $result;
}
307
308
# Print the text usage of the first record in the plugin's option list on
# STDERR, then recurse for the remaining records.
#
#   $descoffset  - width of the longest option string (see
#                  determine_description_offset)
#   $isleafclass - true for the plugin the user asked about; only then are
#                  the description and the usage line printed
#
# The option list is consumed during the recursion and replaced by a copy
# of its previous contents before returning.
sub print_plugin_usage
{
    my ($plugin, $descoffset, $isleafclass) = @_;

    my $pending = $plugin->{'option_list'};
    my @snapshot = @$pending;
    my $current = shift(@$pending);
    return if (!defined($current));

    my ($pluginname, $pluginargs, $plugindesc) = @{$current}{'name', 'args', 'desc'};

    # Produce the usage information using the data structure above
    if ($isleafclass) {
        gsprintf(STDERR, "$plugindesc\n\n") if (defined($plugindesc));
        gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
    }

    # Display the plugin options, if there are some
    if (defined($pluginargs)) {
        if (!$isleafclass) {
            gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
        }
        else {
            gsprintf(STDERR, " {common.specific_options}:\n");
        }

        # descriptions start 2 spaces after the widest option string
        &PrintUsage::print_options_txt($pluginargs, $descoffset + 2);
    }

    # Recurse up the plugin hierarchy
    $plugin->print_plugin_usage($descoffset, 0);
    $plugin->{'option_list'} = \@snapshot;
}
352
353
# BasPlug constructor.
#
#   $pluginlist      - array ref of class names; this class is appended, and
#                      the first entry is used as the plugin's name
#   $args            - the plugin's argument list, handed to parse2::parse
#   $hashArgOptLists - hash ref holding the accumulated "ArgList" (option
#                      specifications) and "OptList" (class descriptions)
#
# Dies (after printing the usage) when the arguments cannot be parsed.
# Returns the blessed plugin object.
sub new {
    # One-time setup of the encoding lists shared by all plugin objects.
    # $auto_unicode_list starts out holding only "auto"; once it has grown
    # the lists are complete. Previously every construction appended the
    # same encodings again.
    if (scalar(@$auto_unicode_list) == 1) {
        my $e = $encodings::encodings;
        foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
        {
            my $hashEncode =
                {'name' => $enc,
                 'desc' => $e->{$enc}->{'name'}};

            push(@{$unicode_list},$hashEncode);
        }

        push(@{$auto_unicode_list},@{$unicode_list});
    }

    # Add the GIS options once, and before $arguments is copied into
    # "ArgList" below. Previously they were appended after the copy (so the
    # first plugin constructed could not parse them) and re-appended on
    # every later construction.
    if (GISBasPlug::has_mapdata()) {
        my $gis_registered = grep { $_->{'name'} eq 'extract_placenames' } @$arguments;
        push(@$arguments,@$gis_arguments) unless ($gis_registered);
    }

    # Start the BasPlug Constructor
    my $class = shift (@_);
    my ($pluginlist,$args,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);
    my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;

    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};

    my $self = {};
    if(!parse2::parse($args,$hashArgOptLists->{"ArgList"},$self))
    {
        my $classTempClass = bless $self, $class;
        &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
        $classTempClass->print_txt_usage(""); # Use default resource bundle
        die "\n";
    }

    # else parsing was successful.

    $self->{'plugin_type'} = $plugin_name;
    $self->{'outhandle'} = STDERR;
    $self->{'textcat'} = textcat->new();
    $self->{'num_processed'} = 0;
    $self->{'num_not_processed'} = 0;
    $self->{'num_blocked'} = 0;
    $self->{'num_archives'} = 0;
    $self->{'cover_image'} = 1; # cover image is on by default
    $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
    $self->{'file_blocks'} = {};
    $self->{'option_list'} = $hashArgOptLists->{"OptList"};

    # -associate_ext is a comma separated list of extensions; keep it as a
    # lookup table as well
    my $associate_ext = $self->{'associate_ext'};
    if ((defined $associate_ext) && ($associate_ext ne "")) {
        my @exts = split(/,/,$associate_ext);

        my %associate_ext_lookup = ();
        foreach my $e (@exts) {
            $associate_ext_lookup{$e} = 1;
        }

        $self->{'associate_ext_lookup'} = \%associate_ext_lookup;
    }

    $self->{'shared_fileroot'} = {};

    if ($self->{'extract_placenames'}) {

        my $outhandle = $self->{'outhandle'};

        my $places_ref
            = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});

        if (!defined $places_ref) {
            # without a gazetteer, placename extraction is switched off
            print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
            print $outhandle " No placename extraction will take place.\n";
            $self->{'extract_placenames'} = undef;
        }
        else {
            $self->{'places'} = $places_ref;
        }
    }
    return bless $self, $class;

}
440
# initialize BasPlug options
# if init() is overridden in a sub-class, remember to call BasPlug::init()
#
#   $verbosity  - stored as given
#   $outhandle  - replaces the current output handle only when defined
#   $failhandle - stored as given
#
# Also fills in process_exp (non-recursive plugins only) and block_exp from
# the plugin's defaults when they are undefined or empty.
sub init {
    my $self = shift (@_);
    my ($verbosity, $outhandle, $failhandle) = @_;

    # verbosity is passed through from the processor
    $self->{'verbosity'} = $verbosity;

    # as are the outhandle and failhandle
    $self->{'outhandle'} = $outhandle if defined $outhandle;
    $self->{'failhandle'} = $failhandle;

    # set process_exp and block_exp to defaults unless they were
    # explicitly set

    if ((!$self->is_recursive()) &&
        ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq ""))) {

        $self->{'process_exp'} = $self->get_default_process_exp ();
        if ($self->{'process_exp'} eq "") {
            warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
        }
    }

    if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
        $self->{'block_exp'} = $self->get_default_block_exp ();
    }
}
470
# Called with ($pluginfo, $base_dir, $processor, $maxdocs); the base
# implementation ignores them and only sets up the metadata extractors.
sub begin {
    my ($plugin, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    $plugin->initialise_extractors();
}
476
# Potentially called at the end of each plugin pass: import.pl only has one
# plugin pass, but buildcol.pl has multiple ones.
# The base implementation finalises the metadata extractors.
sub end {
    my $plugin = $_[0];

    $plugin->finalise_extractors();
}
484
# Called only once, after all plugin passes have been done.
# The base implementation does nothing.
sub deinit {
    my ($plugin) = @_;
}
490
# Tells whether the plugin is recursive. The base class answers 0;
# recursive plugins should override this to return 1.
sub is_recursive {
    my ($plugin) = @_;

    return 0;
}
498
# Default value for block_exp, used by init() when none was set.
# The base class has no default and returns the empty string.
sub get_default_block_exp {
    my ($plugin) = @_;

    return "";
}
504
# Default value for process_exp, used by init() when none was set.
# The base class has no default and returns the empty string.
sub get_default_process_exp {
    my ($plugin) = @_;

    return "";
}
510
# Hook called by metadata_read() with the full file name when smart
# blocking is in use. The default implementation is to do nothing.
sub store_block_files
{
    my ($plugin, $filename) = @_;

    return;
}
518
# Default implementation: when cover images are on, block the file that
# has the same name as $filename but the extension "jpg" (or "JPG" when
# the lower-case one does not exist), by recording it in
# $self->{'file_blocks'}.
#
# Nothing is blocked when $filename has no extension to replace, or when
# the candidate is $filename itself (the document is already a .jpg).
# Previously both cases recorded the document as its own cover image, so
# the document blocked itself.
sub block_cover_image
{
    my $self =shift (@_);
    my ($filename) = @_;

    return unless ($self->{'cover_image'});

    my $coverfile = $filename;
    # swap the extension; no extension means there is no candidate at all
    return unless ($coverfile =~ s/\.[^\\\/\.]+$/\.jpg/);

    if (!-e $coverfile) {
        $coverfile =~ s/jpg$/JPG/;
    }
    if (($coverfile ne $filename) && (-e $coverfile)) {
        $self->{'file_blocks'}->{$coverfile} = 1;
    }

    return;
}
537
# First pass over a file, before any document is read.
#
# With -associate_ext set, records files that share a root name but differ
# in extension in $self->{'shared_fileroot'}: the file matching process_exp
# becomes the 'tie_to' document, files whose extension is listed in
# -associate_ext are collected under 'exts'.
#
# Returns undef when this plugin does not process the file (empty
# process_exp, no match, or not a plain file), 1 otherwise. When the
# smart_block option is defined, the cover image and any plugin specific
# files are registered for blocking first.
sub metadata_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    my $process_exp = $self->{'process_exp'};

    # Keep track of filenames with same root but different extensions
    # Used to support -associate_ext

    my $associate_ext = $self->{'associate_ext'};
    if ((defined $associate_ext) && ($associate_ext ne "")) {

        # split at the last dot: everything before it is the shared root
        my ($file_prefix,$file_ext) = ($file =~ m/^(.*)\.(.*?)$/);
        if ((defined $file_prefix) && (defined $file_ext)) {

            my $shared_fileroot = $self->{'shared_fileroot'};
            if (!defined $shared_fileroot->{$file_prefix}) {
                my $file_prefix_rec = { 'tie_to' => undef, 'exts' => {} };
                $shared_fileroot->{$file_prefix} = $file_prefix_rec;
            }

            my $file_prefix_rec = $shared_fileroot->{$file_prefix};

            # An empty pattern must not reach m//: Perl would silently
            # reuse the last successful pattern (the split above) instead.
            if ((defined $process_exp) && ($process_exp ne "")
                && ($file =~ m/$process_exp/)) {
                # This is the document the others should be tied to
                $file_prefix_rec->{'tie_to'} = $file_ext;
            }
            else {
                if (defined $self->{'associate_ext_lookup'}->{$file_ext}) {
                    $file_prefix_rec->{'exts'}->{$file_ext} = 1;
                }
            }
        }
    }

    # now check whether we are actually processing this
    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
    if ($self->{'process_exp'} eq "" || $filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef; # can't recognise
    }

    # do smart blocking if appropriate
    # NOTE(review): this tests defined-ness while read_block() tests truth
    # of the same option -- confirm which is intended.
    if (defined $self->{'smart_block'}) {
        $self->block_cover_image($filename);
        $self->store_block_files($filename);
    }

    return 1;
}
587
# Returns 1 when $file_ext is the extension recorded as 'tie_to' in the
# given shared-root record, 0 otherwise (including when the record or its
# 'tie_to' entry is undefined).
sub tie_to_filename
{
    my ($plugin, $file_ext, $file_prefix_rec) = @_;

    return 0 if (!defined $file_prefix_rec);

    my $anchor_ext = $file_prefix_rec->{'tie_to'};
    return 0 if (!defined $anchor_ext);

    return ($anchor_ext eq $file_ext) ? 1 : 0;
}
606
# Returns 1 when the shared-root record has a 'tie_to' document and
# $file_ext is recorded (with a true value) among its 'exts', 0 otherwise.
sub tie_to_assoc_file
{
    my ($plugin, $file_ext, $file_prefix_rec) = @_;

    return 0 if (!defined $file_prefix_rec);
    return 0 if (!defined $file_prefix_rec->{'tie_to'});

    my $known_exts = $file_prefix_rec->{'exts'};
    return $known_exts->{$file_ext} ? 1 : 0;
}
628
629
# Applies the -associate_ext bookkeeping gathered by metadata_read().
#
#   $file     - file name as handed to the plugin
#   $filename - the same file with the base directory prepended
#   $metadata - metadata hash of the file; may gain 'gsdlassocfile_tobe'
#
# For the 'tie_to' document of a shared root, one "file:mimetype:" entry
# per recorded associated extension is appended to
# $metadata->{'gsdlassocfile_tobe'} and 0 is returned. For a file that is
# one of the associated extensions, 1 is returned (the caller blocks it).
# In every other case 0 is returned.
sub associate_with
{
    my $self = shift (@_);
    my ($file, $filename, $metadata) = @_;

    my $associate_ext = $self->{'associate_ext'};

    return 0 if (!$associate_ext);

    # If file, see if matches with "tie_to" doc or is one of the
    # associated filename extensions.

    # split at the last dot, exactly as metadata_read() does
    my ($file_prefix,$file_ext) = ($file =~ m/^(.*)\.(.*?)$/);
    if ((defined $file_prefix) && (defined $file_ext)) {

        my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};

        if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {

            # Set up gsdlassocfile_tobe

            my $exts = $file_prefix_rec->{'exts'};

            if (!defined $metadata->{'gsdlassocfile_tobe'}) {
                $metadata->{'gsdlassocfile_tobe'} = [];
            }

            my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};

            my ($full_prefix) = ($filename =~ m/^(.*)\..*?$/);
            foreach my $e (keys %$exts) {
                my $assoc_file = "$full_prefix.$e";
                my $mime_type = ""; # let system auto detect this
                push(@$assoc_tobe,"$assoc_file:$mime_type:");
            }
        }
        elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
            # a form of smart block

            return 1;
        }
    }

    return 0;
}
676
677
# Decides whether this plugin should read $file.
#
# Returns a two element list (status, filename):
#   (0, undef)      the file is blocked (and num_blocked is incremented)
#   (undef, undef)  the plugin does not recognise the file
#   (1, $filename)  the file should be processed; $filename includes the
#                   base directory
sub read_block {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    if ($self->associate_with($file,$filename,$metadata)) {
        # a form of smart block
        $self->{'num_blocked'} ++;
        return (0,undef); # blocked
    }

    my $smart_block = $self->{'smart_block'};
    my $smart_block_BN = $self->{'smart_block_BN'};

    # with smart blocking only the explicitly registered files are
    # blocked; block_exp is consulted only when smart blocking is off
    if ($smart_block || $smart_block_BN) {
        if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
            $self->{'num_blocked'} ++;
            return (0,undef); # blocked
        }
    } elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'} ++;
        return (0,undef); # blocked
    }

    # NOTE(review): unlike metadata_read(), an empty process_exp is not
    # rejected here, and Perl treats an empty pattern specially -- confirm
    # that process_exp is always non-empty when this is reached.
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return (undef,undef); # can't recognise
    }

    return (1,$filename);
}
712
# Returns $file with any leading slashes or backslashes removed.
sub read_tidy_file {
    my ($plugin, $file) = @_;

    # $file often begins with / so we'll tidy it up
    $file =~ s/^[\/\\]+//;

    return $file;
}
723
724
725
# The BasPlug read_into_doc_obj() function. This function does all the
# right things to make general options work for a given plugin. It reads in
# a file and sets up a slew of metadata all saved in doc_obj, which
# it then returns as part of a tuple (process_status,doc_obj)
#
# Much of this functionality used to reside in read, but it was broken
# down into a supporting routine to make the code more flexible.
#
# recursive plugins (e.g. RecPlug) and specialized plugins like those
# capable of processing many documents within a single file (e.g.
# GMLPlug) will normally want to implement their own version of
# read_into_doc_obj()
#
# Note that $base_dir might be "" and that $file might
# include directories
#
# process_status is: undef or 0 as reported by read_block() (then no
# doc_obj is returned), 0 when the file contains no text, -1 when
# process() rejects the file, 1 on success.
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    if ($self->is_recursive()) {
        # always fatal, whatever gsprintf returns
        gsprintf(STDERR, "{BasPlug.read_must_be_implemented}");
        die "\n";
    }

    my $outhandle = $self->{'outhandle'};

    my ($block_status,$filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status==0));
    $file = $self->read_tidy_file($file);

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    # create a new document
    my $doc_obj = doc->new($filename, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));

    # "Source" is the file name without any directories
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    # how do we know what encoding the filename is in?
    $doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, $encoding, $language, \$text);

    if (!length ($text)) {
        my $plugin_name = ref ($self);
        if ($gli) {
            print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
        }
        gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};

        my $failhandle = $self->{'failhandle'};
        gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
        # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
        $self->{'num_not_processed'} ++;

        return (0,undef); # what should we return here?? error but don't want to pass it on
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section

    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        $text = '';
        undef $text;
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return (-1,undef);
    }
    # the text is no longer needed; release it
    $text='';
    undef $text;

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    # see if there is a plugin-specific set_OID function...
    if (defined ($self->can('set_OID'))) {
        # it will need $doc_obj to set the Identifier metadata...
        $self->set_OID($doc_obj);
    } else {
        # use the default set_OID() in doc.pm
        $doc_obj->set_OID();
    }

    return (1,$doc_obj);
}
822
823
# The BasPlug read() function. It calls read_into_doc_obj(), which does
# everything needed to make the general options work and runs the plugin's
# own process() function, and then hands the finished document to
# $processor. Most plugins should define their own process() function and
# let this read() function keep control.
#
# recursive plugins (e.g. RecPlug) and specialized plugins like those
# capable of processing many documents within a single file (e.g.
# GMLPlug) might want to implement their own version of read(), but
# more likely need to implement their own version of read_into_doc_obj()
#
# Return number of files processed, undef if can't recognise, -1 if can't
# process

sub read {
    my $plugin = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my ($status, $doc_obj) = $plugin->read_into_doc_obj(@_);

    # anything but 1 means there is no document to hand on
    unless ((defined $status) && ($status == 1)) {
        return $status;
    }

    # process the document
    $processor->process($doc_obj);

    # remove the recorded places file, if there is one
    if (defined($plugin->{'places_filename'})) {
        &util::rm($plugin->{'places_filename'});
        $plugin->{'places_filename'} = undef;
    }

    $plugin->{'num_processed'} ++;
    undef $doc_obj;

    # status is 1 here: the file has been processed
    return $status;
}
862
# Plugin specific processing of a document; must be implemented by the
# sub-class. An implementation returns undef if the file is rejected by
# the plugin.
#
# The base class version reports the missing implementation and always
# dies; the die no longer depends on the return value of gsprintf.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n");
    # die "Basplug::process function must be implemented in sub-class\n";
    die "\n";
}
873
# uses the multiread package to read in the entire file pointed to
# by filename and loads the resulting text into $$textref. Input text
# may be in any of the encodings handled by multiread, output text
# will be in utf8
#
# "ascii" input is slurped as it is. For every other encoding the text is
# read through multiread and, when the separate_cjk option is set,
# segmented with cnseg afterwards.
#
# When the file is not readable, a message is printed (if verbosity is
# set) and $$textref is left untouched. Dies when the file cannot be
# opened.
sub read_file {
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    if (!-r $filename)
    {
        my $outhandle = $self->{'outhandle'};
        gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
        # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
        return;
    }
    $$textref = "";
    # three-argument open: the file name is never interpreted as a mode or
    # a command, and surrounding whitespace in the name is preserved
    if (!open (FILE, '<', $filename)) {
        gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
        die "\n";
    }

    if ($encoding eq "ascii") {
        # slurp mode for this block only; the previous record separator is
        # restored automatically, even if reading dies
        local $/ = undef;
        $$textref = <FILE>;
    } else {
        my $reader = multiread->new();
        # multiread is given the handle by name, so it has to stay a
        # package file handle
        $reader->set_handle ('BasPlug::FILE');
        $reader->set_encoding ($encoding);
        $reader->read_file ($textref);
        #Now segments chinese if the separate_cjk option is set
        if ($self->{'separate_cjk'}) {
            # segment the Chinese words
            $$textref = &cnseg::segment($$textref);
        }
    }
    close FILE;
}
912
# write_file -- used by ConvertToPlug, for example in post processing
#
# Writes $$textref to $filename, replacing any existing content.
# Dies when the file cannot be opened, written or closed: write errors on
# a buffered handle may only surface at close, so they are checked too.
sub utf8_write_file {
    my $self = shift (@_);
    my ($textref, $filename) = @_;

    # lexical handle and three-argument open: does not disturb the FILE
    # handle used by the reading routines, and the file name is never
    # interpreted as part of the mode
    my $handle;
    if (!open ($handle, '>', $filename)) {
        gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
        die "\n";
    }

    my $written = print $handle $$textref;
    my $closed = close($handle);
    if (!$written || !$closed) {
        print STDERR "BasPlug::utf8_write_file: error while writing $filename ($!)\n";
        die "\n";
    }
}
927
928
# Derives a title from a file name: underscores become spaces and
# everything from the first dot onwards is removed.
sub filename_based_title
{
    my ($plugin, $file) = @_;

    (my $title = $file) =~ s/_/ /g;
    $title =~ s/\..*?$//;

    return $title;
}
940
941
# Gives $section of $doc_obj a "Title" derived from the file name (see
# filename_based_title) when it has no Title metadata yet.
sub title_fallback
{
    my ($plugin, $doc_obj, $section, $file) = @_;

    if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
        $doc_obj->add_metadata ($section, "Title", $plugin->filename_based_title($file));
    }
}
953
# Works out the (language, encoding) pair to use for $filename:
#   - input_encoding "auto": both values come from get_language_encoding()
#   - extract_language set: the language comes from
#     get_language_encoding(), the encoding is the configured
#     input_encoding; a warning is printed when the detected encoding
#     differs
#   - otherwise: default_language and input_encoding are used as they are
sub textcat_get_language_encoding {
    my $self = shift (@_);
    my ($filename) = @_;

    if ($self->{'input_encoding'} eq "auto") {
        # use textcat to automatically work out the input encoding and language
        my ($language, $encoding) = $self->get_language_encoding ($filename);
        return ($language, $encoding);
    }

    if (!$self->{'extract_language'}) {
        # nothing to detect: take both values from the options
        return ($self->{'default_language'}, $self->{'input_encoding'});
    }

    # use textcat to get language metadata
    my ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
    # read only after the call above
    my $encoding = $self->{'input_encoding'};

    # don't print this message for english... english in utf8 is identical
    # to english in iso-8859-1 (except for some punctuation). We don't have
    # a language model for en_utf8, so textcat always says iso-8859-1!
    if ($extracted_encoding ne $encoding && $language ne "en"
        && $self->{'verbosity'}) {
        my $plugin_name = ref ($self);
        my $outhandle = $self->{'outhandle'};
        gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
    }

    return ($language, $encoding);
}
985
986	# Uses textcat to work out the encoding and language of the text in
987	# $filename. All html tags are removed before processing.
988	# returns an array containing "language" and "encoding"
sub get_language_encoding {
    # Guess the language and character encoding of $filename.
    #
    # The whole file is read, checked for a unicode byte order marker and
    # handed (minus <title> and, for HTML sources, minus tags) to the
    # classifier stored in $self->{'textcat'}.
    #
    # Returns the two element list (language, encoding).  The plugin's
    # default_language / default_encoding are substituted whenever the
    # classifier is too unsure or names an encoding that is not supported.
    # Dies with "\n" (after reporting via gsprintf) if the file can't be read.
    #
    # NOTE: a 'charset="windows..."' declaration found in the text
    # overwrites $self->{'input_encoding'}, so it is still in force for
    # later calls on the same plugin object.
    my $self = shift (@_);
    my ($filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my $unicode_format = "";

    # read in file -- three argument open so that odd characters in the
    # filename can never be taken as an open mode, and a localised $/ so
    # the caller's record separator is left exactly as it was
    my $fh;
    if (!open ($fh, '<', $filename)) {
        gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
        # die unconditionally rather than only when gsprintf returns true
        die "\n";
    }
    binmode ($fh); # we look at raw bytes (BOMs, 0x80-0x9f) below
    my $text = do { local $/; <$fh> };
    close ($fh);
    $text = "" unless defined $text;

    # check if first few bytes have a Byte Order Marker
    my $bom = substr ($text, 0, 2); # check 16bit unicode
    if ($bom eq "\xff\xfe" || $bom eq "\xfe\xff") {
        # little or big endian 16bit unicode
        $unicode_format = "unicode";
    } elsif (substr ($text, 0, 3) eq "\xef\xbb\xbf") {
        # utf-8 coded FEFF bom
        $unicode_format = "utf8";
    }

    # VB scripting generated Word to HTML file.  Only the charset token
    # itself is captured (a greedy ".*" would run on to the last quote on
    # the line), and it is lower-cased to match the 'windows_NNNN'
    # spelling used further down.
    if ($text =~ /charset=(windows[-\w]*)\"/i) {
        my $vbhtml_encoding = lc ($1);
        $vbhtml_encoding =~ s/-+/_/g;
        $self->{'input_encoding'} = $vbhtml_encoding;
    }

    # remove <title>stuff</title> -- as titles tend often to be in English
    # for foreign language documents
    $text =~ s/<title>.*?<\/title>//is;

    # remove all HTML tags
    # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
    if (ref($self) eq 'HTMLPlug' ||
        (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')) {
        $text =~ s/<[^>]*>//sg;
    }

    # get the language/encoding
    my $results = $self->{'textcat'}->classify(\$text);

    # if textcat returns 3 or less possibilities we'll use the
    # first one in the list - otherwise use the defaults
    if (scalar @$results > 3) {
        my $best_encoding = "";
        if ($unicode_format) { # in case the first had a BOM
            $best_encoding = $unicode_format;
        } else {
            # let the candidates vote: the encoding is whatever follows
            # the last '-' of each result
            my %guessed_encodings = ();
            foreach my $result (@$results) {
                my ($enc) = ($result =~ /([^\-]+)$/);
                next unless defined $enc; # don't reuse a stale capture
                $guessed_encodings{$enc}++;
            }

            # sorted so that ties do not depend on hash ordering
            my $best_count = 0;
            foreach my $enc (sort keys %guessed_encodings) {
                if ($guessed_encodings{$enc} > $best_count) {
                    $best_count = $guessed_encodings{$enc};
                    $best_encoding = $enc;
                }
            }
        }

        if ($self->{'input_encoding'} ne 'auto') {
            if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
                gsprintf($outhandle,
                         "BasPlug: {BasPlug.could_not_extract_language}\n",
                         $filename, $self->{'default_language'});
            }
            return ($self->{'default_language'}, $self->{'input_encoding'});

        } else {
            if ($self->{'verbosity'}>2) {
                gsprintf($outhandle,
                         "BasPlug: {BasPlug.could_not_extract_language}\n",
                         $filename, $self->{'default_language'});
            }
            return ($self->{'default_language'}, $best_encoding);
        }
    }

    # format language/encoding: the best result looks like "language" or
    # "language-encoding".  Both parts may be several characters long.
    my ($language, $encoding);
    if (defined $results->[0]) {
        ($language, $encoding) = ($results->[0] =~ /^([^-]*)(?:-(.*))?$/);
    }
    if (!defined $language || $language eq "") {
        if ($self->{'verbosity'}>2) {
            gsprintf($outhandle,
                     "BasPlug: {BasPlug.could_not_extract_language}\n",
                     $filename, $self->{'default_language'});
        }
        $language = $self->{'default_language'};
    }
    if (!defined $encoding || $encoding eq "") {
        if ($self->{'verbosity'}>2) {
            gsprintf($outhandle,
                     "BasPlug: {BasPlug.could_not_extract_encoding}\n",
                     $filename, $self->{'default_encoding'});
        }
        $encoding = $self->{'default_encoding'};
    }

    # check for equivalents where textcat doesn't have some encodings...
    # eg MS versions of standard encodings
    if ($encoding =~ /^iso_8859_(\d+)/) {
        my $iso = $1 + 0; # which variant of the iso standard?
        my %windows_codepage_for = (
            1  => 'windows_1252', # Western Europe
            15 => 'windows_1252', # Western Europe
            2  => 'windows_1250', # Central Europe
            5  => 'windows_1251', # Cyrillic
            6  => 'windows_1256', # Arabic
            7  => 'windows_1253', # Greek
            8  => 'windows_1255', # Hebrew
            9  => 'windows_1254', # Turkish
        );
        # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
        if ($text =~ /[\x80-\x9f]/ && exists $windows_codepage_for{$iso}) {
            $encoding = $windows_codepage_for{$iso};
        }
    }

    if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
        !defined $encodings::encodings->{$encoding}) {
        if ($self->{'verbosity'}) {
            gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
                     $filename, $encoding, $self->{'default_encoding'});
        }
        $encoding = $self->{'default_encoding'};
    }

    return ($language, $encoding);
}
1129
# add any extra metadata that's been passed around from one
# plugin to another.
# extra_metadata uses add_utf8_metadata so it expects metadata values
# to already be in utf8
#
# $doc_obj    - document the metadata is added to
# $cursection - section of $doc_obj that receives the metadata
# $metadata   - hash reference of field => value, where value is either a
#               single value or an array reference of values
sub extra_metadata {
    my $self = shift (@_);
    my ($doc_obj, $cursection, $metadata) = @_;

    foreach my $field (keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        my $value = $metadata->{$field};

        if ($field eq "gsdlassocfile_tobe") {
            # 'gsdlassocfile_tobe' is artificially introduced metadata
            # that is used to signal that certain additional files should
            # be tied to this document.  Useful in situations where a
            # metadata pass in the plugin pipeline works out some files
            # need to be associated with a document, but the document hasn't
            # been formed yet.

            my @assoc_entries = (ref ($value) eq "ARRAY") ? @$value : ($value);

            my $equiv_form = "";
            foreach my $gaf (@assoc_entries) {
                # each entry has the form "<full filename>:<mimetype>:".
                # The first group is greedy so that colons inside the
                # filename (eg a drive letter) stay with the filename.
                next unless defined $gaf;
                my ($full_filename, $mimetype) = ($gaf =~ m/^(.*):(.*):$/);
                next unless defined $full_filename; # not in the expected form

                # the part after the last / or \ -- or the whole name when
                # there is no directory component at all
                my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
                $tail_filename = $full_filename unless defined $tail_filename;

                $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);

                # the extension picks the icon macro; empty when there is none
                my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
                $doc_ext = "" unless defined $doc_ext;

                my $start_doclink = "<a href=\"_httpcollection_/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
                my $srcicon = "_icon".$doc_ext."_";
                my $end_doclink = "</a>";

                $equiv_form .= " $start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
            }
            $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
        }
        elsif (ref ($value) eq "ARRAY") {
            foreach my $single_value (@$value) {
                $doc_obj->add_utf8_metadata ($cursection, $field, $single_value);
            }
        } else {
            $doc_obj->add_utf8_metadata ($cursection, $field, $value);
        }
    }
}
1174
# initialise metadata extractors
# (only the acronym extractor needs setting up, and only when one of the
# two acronym options is switched on for this plugin)
sub initialise_extractors {
    my ($self) = @_;

    &acronym::initialise_acronyms()
        if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'});
}
1183
# finalise metadata extractors
# (the counterpart of initialise_extractors: the acronym extractor is
# closed down when either acronym option is switched on)
sub finalise_extractors {
    my ($self) = @_;

    &acronym::finalise_acronyms()
        if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'});
}
1192
# FIRSTNNN: extract the first NNN characters as metadata
#
# $self->{'first'} is a comma separated list of sizes.  For every size NNN
# a "FirstNNN" metadata item is added to $thissection of $doc_obj, holding
# the start of the text in $textref with leading/trailing whitespace
# removed and every run of whitespace collapsed to one space.  Text that
# had to be cut is trimmed back to the last whole word and ends in an
# ellipsis; text that already fits is stored complete.
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    # the tidied text is the same for every size, so build it once
    my $tidytext = $$textref;
    $tidytext =~ s/^\s+//;
    $tidytext =~ s/\s+$//;
    $tidytext =~ s/\s+/ /gs;

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = substr ($tidytext, 0, $size);

        # only mark the text as cut short when something really was left
        # out -- otherwise short sections would lose their last word
        if (length ($tmptext) < length ($tidytext)) {
            $tmptext =~ s/\s\S*$/…/;
        }
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
1208
# Finds everything that looks like an email address in the text referenced
# by $textref and adds each distinct address, in sorted order, to
# $thissection of $doc_obj as "emailAddress" metadata.  Progress is
# reported to the plugin's outhandle according to its verbosity.
sub extract_email {
    my ($self, $textref, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.extracting_emails}...\n");
    }

    my @candidates = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);

    # addresses already handed to the document, so repeats are only
    # recorded once
    my %recorded = ();
    foreach my $address (sort @candidates) {
        next if defined $recorded{$address};
        $recorded{$address} = 1;

        $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
        if ($self->{'verbosity'} > 3) {
            gsprintf($outhandle, " {BasPlug.extracting} $address\n");
        }
    }

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.done_email_extract}\n");
    }
}
1250
# extract metadata
#
# Runs, in this fixed order, every automatic extractor that is switched on
# in the plugin options over $doc_obj: email addresses, placenames, Kea
# keyphrases, first NNN characters, acronyms, acronym markup and
# historical years.
sub auto_extract_metadata {
    my ($self, $doc_obj) = @_;

    # walks the document from the top section through get_next_section,
    # calling $visit with each section and its text
    my $each_section = sub {
        my ($visit) = @_;
        my $section = $doc_obj->get_top_section();
        while (defined $section) {
            my $text = $doc_obj->get_text($section);
            $visit->($section, $text);
            $section = $doc_obj->get_next_section ($section);
        }
    };

    # calls the named extractor method on every section that has some text
    my $run_extractor = sub {
        my ($method) = @_;
        $each_section->(sub {
            my ($section, $text) = @_;
            $self->$method (\$text, $doc_obj, $section) if $text =~ /./;
        });
    };

    $run_extractor->('extract_email')      if ($self->{'extract_email'});
    $run_extractor->('extract_placenames') if ($self->{'extract_placenames'});

    #adding kea keyphrases
    if ($self->{'extract_keyphrases'}) {

        #loop through sections to gather whole doc
        my $whole_text = "";
        $each_section->(sub {
            my ($section, $text) = @_;
            $whole_text = $whole_text.$text;
        });

        # the kea options are only passed on when the options flag is set
        my $list = $self->{'extract_keyphrase_options'}
            ? &Kea::extract_KeyPhrases ($whole_text, $self->{'extract_keyphrase_options'})
            : &Kea::extract_KeyPhrases ($whole_text);

        if ($list) {
            # a list of kea keyphrases was returned (ie not empty)
            gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n")
                if ($self->{'verbosity'});

            #metadata goes on the top section
            my $top_section = $doc_obj->get_top_section();

            # all key phrases as one metadata ...
            $doc_obj->add_metadata($top_section, "Keyphrases", $list);

            # ... and the individual key phrases as multiple metadata
            foreach my $keyphrase (split(',', $list)) {
                $keyphrase =~ s/^\s+|\s+$//g;
                $doc_obj->add_metadata($top_section, "Keyphrase", $keyphrase);
            }
        }
    } #end of kea

    $run_extractor->('extract_first_NNNN_characters') if ($self->{'first'});
    $run_extractor->('extract_acronyms')              if ($self->{'extract_acronyms'});

    if ($self->{'markup_acronyms'}) {
        # every section (with or without text) has its text replaced by
        # the marked up version
        $each_section->(sub {
            my ($section, $text) = @_;
            my $marked_up = $self->markup_acronyms ($text, $doc_obj, $section);
            $doc_obj->delete_text($section);
            $doc_obj->add_text($section, $marked_up);
        });
    }

    if ($self->{'extract_historical_years'}) {
        $each_section->(sub {
            my ($section, $text) = @_;
            &DateExtract::get_date_metadata($text, $doc_obj,
                                            $section,
                                            $self->{'no_bibliography'},
                                            $self->{'maximum_year'},
                                            $self->{'maximum_century'});
        });
    }
}
1357
# extract acronyms from a section in a document. progress is
# reported to outhandle based on the verbosity. every acronym that is
# not yet among the section's "Acronym" metadata is written to file
# (through the acronym object itself) and added as "Acronym" metadata.

sub extract_acronyms {
    my ($self, $textref, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n");
    }

    my $acro_array = &acronym::acronyms($textref);

    foreach my $acro (@$acro_array) {

        # has this acronym been recorded for the section already?  (every
        # matching entry is reported, hence no early exit from the loop)
        my $already_recorded = 0;
        my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
        foreach my $thisAcro (@$previous_data) {
            next unless ($thisAcro eq $acro->to_string());
            $already_recorded = 1;
            if ($self->{'verbosity'} >= 4) {
                gsprintf($outhandle, " {BasPlug.already_seen} " .
                         $acro->to_string() . "\n");
            }
        }
        next if $already_recorded;

        # first sighting: write it to the file ...
        $acro->write_to_file();

        # ... and do the normal acronym
        $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
        if ($self->{'verbosity'} > 3) {
            gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n");
        }
    }

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n");
    }
}
1402
# Hands $text to acronym::markup_acronyms and returns what it gives back,
# reporting the start and end of the work to outhandle when the verbosity
# is above 2.  $doc_obj and $thissection are accepted but not used here.
sub markup_acronyms {
    my ($self, $text, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n");
    }

    #self is passed in to check for verbosity ...
    my $marked_up_text = &acronym::markup_acronyms($text, $self);

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n");
    }

    return $marked_up_text;
}
1419
# Adds this plugin's num_processed, num_not_processed and num_archives
# counters onto the entries of the same name in the $stats hash reference.
sub compile_stats {
    my ($self, $stats) = @_;

    foreach my $counter ('num_processed', 'num_not_processed') {
        $stats->{$counter} += $self->{$counter};
    }

    # kept as the final expression so the sub still evaluates to the
    # updated archive total, as it always has
    $stats->{'num_archives'} += $self->{'num_archives'};
}
1429
# Looks for a cover image belonging to $filename: a file with the same
# name but with the extension replaced by ".jpg" or, failing that, ".JPG".
# If one exists it is associated with $doc_obj as "cover.jpg" (image/jpeg)
# and the "hascover" metadata is set to 1 on the top section.
sub associate_cover_image {
    my $self = shift(@_);
    my ($doc_obj, $filename) = @_;

    my $top_section=$doc_obj->get_top_section();

    # swap the extension for .jpg.  A name without an extension is left
    # unchanged by the substitution; carrying on would then "find" the
    # document itself and attach it as the cover image, so stop here.
    return unless ($filename =~ s/\.[^\\\/\.]+$/\.jpg/);

    my $upper_case_filename = $filename;
    $upper_case_filename =~ s/jpg$/JPG/;

    foreach my $cover_filename ($filename, $upper_case_filename) {
        next unless (-e $cover_filename);

        $doc_obj->associate_file($cover_filename, "cover.jpg", "image/jpeg");
        $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
        last; # one cover is enough
    }
}
1448
1449	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: