Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 10276

Last change on this file since 10276 was 10254, checked in by kjdon, 19 years ago
added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile
Property svn:keywords set to `Author Date Id Revision`
File size: 39.8 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	BEGIN {
29	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30	}
31
32	eval {require bytes};
33
34	# suppress the annoying "subroutine redefined" warning that various
35	# plugins cause under perl 5.6
36	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38	use strict;
39	no strict 'subs';
40	no strict 'refs'; # allow filehandles to be variables and viceversa
41
42	use File::Basename;
43
44	use Kea;
45	use parsargv;
46	use multiread;
47	use encodings;
48	use cnseg;
49	use acronym;
50	use textcat;
51	use doc;
52	eval "require diagnostics"; # some perl distros (eg mac) don't have this
53	use DateExtract;
54	use ghtml;
55	use gsprintf 'gsprintf';
56	use printusage;
57	#$%^
58	use parse2;
59
60
61	use GISBasPlug;
62
63	@BasPlug::ISA = ( GISBasPlug );
64
65	my $unicode_list =
66	[ { 'name' => "ascii",
67	'desc' => "{BasPlug.input_encoding.ascii}" },
68	{ 'name' => "utf8",
69	'desc' => "{BasPlug.input_encoding.utf8}" },
70	{ 'name' => "unicode",
71	'desc' => "{BasPlug.input_encoding.unicode}" } ];
72
73	my $auto_unicode_list =
74	[ { 'name' => "auto",
75	'desc' => "{BasPlug.input_encoding.auto}" } ];
76
77	my $arguments =
78	[ { 'name' => "process_exp",
79	'desc' => "{BasPlug.process_exp}",
80	'type' => "regexp",
81	'deft' => "",
82	'reqd' => "no" },
83	{ 'name' => "block_exp",
84	'desc' => "{BasPlug.block_exp}",
85	'type' => "regexp",
86	'deft' => "",
87	'reqd' => "no" },
88	{ 'name' => "smart_block",
89	'desc' => "{BasPlug.smart_block}",
90	'type' => "flag",
91	'reqd' => "no" },
92	{ 'name' => "associate_ext",
93	'desc' => "{BasPlug.associate_ext}",
94	'type' => "string",
95	'reqd' => "no" },
96	{ 'name' => "input_encoding",
97	'desc' => "{BasPlug.input_encoding}",
98	'type' => "enum",
99	'list' => $auto_unicode_list,
100	'reqd' => "no" ,
101	'deft' => "auto" } ,
102	{ 'name' => "default_encoding",
103	'desc' => "{BasPlug.default_encoding}",
104	'type' => "enum",
105	'list' => $unicode_list,
106	'reqd' => "no",
107	'deft' => "utf8" },
108	{ 'name' => "extract_language",
109	'desc' => "{BasPlug.extract_language}",
110	'type' => "flag",
111	'reqd' => "no" },
112	{ 'name' => "default_language",
113	'desc' => "{BasPlug.default_language}",
114	'type' => "language",
115	'deft' => "en",
116	'char_length' => "2",
117	'reqd' => "no" },
118	{ 'name' => "extract_acronyms",
119	'desc' => "{BasPlug.extract_acronyms}",
120	'type' => "flag",
121	'reqd' => "no" },
122	{ 'name' => "markup_acronyms",
123	'desc' => "{BasPlug.markup_acronyms}",
124	'type' => "flag",
125	'reqd' => "no" },
126	{ 'name' => "extract_keyphrases",
127	'desc' => "{BasPlug.extract_keyphrases}",
128	'type' => "flag",
129	'reqd' => "no" },
130	{ 'name' => "extract_keyphrase_options",
131	'desc' => "{BasPlug.extract_keyphrase_options}",
132	'type' => "string",
133	'deft' => "",
134	'reqd' => "no" },
135	{ 'name' => "first",
136	'desc' => "{BasPlug.first}",
137	'type' => "string",
138	'reqd' => "no" },
139	{ 'name' => "extract_email",
140	'desc' => "{BasPlug.extract_email}",
141	'type' => "flag",
142	'reqd' => "no" },
143	{ 'name' => "extract_historical_years",
144	'desc' => "{BasPlug.extract_historical_years}",
145	'type' => "flag",
146	'reqd' => "no" },
147	{ 'name' => "maximum_year",
148	'desc' => "{BasPlug.maximum_year}",
149	'type' => "int",
150	'deft' => (localtime)[5]+1900,
151	'char_length' => "4",
152	#'range' => "2,100",
153	'reqd' => "no"},
154	{ 'name' => "maximum_century",
155	'desc' => "{BasPlug.maximum_century}",
156	'type' => "string",
157	'deft' => "-1",
158	'reqd' => "no" },
159	{ 'name' => "no_bibliography",
160	'desc' => "{BasPlug.no_bibliography}",
161	'type' => "flag",
162	'reqd' => "no"},
163	{ 'name' => "no_cover_image",
164	'desc' => "{BasPlug.no_cover_image}",
165	'type' => "flag",
166	'reqd' => "no" },
167	{ 'name' => "extract_keyphrases",
168	'desc' => "{BasPlug.extract_keyphrases}",
169	'type' => "flag",
170	'reqd' => "no",
171	'hiddengli' => "yes" },
172	{ 'name' => "extract_keyphrase_options",
173	'desc' => "{BasPlug.extract_keyphrase_options}",
174	'type' => "string",
175	'reqd' => "no",
176	'hiddengli' => "yes" },
177	{ 'name' => "separate_cjk",
178	'desc' => "{BasPlug.separate_cjk}",
179	'type' => "flag",
180	'reqd' => "no",
181	'hiddengli' => "yes" },
182	{ 'name' => "smart_block",
183	'desc' => "{BasPlug.smart_block}",
184	'type' => "flag",
185	'reqd' => "no",
186	'hiddengli' => "yes" },
187	{ 'name' => "new_extract_email",
188	'desc' => "",
189	'type' => "flag",
190	'reqd' => "no",
191	'hiddengli' => "yes" } ];
192
193	my $gis_arguments =
194	[ { 'name' => "extract_placenames",
195	'desc' => "{GISBasPlug.extract_placenames}",
196	'type' => "flag",
197	'reqd' => "no" },
198	{ 'name' => "gazetteer",
199	'desc' => "{GISBasPlug.gazetteer}",
200	'type' => "string",
201	'reqd' => "no" },
202	{ 'name' => "place_list",
203	'desc' => "{GISBasPlug.place_list}",
204	'type' => "flag",
205	'reqd' => "no" } ];
206
207
208	my $options = { 'name' => "BasPlug",
209	'desc' => "{BasPlug.desc}",
210	'abstract' => "yes",
211	'inherits' => "no",
212	'args' => $arguments };
213
214
215
216
217	sub get_arguments
218	{
219	my $self = shift(@_);
220	my $optionlistref = $self->{'option_list'};
221	my @optionlist = @$optionlistref;
222	my $pluginoptions = pop(@$optionlistref);
223	my $pluginarguments = $pluginoptions->{'args'};
224	return $pluginarguments;
225	}
226
227
228	sub print_xml_usage
229	{
230	my $self = shift(@_);
231
232	# XML output is always in UTF-8
233	gsprintf::output_strings_in_UTF8;
234
235	PrintUsage::print_xml_header();
236	$self->print_xml();
237	}
238
239
240	sub print_xml
241	{
242	my $self = shift(@_);
243
244	my $optionlistref = $self->{'option_list'};
245	my @optionlist = @$optionlistref;
246	my $pluginoptions = shift(@$optionlistref);
247	return if (!defined($pluginoptions));
248
249	gsprintf(STDERR, "<PlugInfo>\n");
250	gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
251	my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
252	$desc =~ s/</&lt;/g; # doubly escaped
253	$desc =~ s/>/&gt;/g;
254
255	gsprintf(STDERR, " <Desc>$desc</Desc>\n");
256	gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
257	gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
258	gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} \|\| "no") . "</Explodes>\n");
259	gsprintf(STDERR, " <Arguments>\n");
260	if (defined($pluginoptions->{'args'})) {
261	&PrintUsage::print_options_xml($pluginoptions->{'args'});
262	}
263
264	# Recurse up the plugin hierarchy
265	$self->print_xml();
266
267	gsprintf(STDERR, " </Arguments>\n");
268	gsprintf(STDERR, "</PlugInfo>\n");
269	}
270
271
272	sub print_txt_usage
273	{
274	my $self = shift(@_);
275	# Print the usage message for a plugin (recursively)
276	my $descoffset = $self->determine_description_offset(0);
277	$self->print_plugin_usage($descoffset, 1);
278	}
279
280
281	sub determine_description_offset
282	{
283	my $self = shift(@_);
284	my $maxoffset = shift(@_);
285
286	my $optionlistref = $self->{'option_list'};
287	my @optionlist = @$optionlistref;
288	my $pluginoptions = shift(@$optionlistref);
289	return $maxoffset if (!defined($pluginoptions));
290
291	# Find the length of the longest option string of this plugin
292	my $pluginargs = $pluginoptions->{'args'};
293	if (defined($pluginargs)) {
294	my $longest = &PrintUsage::find_longest_option_string($pluginargs);
295	if ($longest > $maxoffset) {
296	$maxoffset = $longest;
297	}
298	}
299
300	# Recurse up the plugin hierarchy
301	$maxoffset = $self->determine_description_offset($maxoffset);
302	$self->{'option_list'} = \@optionlist;
303	return $maxoffset;
304	}
305
306
307	sub print_plugin_usage
308	{
309	my $self = shift(@_);
310	my $descoffset = shift(@_);
311	my $isleafclass = shift(@_);
312
313	my $optionlistref = $self->{'option_list'};
314	my @optionlist = @$optionlistref;
315	my $pluginoptions = shift(@$optionlistref);
316	return if (!defined($pluginoptions));
317
318	my $pluginname = $pluginoptions->{'name'};
319	my $pluginargs = $pluginoptions->{'args'};
320	my $plugindesc = $pluginoptions->{'desc'};
321
322	# Produce the usage information using the data structure above
323	if ($isleafclass) {
324	if (defined($plugindesc)) {
325	gsprintf(STDERR, "$plugindesc\n\n");
326	}
327	gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
328	}
329
330	# Display the plugin options, if there are some
331	if (defined($pluginargs)) {
332	# Calculate the column offset of the option descriptions
333	my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
334
335	if ($isleafclass) {
336	gsprintf(STDERR, " {common.specific_options}:\n");
337	}
338	else {
339	gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
340	}
341
342	# Display the plugin options
343	&PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
344	}
345
346	# Recurse up the plugin hierarchy
347	$self->print_plugin_usage($descoffset, 0);
348	$self->{'option_list'} = \@optionlist;
349	}
350
351
352	sub new {
353	# Set Encodings to the list!!
354
355	my $e = $encodings::encodings;
356	foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
357	{
358	my $hashEncode =
359	{'name' => $enc,
360	'desc' => $e->{$enc}->{'name'}};
361
362	push(@{$unicode_list},$hashEncode);
363	}
364
365	push(@{$auto_unicode_list},@{$unicode_list});
366
367	# Start the BasPlug Constructor
368	my $class = shift (@_);
369	my ($pluginlist,$args,$hashArgOptLists) = @_;
370	push(@$pluginlist, $class);
371	my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
372
373	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
374	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
375
376	if (GISBasPlug::has_mapdata()) {
377	push(@$arguments,@$gis_arguments);
378	}
379
380	my $self = {};
381	if(!parse2::parse($args,$hashArgOptLists->{"ArgList"},$self))
382	{
383	my $classTempClass = bless $self, $class;
384	&gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
385	$classTempClass->print_txt_usage(""); # Use default resource bundle
386	die "\n";
387	}
388
389	# else parsing was successful.
390
391	$self->{'plugin_type'} = $plugin_name;
392	$self->{'outhandle'} = STDERR;
393	$self->{'textcat'} = new textcat();
394	$self->{'num_processed'} = 0;
395	$self->{'num_not_processed'} = 0;
396	$self->{'num_blocked'} = 0;
397	$self->{'num_archives'} = 0;
398	$self->{'cover_image'} = 1; # cover image is on by default
399	$self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
400	$self->{'file_blocks'} = {};
401	$self->{'option_list'} = $hashArgOptLists->{"OptList"};
402
403	my $associate_ext = $self->{'associate_ext'};
404	if ((defined $associate_ext) && ($associate_ext ne "")) {
405	my @exts = split(/,/,$associate_ext);
406
407	my %associate_ext_lookup = ();
408	foreach my $e (@exts) {
409	$associate_ext_lookup{$e} = 1;
410	}
411
412	$self->{'associate_ext_lookup'} = \%associate_ext_lookup;
413	}
414
415	$self->{'shared_fileroot'} = {};
416	$self->{'file_blocks'} = {};
417
418	if ($self->{'extract_placenames'}) {
419
420	my $outhandle = $self->{'outhandle'};
421
422	my $places_ref
423	= GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
424
425	if (!defined $places_ref) {
426	print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
427	print $outhandle " No placename extraction will take place.\n";
428	$self->{'extract_placenames'} = undef;
429	}
430	else {
431	$self->{'places'} = $places_ref;
432	}
433	}
434	return bless $self, $class;
435
436	}
437
438	# initialize BasPlug options
439	# if init() is overridden in a sub-class, remember to call BasPlug::init()
440	sub init {
441	my $self = shift (@_);
442	my ($verbosity, $outhandle, $failhandle) = @_;
443
444	# verbosity is passed through from the processor
445	$self->{'verbosity'} = $verbosity;
446
447	# as are the outhandle and failhandle
448	$self->{'outhandle'} = $outhandle if defined $outhandle;
449	$self->{'failhandle'} = $failhandle;
450
451	# set process_exp and block_exp to defaults unless they were
452	# explicitly set
453
454	if ((!$self->is_recursive()) and
455	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
456
457	$self->{'process_exp'} = $self->get_default_process_exp ();
458	if ($self->{'process_exp'} eq "") {
459	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
460	}
461	}
462
463	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
464	$self->{'block_exp'} = $self->get_default_block_exp ();
465	}
466	}
467
468	sub begin {
469	my $self = shift (@_);
470	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
471	$self->initialise_extractors();
472	}
473
474	sub end {
475	# potentially called at the end of each plugin pass
476	# import.pl only has one plugin pass, but buildcol.pl has multiple ones
477
478	my ($self) = @_;
479	$self->finalise_extractors();
480	}
481
482	sub deinit {
483	# called only once, after all plugin passes have been done
484
485	my ($self) = @_;
486	}
487
488	# this function should be overridden to return 1
489	# in recursive plugins
490	sub is_recursive {
491	my $self = shift (@_);
492
493	return 0;
494	}
495
496	sub get_default_block_exp {
497	my $self = shift (@_);
498
499	return "";
500	}
501
502	sub get_default_process_exp {
503	my $self = shift (@_);
504
505	return "";
506	}
507
508	# default implementation is to do nothing.
509	sub store_block_files
510	{
511	my $self =shift (@_);
512	my ($filename) = @_;
513	return;
514	}
515
516	#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
517	sub block_cover_image
518	{
519	my $self =shift (@_);
520	my ($filename) = @_;
521	if ($self->{'cover_image'}) {
522	my $coverfile = $filename;
523	$coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
524	if (!-e $coverfile) {
525	$coverfile =~ s/jpg$/JPG/;
526	}
527	if (-e $coverfile) {
528	$self->{'file_blocks'}->{$coverfile} = 1;
529	}
530	}
531
532	return;
533	}
534
535	sub metadata_read {
536	my $self = shift (@_);
537	my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
538	# Keep track of filenames with same root but different extensions
539	# Used to support -associate_ext
540
541	my $associate_ext = $self->{'associate_ext'};
542	if ((defined $associate_ext) && ($associate_ext ne "")) {
543
544	my ($file_prefix,$file_ext) = ($file =~ m/^(.)\.(.?)$/);
545	if ((defined $file_prefix) && (defined $file_ext)) {
546
547	my $shared_fileroot = $self->{'shared_fileroot'};
548	if (!defined $shared_fileroot->{$file_prefix}) {
549	my $file_prefix_rec = { 'tie_to' => undef, 'exts' => {} };
550	$shared_fileroot->{$file_prefix} = $file_prefix_rec;
551	}
552
553	my $file_prefix_rec = $shared_fileroot->{$file_prefix};
554
555	my $process_exp = $self->{'process_exp'};
556
557	if ($file =~ m/$self->{'process_exp'}/) {
558	# This is the document the others should be tied to
559	$file_prefix_rec->{'tie_to'} = $file_ext;
560	}
561	else {
562	if (defined $self->{'associate_ext_lookup'}->{$file_ext}) {
563	$file_prefix_rec->{'exts'}->{$file_ext} = 1;
564	}
565	}
566	}
567	}
568
569	# now check whether we are actually processing this
570	my $filename = $file;
571	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
572	if ($self->{'process_exp'} eq "" \|\| $filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
573	return undef; # can't recognise
574	}
575
576	# do smart blocking if appropriate
577	if (defined $self->{'smart_block'}) {
578	$self->block_cover_image($filename);
579	$self->store_block_files($filename);
580	}
581
582	return 1;
583	}
584
585	sub tie_to_filename
586	{
587	my $self = shift (@_);
588
589	my ($file_ext,$file_prefix_rec) = @_;
590
591	if (defined $file_prefix_rec) {
592	my $tie_to = $file_prefix_rec->{'tie_to'};
593
594	if (defined $tie_to) {
595	if ($tie_to eq $file_ext) {
596	return 1;
597	}
598	}
599	}
600
601	return 0;
602	}
603
604	sub tie_to_assoc_file
605	{
606	my $self = shift (@_);
607	my ($file_ext,$file_prefix_rec) = @_;
608
609	if (defined $file_prefix_rec) {
610	my $tie_to = $file_prefix_rec->{'tie_to'};
611	if (defined $tie_to) {
612
613	my $exts = $file_prefix_rec->{'exts'};
614
615	my $has_file_ext = $exts->{$file_ext};
616
617	if ($has_file_ext) {
618	return 1;
619	}
620	}
621	}
622
623	return 0;
624	}
625
626
627	sub associate_with
628	{
629	my $self = shift (@_);
630	my ($file, $filename, $metadata) = @_;
631
632	my $associate_ext = $self->{'associate_ext'};
633
634
635	return 0 if (!$associate_ext);
636
637	# If file, see if matches with "tie_to" doc or is one of the
638	# associated filename extensions.
639
640	my ($file_prefix,$file_ext) = ($file =~ m/^(.)\.(.?)$/);
641	if ((defined $file_prefix) && (defined $file_ext)) {
642
643	my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
644
645	if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
646
647	# Set up gsdlassocfile_tobe
648
649	my $exts = $file_prefix_rec->{'exts'};
650
651	if (!defined $metadata->{'gsdlassocfile_tobe'}) {
652	$metadata->{'gsdlassocfile_tobe'} = [];
653	}
654
655	my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
656
657	my ($full_prefix) = ($filename =~ m/^(.)\..?$/);
658	foreach my $e (keys %$exts) {
659	my $assoc_file = "$full_prefix.$e";
660	my $mime_type = ""; # let system auto detect this
661	push(@$assoc_tobe,"$assoc_file:$mime_type:");
662	}
663	}
664	elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
665	# a form of smart block
666
667	return 1;
668	}
669	}
670
671	return 0;
672	}
673
674
675	# The BasPlug read() function. This function does all the right things
676	# to make general options work for a given plugin. It calls the process()
677	# function which does all the work specific to a plugin (like the old
678	# read functions used to do). Most plugins should define their own
679	# process() function and let this read() function keep control.
680	#
681	# recursive plugins (e.g. RecPlug) and specialized plugins like those
682	# capable of processing many documents within a single file (e.g.
683	# GMLPlug) should normally implement their own version of read()
684	#
685	# Return number of files processed, undef if can't recognise, -1 if can't
686	# process
687	# Note that $base_dir might be "" and that $file might
688	# include directories
689
690	sub read {
691	my $self = shift (@_);
692
693	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
694
695	if ($self->is_recursive()) {
696	gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
697	}
698
699	my $outhandle = $self->{'outhandle'};
700	my $smart_block = $self->{'smart_block'};
701	my $smart_block_BN = $self->{'smart_block_BN'};
702
703	my $filename = $file;
704	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
705
706	if ($self->associate_with($file,$filename,$metadata)) {
707	# a form of smart block
708	$self->{'num_blocked'} ++;
709	return 0; # blocked
710	}
711
712	if ($smart_block \|\| $smart_block_BN) {
713
714	if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
715	$self->{'num_blocked'} ++;
716	return 0; # blocked
717	}
718	} elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
719	$self->{'num_blocked'} ++;
720	return 0; # blocked
721	}
722
723	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
724	return undef; # can't recognise
725	}
726	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
727
728	# Do encoding stuff
729	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
730
731	# create a new document
732	my $doc_obj = new doc ($filename, "indexed_doc");
733	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
734	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
735	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
736	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
737	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
738
739	my ($filemeta) = $file =~ /([^\\\/]+)$/;
740	# how do we know what encoding the filename is in?
741	$doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
742	if ($self->{'cover_image'}) {
743	$self->associate_cover_image($doc_obj, $filename);
744	}
745
746	# read in file ($text will be in utf8)
747	my $text = "";
748	$self->read_file ($filename, $encoding, $language, \$text);
749
750	if (!length ($text)) {
751	my $plugin_name = ref ($self);
752	if ($gli) {
753	print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
754	}
755	gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
756
757	my $failhandle = $self->{'failhandle'};
758	gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
759	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
760	$self->{'num_not_processed'} ++;
761
762	return 0; # what should we return here?? error but don't want to pass it on
763	}
764
765	# include any metadata passed in from previous plugins
766	# note that this metadata is associated with the top level section
767
768	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
769
770	# do plugin specific processing of doc_obj
771	unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
772	$text = '';
773	undef $text;
774	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
775	return -1;
776	}
777	$text='';
778	undef $text;
779
780	# do any automatic metadata extraction
781	$self->auto_extract_metadata ($doc_obj);
782
783	# add an OID
784	# see if there is a plugin-specific set_OID function...
785	if (defined ($self->can('set_OID'))) {
786	# it will need $doc_obj to set the Identifier metadata...
787	$self->set_OID($doc_obj);
788	} else {
789	# use the default set_OID() in doc.pm
790	$doc_obj->set_OID();
791	}
792
793	# process the document
794	$processor->process($doc_obj);
795
796	if(defined($self->{'places_filename'})){
797	&util::rm($self->{'places_filename'});
798	$self->{'places_filename'} = undef;
799	}
800
801	$self->{'num_processed'} ++;
802	undef $doc_obj;
803	return 1; # processed the file
804	}
805
806	# returns undef if file is rejected by the plugin
807	sub process {
808	my $self = shift (@_);
809	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
810
811	gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
812	# die "Basplug::process function must be implemented in sub-class\n";
813
814	return undef; # never gets here
815	}
816
817	# uses the multiread package to read in the entire file pointed to
818	# by filename and loads the resulting text into $$textref. Input text
819	# may be in any of the encodings handled by multiread, output text
820	# will be in utf8
821	sub read_file {
822	my $self = shift (@_);
823	my ($filename, $encoding, $language, $textref) = @_;
824
825	if (!-r $filename)
826	{
827	my $outhandle = $self->{'outhandle'};
828	gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
829	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
830	return;
831	}
832
833	$$textref = "";
834
835	if (!open (FILE, $filename)) {
836	gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
837	die "\n";
838	}
839
840	if ($encoding eq "ascii") {
841	undef $/;
842	$$textref = <FILE>;
843	$/ = "\n";
844	} else {
845	my $reader = new multiread();
846	$reader->set_handle ('BasPlug::FILE');
847	$reader->set_encoding ($encoding);
848	$reader->read_file ($textref);
849
850	#Now segments chinese if the separate_cjk option is set
851	if ($self->{'separate_cjk'}) {
852	# segment the Chinese words
853	$$textref = &cnseg::segment($$textref);
854	}
855	}
856
857	close FILE;
858	}
859
860	sub filename_based_title
861	{
862	my $self = shift (@_);
863	my ($file) = @_;
864
865	my $file_derived_title = $file;
866	$file_derived_title =~ s/_/ /g;
867	$file_derived_title =~ s/\..*?$//;
868
869	return $file_derived_title;
870	}
871
872
873	sub title_fallback
874	{
875	my $self = shift (@_);
876	my ($doc_obj,$section,$file) = @_;
877
878	if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
879
880	my $file_derived_title = $self->filename_based_title($file);
881	$doc_obj->add_metadata ($section, "Title", $file_derived_title);
882	}
883	}
884
885	sub textcat_get_language_encoding {
886	my $self = shift (@_);
887	my ($filename) = @_;
888
889	my ($language, $encoding, $extracted_encoding);
890	if ($self->{'input_encoding'} eq "auto") {
891	# use textcat to automatically work out the input encoding and language
892	($language, $encoding) = $self->get_language_encoding ($filename);
893	} elsif ($self->{'extract_language'}) {
894	# use textcat to get language metadata
895	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
896	$encoding = $self->{'input_encoding'};
897	# don't print this message for english... english in utf8 is identical
898	# to english in iso-8859-1 (except for some punctuation). We don't have
899	# a language model for en_utf8, so textcat always says iso-8859-1!
900	if ($extracted_encoding ne $encoding && $language ne "en"
901	&& $self->{'verbosity'}) {
902	my $plugin_name = ref ($self);
903	my $outhandle = $self->{'outhandle'};
904	gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
905	# print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
906	# print $outhandle "appears to be encoded as $extracted_encoding.\n";
907	}
908	} else {
909	$language = $self->{'default_language'};
910	$encoding = $self->{'input_encoding'};
911	}
912	return ($language, $encoding);
913	}
914
915	# Uses textcat to work out the encoding and language of the text in
916	# $filename. All html tags are removed before processing.
917	# returns an array containing "language" and "encoding"
918	sub get_language_encoding {
919	my $self = shift (@_);
920	my ($filename) = @_;
921	my $outhandle = $self->{'outhandle'};
922	my $unicode_format = "";
923	# read in file
924	open (FILE, $filename) \|\| (gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
925	undef $/;
926	my $text = <FILE>;
927	$/ = "\n";
928	close FILE;
929
930	# check if first few bytes have a Byte Order Marker
931	my $bom=substr($text,0,2); # check 16bit unicode
932	if ($bom eq "\xff\xfe") { # little endian 16bit unicode
933	$unicode_format="unicode";
934	} elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
935	$unicode_format="unicode";
936	} else {
937	$bom=substr($text,0,3); # check utf-8
938	if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
939	$unicode_format="utf8";
940	# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
941	# $unicode_format="utf8";
942	}
943	}
944
945
946	# remove <title>stuff</title> -- as titles tend often to be in English
947	# for foreign language documents
948	$text =~ s/<title>(.\|\n)*?<\/title>//i;
949
950	# remove all HTML tags
951	# XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
952	if (ref($self) eq 'HTMLPlug' \|\|
953	(exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
954	$text =~ s/<[^>]*>//sg;
955	}
956
957	# get the language/encoding
958	my $results = $self->{'textcat'}->classify(\$text);
959
960	# if textcat returns 3 or less possibilities we'll use the
961	# first one in the list - otherwise use the defaults
962	if (scalar @$results > 3) {
963	my $best_encoding="";
964	if ($unicode_format) { # in case the first had a BOM
965	$best_encoding=$unicode_format;
966	} else {
967	my %guessed_encodings = ();
968	foreach my $result (@$results) {
969	$result =~ /([^\-]+)$/;
970	my $enc=$1;
971	if (!defined($guessed_encodings{$enc})) {
972	$guessed_encodings{$enc}=0;
973	}
974	$guessed_encodings{$enc}++;
975	}
976
977	$guessed_encodings{""}=-1; # for default best_encoding of ""
978	foreach my $enc (keys %guessed_encodings) {
979	if ($guessed_encodings{$enc} >
980	$guessed_encodings{$best_encoding}){
981	$best_encoding=$enc;
982	}
983	}
984	}
985
986	if ($self->{'input_encoding'} ne 'auto') {
987	if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
988	gsprintf($outhandle,
989	"BasPlug: {BasPlug.could_not_extract_language}\n",
990	$filename, $self->{'default_language'});
991	}
992	return ($self->{'default_language'}, $self->{'input_encoding'});
993
994	} else {
995	if ($self->{'verbosity'}>2) {
996	gsprintf($outhandle,
997	"BasPlug: {BasPlug.could_not_extract_language}\n",
998	$filename, $self->{'default_language'});
999	}
1000	return ($self->{'default_language'}, $best_encoding);
1001	}
1002	}
1003
1004	# format language/encoding
1005	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
1006	if (!defined $language) {
1007	if ($self->{'verbosity'}>2) {
1008	gsprintf($outhandle,
1009	"BasPlug: {BasPlug.could_not_extract_language}\n",
1010	$filename, $self->{'default_language'});
1011	}
1012	$language = $self->{'default_language'};
1013	}
1014	if (!defined $encoding) {
1015	if ($self->{'verbosity'}>2) {
1016	gsprintf($outhandle,
1017	"BasPlug: {BasPlug.could_not_extract_encoding}\n",
1018	$filename, $self->{'default_encoding'});
1019	}
1020	$encoding = $self->{'default_encoding'};
1021	}
1022
1023
1024	# check for equivalents where textcat doesn't have some encodings...
1025	# eg MS versions of standard encodings
1026	if ($encoding =~ /^iso_8859_(\d+)/) {
1027	my $iso = $1; # which variant of the iso standard?
1028	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
1029	if ($text =~ /[\x80-\x9f]/) {
1030	# Western Europe
1031	if ($iso == 1 or $iso == 15) { $encoding = 'windows_1252' }
1032	elsif ($iso == 2) { $encoding = 'windows_1250' } # Central Europe
1033	elsif ($iso == 5) { $encoding = 'windows_1251' } # Cyrillic
1034	elsif ($iso == 6) { $encoding = 'windows_1256' } # Arabic
1035	elsif ($iso == 7) { $encoding = 'windows_1253' } # Greek
1036	elsif ($iso == 8) { $encoding = 'windows_1255' } # Hebrew
1037	elsif ($iso == 9) { $encoding = 'windows_1254' } # Turkish
1038	}
1039	}
1040
1041	if ($encoding !~ /^(ascii\|utf8\|unicode)$/ &&
1042	!defined $encodings::encodings->{$encoding}) {
1043	if ($self->{'verbosity'}) {
1044	gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
1045	$filename, $encoding, $self->{'default_encoding'});
1046	}
1047	$encoding = $self->{'default_encoding'};
1048	}
1049
1050	return ($language, $encoding);
1051	}
1052
1053	# add any extra metadata that's been passed around from one
1054	# plugin to another.
1055	# extra_metadata uses add_utf8_metadata so it expects metadata values
1056	# to already be in utf8
1057	sub extra_metadata {
1058	my $self = shift (@_);
1059	my ($doc_obj, $cursection, $metadata) = @_;
1060
1061	foreach my $field (keys(%$metadata)) {
1062	# $metadata->{$field} may be an array reference
1063	if ($field eq "gsdlassocfile_tobe") {
1064	# 'gsdlassocfile_tobe' is artificially introduced metadata
1065	# that is used to signal that certain additional files should
1066	# be tied to this document. Useful in situations where a
1067	# metadata pass in the plugin pipeline works out some files
1068	# need to be associated with a document, but the document hasn't
1069	# been formed yet.
1070
1071	my $equiv_form = "";
1072	foreach my $gaf (@{$metadata->{$field}}) {
1073	my ($full_filename,$mimetype) = ($gaf =~ m/^(.):(.):$/);
1074	my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1075	my $filename = $full_filename;
1076
1077	$doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1078
1079	my ($doc_ext) = ($tail_filename =~ m/^.\.(.)$/);
1080	my $start_doclink = "<a href=\"_httpcollection_/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
1081	my $srcicon = "_icon".$doc_ext."_";
1082	my $end_doclink = "</a>";
1083
1084	$equiv_form .= " $start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1085	}
1086	$doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1087	}
1088	elsif (ref ($metadata->{$field}) eq "ARRAY") {
1089	map {
1090	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
1091	} @{$metadata->{$field}};
1092	} else {
1093	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1094	}
1095	}
1096	}
1097
1098	# initialise metadata extractors
1099	sub initialise_extractors {
1100	my $self = shift (@_);
1101
1102	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
1103	&acronym::initialise_acronyms();
1104	}
1105	}
1106
1107	# finalise metadata extractors
1108	sub finalise_extractors {
1109	my $self = shift (@_);
1110
1111	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
1112	&acronym::finalise_acronyms();
1113	}
1114	}
1115
1116	# FIRSTNNN: extract the first NNN characters as metadata
1117	sub extract_first_NNNN_characters {
1118	my $self = shift (@_);
1119	my ($textref, $doc_obj, $thissection) = @_;
1120
1121	foreach my $size (split /,/, $self->{'first'}) {
1122	my $tmptext = $$textref;
1123	$tmptext =~ s/^\s+//;
1124	$tmptext =~ s/\s+$//;
1125	$tmptext =~ s/\s+/ /gs;
1126	$tmptext = substr ($tmptext, 0, $size);
1127	$tmptext =~ s/\s\S*$/…/;
1128	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1129	}
1130	}
1131
1132	sub extract_email {
1133	my $self = shift (@_);
1134	my ($textref, $doc_obj, $thissection) = @_;
1135	my $outhandle = $self->{'outhandle'};
1136
1137	gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
1138	if ($self->{'verbosity'} > 2);
1139
1140	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
1141	@email = sort @email;
1142
1143	# if($self->{"new_extract_email"} == 0)
1144	# {
1145	# my @email2 = ();
1146	# foreach my $address (@email)
1147	# {
1148	# if (!(join(" ",@email2) =~ m/(^\| )$address( \|$)/ ))
1149	# {
1150	# push @email2, $address;
1151	# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1152	# # print $outhandle " extracting $address\n"
1153	# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1154	# if ($self->{'verbosity'} > 3);
1155	# }
1156	# }
1157	# }
1158	# else
1159	# {
1160	my $hashExistMail = {};
1161	foreach my $address (@email) {
1162	if (!(defined $hashExistMail->{$address}))
1163	{
1164	$hashExistMail->{$address} = 1;
1165	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1166	gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1167	if ($self->{'verbosity'} > 3);
1168	}
1169	}
1170	gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
1171	if ($self->{'verbosity'} > 2);
1172	}
1173
1174	# extract metadata
1175	sub auto_extract_metadata {
1176
1177	my $self = shift (@_);
1178	my ($doc_obj) = @_;
1179
1180	if ($self->{'extract_email'}) {
1181	my $thissection = $doc_obj->get_top_section();
1182	while (defined $thissection) {
1183	my $text = $doc_obj->get_text($thissection);
1184	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1185	$thissection = $doc_obj->get_next_section ($thissection);
1186	}
1187	}
1188	if ($self->{'extract_placenames'}) {
1189	my $thissection = $doc_obj->get_top_section();
1190	while (defined $thissection) {
1191	my $text = $doc_obj->get_text($thissection);
1192	$self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1193	$thissection = $doc_obj->get_next_section ($thissection);
1194	}
1195	}
1196
1197	#adding kea keyphrases
1198	if ($self->{'extract_keyphrases'}) {
1199
1200	my $thissection = $doc_obj->get_top_section();
1201	my $text = "";
1202	my $list;
1203
1204	#loop through sections to gather whole doc
1205	while (defined $thissection) {
1206	my $sectiontext = $doc_obj->get_text($thissection);
1207	$text = $text.$sectiontext;
1208	$thissection = $doc_obj->get_next_section ($thissection);
1209	}
1210
1211	if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
1212	$list = &Kea::extract_KeyPhrases ($text, $self->{'extract_keyphrase_options'});
1213	} else { #otherwise call Kea with no options
1214	$list = &Kea::extract_KeyPhrases ($text);
1215	}
1216
1217	if ($list){
1218	# if a list of kea keyphrases was returned (ie not empty)
1219	if ($self->{'verbosity'}) {
1220	gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1221	}
1222
1223	#add metadata to top section
1224	$thissection = $doc_obj->get_top_section();
1225
1226	# add all key phrases as one metadata
1227	$doc_obj->add_metadata($thissection, "Keyphrases", $list);
1228
1229	# add individual key phrases as multiple metadata
1230	foreach my $keyphrase (split(',', $list)) {
1231	$keyphrase =~ s/^\s+\|\s+$//g;
1232	$doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1233	}
1234	}
1235	} #end of kea
1236
1237	if ($self->{'first'}) {
1238	my $thissection = $doc_obj->get_top_section();
1239	while (defined $thissection) {
1240	my $text = $doc_obj->get_text($thissection);
1241	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1242	$thissection = $doc_obj->get_next_section ($thissection);
1243	}
1244	}
1245
1246	if ($self->{'extract_acronyms'}) {
1247	my $thissection = $doc_obj->get_top_section();
1248	while (defined $thissection) {
1249	my $text = $doc_obj->get_text($thissection);
1250	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1251	$thissection = $doc_obj->get_next_section ($thissection);
1252	}
1253	}
1254
1255	if ($self->{'markup_acronyms'}) {
1256	my $thissection = $doc_obj->get_top_section();
1257	while (defined $thissection) {
1258	my $text = $doc_obj->get_text($thissection);
1259	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1260	$doc_obj->delete_text($thissection);
1261	$doc_obj->add_text($thissection, $text);
1262	$thissection = $doc_obj->get_next_section ($thissection);
1263	}
1264	}
1265
1266	if($self->{'extract_historical_years'}) {
1267	my $thissection = $doc_obj->get_top_section();
1268	while (defined $thissection) {
1269
1270	my $text = $doc_obj->get_text($thissection);
1271	&DateExtract::get_date_metadata($text, $doc_obj,
1272	$thissection,
1273	$self->{'no_bibliography'},
1274	$self->{'maximum_year'},
1275	$self->{'maximum_century'});
1276	$thissection = $doc_obj->get_next_section ($thissection);
1277	}
1278	}
1279	}
1280
1281	# extract acronyms from a section in a document. progress is
1282	# reported to outhandle based on the verbosity. both the Acronym
1283	# and the AcronymKWIC metadata items are created.
1284
1285	sub extract_acronyms {
1286	my $self = shift (@_);
1287	my ($textref, $doc_obj, $thissection) = @_;
1288	my $outhandle = $self->{'outhandle'};
1289
1290	# print $outhandle " extracting acronyms ...\n"
1291	gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
1292	if ($self->{'verbosity'} > 2);
1293
1294	my $acro_array = &acronym::acronyms($textref);
1295
1296	foreach my $acro (@$acro_array) {
1297
1298	#check that this is the first time ...
1299	my $seen_before = "false";
1300	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1301	foreach my $thisAcro (@$previous_data) {
1302	if ($thisAcro eq $acro->to_string()) {
1303	$seen_before = "true";
1304	if ($self->{'verbosity'} >= 4) {
1305	gsprintf($outhandle, " {BasPlug.already_seen} " .
1306	$acro->to_string() . "\n");
1307	}
1308	}
1309	}
1310
1311	if ($seen_before eq "false") {
1312	#write it to the file ...
1313	$acro->write_to_file();
1314
1315	#do the normal acronym
1316	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
1317	gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
1318	if ($self->{'verbosity'} > 3);
1319	}
1320	}
1321
1322	gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
1323	if ($self->{'verbosity'} > 2);
1324	}
1325
1326	sub markup_acronyms {
1327	my $self = shift (@_);
1328	my ($text, $doc_obj, $thissection) = @_;
1329	my $outhandle = $self->{'outhandle'};
1330
1331	gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
1332	if ($self->{'verbosity'} > 2);
1333
1334	#self is passed in to check for verbosity ...
1335	$text = &acronym::markup_acronyms($text, $self);
1336
1337	gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
1338	if ($self->{'verbosity'} > 2);
1339
1340	return $text;
1341	}
1342
1343	sub compile_stats {
1344	my $self = shift(@_);
1345	my ($stats) = @_;
1346
1347	$stats->{'num_processed'} += $self->{'num_processed'};
1348	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
1349	$stats->{'num_archives'} += $self->{'num_archives'};
1350
1351	}
1352
1353	sub associate_cover_image {
1354	my $self = shift(@_);
1355	my ($doc_obj, $filename) = @_;
1356
1357	my $top_section=$doc_obj->get_top_section();
1358
1359	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1360	if (-e $filename) {
1361	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1362	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1363	} else {
1364	$filename =~ s/jpg$/JPG/;
1365	if (-e $filename) {
1366	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1367	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1368	}
1369	}
1370	}
1371
1372	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: