Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 10155

Last change on this file since 10155 was 10155, checked in by davidb, 19 years ago
deinit subroutine added that balances out init routine. 'init' called only once when pipeline is set up. deinit now called when every pass using the pipeline has finished. Note this is different to the 'begin' and 'end' subroutines that can potentially be called just before a new round of file processing with the pipeline (note: presently only buildcol.pl makes multiple passes using the pipeline, importing only uses one pass).
Property svn:keywords set to `Author Date Id Revision`
File size: 39.3 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	BEGIN {
29	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30	}
31
32	eval {require bytes};
33
34	# suppress the annoying "subroutine redefined" warning that various
35	# plugins cause under perl 5.6
36	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38	use strict; no strict 'subs';
39
40	use File::Basename;
41
42	use Kea;
43	use parsargv;
44	use multiread;
45	use encodings;
46	use cnseg;
47	use acronym;
48	use textcat;
49	use doc;
50	eval "require diagnostics"; # some perl distros (eg mac) don't have this
51	use DateExtract;
52	use ghtml;
53	use gsprintf 'gsprintf';
54	use printusage;
55
56	use GISBasPlug;
57
58	@BasPlug::ISA = ( GISBasPlug );
59
60	my $unicode_list =
61	[ { 'name' => "auto",
62	'desc' => "{BasPlug.input_encoding.auto}" },
63	{ 'name' => "ascii",
64	'desc' => "{BasPlug.input_encoding.ascii}" },
65	{ 'name' => "utf8",
66	'desc' => "{BasPlug.input_encoding.utf8}" },
67	{ 'name' => "unicode",
68	'desc' => "{BasPlug.input_encoding.unicode}" } ];
69
70	my $arguments =
71	[ { 'name' => "process_exp",
72	'desc' => "{BasPlug.process_exp}",
73	'type' => "regexp",
74	'deft' => "",
75	'reqd' => "no" },
76	{ 'name' => "block_exp",
77	'desc' => "{BasPlug.block_exp}",
78	'type' => "regexp",
79	'deft' => "",
80	'reqd' => "no" },
81	{ 'name' => "smart_block",
82	'desc' => "{BasPlug.smart_block}",
83	'type' => "flag",
84	'reqd' => "no" },
85	{ 'name' => "associate_ext",
86	'desc' => "{BasPlug.associate_ext}",
87	'type' => "string",
88	'reqd' => "no" },
89	{ 'name' => "input_encoding",
90	'desc' => "{BasPlug.input_encoding}",
91	'type' => "enum",
92	'list' => $unicode_list,
93	'reqd' => "no" ,
94	'deft' => "auto" } ,
95	{ 'name' => "default_encoding",
96	'desc' => "{BasPlug.default_encoding}",
97	'type' => "enum",
98	'list' => $unicode_list,
99	'reqd' => "no",
100	'deft' => "utf8" },
101	{ 'name' => "extract_language",
102	'desc' => "{BasPlug.extract_language}",
103	'type' => "flag",
104	'reqd' => "no" },
105	{ 'name' => "default_language",
106	'desc' => "{BasPlug.default_language}",
107	'type' => "language",
108	'deft' => "en",
109	'reqd' => "no" },
110	{ 'name' => "extract_acronyms",
111	'desc' => "{BasPlug.extract_acronyms}",
112	'type' => "flag",
113	'reqd' => "no" },
114	{ 'name' => "markup_acronyms",
115	'desc' => "{BasPlug.markup_acronyms}",
116	'type' => "flag",
117	'reqd' => "no" },
118	{ 'name' => "extract_keyphrases",
119	'desc' => "{BasPlug.extract_keyphrases}",
120	'type' => "flag",
121	'reqd' => "no" },
122	{ 'name' => "extract_keyphrase_options",
123	'desc' => "{BasPlug.extract_keyphrase_options}",
124	'type' => "string",
125	'deft' => "",
126	'reqd' => "no" },
127	{ 'name' => "first",
128	'desc' => "{BasPlug.first}",
129	'type' => "string",
130	'reqd' => "no" },
131	{ 'name' => "extract_email",
132	'desc' => "{BasPlug.extract_email}",
133	'type' => "flag",
134	'reqd' => "no" },
135	{ 'name' => "extract_historical_years",
136	'desc' => "{BasPlug.extract_historical_years}",
137	'type' => "flag",
138	'reqd' => "no" },
139	{ 'name' => "maximum_year",
140	'desc' => "{BasPlug.maximum_year}",
141	'type' => "int",
142	'deft' => (localtime)[5]+1900,
143	'reqd' => "no"},
144	{ 'name' => "maximum_century",
145	'desc' => "{BasPlug.maximum_century}",
146	'type' => "string",
147	'deft' => "",
148	'reqd' => "no" },
149	{ 'name' => "no_bibliography",
150	'desc' => "{BasPlug.no_bibliography}",
151	'type' => "flag",
152	'reqd' => "no"},
153	{ 'name' => "no_cover_image",
154	'desc' => "{BasPlug.no_cover_image}",
155	'type' => "flag",
156	'reqd' => "no" } ];
157
158	my $gis_arguments =
159	[ { 'name' => "extract_placenames",
160	'desc' => "{GISBasPlug.extract_placenames}",
161	'type' => "flag",
162	'reqd' => "no" },
163	{ 'name' => "gazetteer",
164	'desc' => "{GISBasPlug.gazetteer}",
165	'type' => "string",
166	'reqd' => "no" },
167	{ 'name' => "place_list",
168	'desc' => "{GISBasPlug.place_list}",
169	'type' => "flag",
170	'reqd' => "no" } ];
171
172
173	my $options = { 'name' => "BasPlug",
174	'desc' => "{BasPlug.desc}",
175	'abstract' => "yes",
176	'inherits' => "no",
177	'args' => $arguments };
178
179
180
181
182	sub get_arguments
183	{
184	my $self = shift(@_);
185	my $optionlistref = $self->{'option_list'};
186	my @optionlist = @$optionlistref;
187	my $pluginoptions = pop(@$optionlistref);
188	my $pluginarguments = $pluginoptions->{'args'};
189	return $pluginarguments;
190	}
191
192
193	sub print_xml_usage
194	{
195	my $self = shift(@_);
196
197	# XML output is always in UTF-8
198	gsprintf::output_strings_in_UTF8;
199
200	PrintUsage::print_xml_header();
201	$self->print_xml();
202	}
203
204
205	sub print_xml
206	{
207	my $self = shift(@_);
208
209	my $optionlistref = $self->{'option_list'};
210	my @optionlist = @$optionlistref;
211	my $pluginoptions = pop(@$optionlistref);
212	return if (!defined($pluginoptions));
213
214	gsprintf(STDERR, "<PlugInfo>\n");
215	gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
216	my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
217	$desc =~ s/</&lt;/g; # doubly escaped
218	$desc =~ s/>/&gt;/g;
219
220	gsprintf(STDERR, " <Desc>$desc</Desc>\n");
221	gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
222	gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
223	gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} \|\| "no") . "</Explodes>\n");
224	gsprintf(STDERR, " <Arguments>\n");
225	if (defined($pluginoptions->{'args'})) {
226	&PrintUsage::print_options_xml($pluginoptions->{'args'});
227	}
228
229	# Recurse up the plugin hierarchy
230	$self->print_xml();
231
232	gsprintf(STDERR, " </Arguments>\n");
233	gsprintf(STDERR, "</PlugInfo>\n");
234	}
235
236
237	sub print_txt_usage
238	{
239	my $self = shift(@_);
240
241	# Print the usage message for a plugin (recursively)
242	my $descoffset = $self->determine_description_offset(0);
243	$self->print_plugin_usage($descoffset, 1);
244	}
245
246
247	sub determine_description_offset
248	{
249	my $self = shift(@_);
250	my $maxoffset = shift(@_);
251
252	my $optionlistref = $self->{'option_list'};
253	my @optionlist = @$optionlistref;
254	my $pluginoptions = pop(@$optionlistref);
255	return $maxoffset if (!defined($pluginoptions));
256
257	# Find the length of the longest option string of this plugin
258	my $pluginargs = $pluginoptions->{'args'};
259	if (defined($pluginargs)) {
260	my $longest = &PrintUsage::find_longest_option_string($pluginargs);
261	if ($longest > $maxoffset) {
262	$maxoffset = $longest;
263	}
264	}
265
266	# Recurse up the plugin hierarchy
267	$maxoffset = $self->determine_description_offset($maxoffset);
268	$self->{'option_list'} = \@optionlist;
269	return $maxoffset;
270	}
271
272
273	sub print_plugin_usage
274	{
275	my $self = shift(@_);
276	my $descoffset = shift(@_);
277	my $isleafclass = shift(@_);
278
279	my $optionlistref = $self->{'option_list'};
280	my @optionlist = @$optionlistref;
281	my $pluginoptions = pop(@$optionlistref);
282	return if (!defined($pluginoptions));
283
284	my $pluginname = $pluginoptions->{'name'};
285	my $pluginargs = $pluginoptions->{'args'};
286	my $plugindesc = $pluginoptions->{'desc'};
287
288	# Produce the usage information using the data structure above
289	if ($isleafclass) {
290	if (defined($plugindesc)) {
291	gsprintf(STDERR, "$plugindesc\n\n");
292	}
293	gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
294	}
295
296	# Display the plugin options, if there are some
297	if (defined($pluginargs)) {
298	# Calculate the column offset of the option descriptions
299	my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
300
301	if ($isleafclass) {
302	gsprintf(STDERR, " {common.specific_options}:\n");
303	}
304	else {
305	gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
306	}
307
308	# Display the plugin options
309	&PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
310	}
311
312	# Recurse up the plugin hierarchy
313	$self->print_plugin_usage($descoffset, 0);
314	$self->{'option_list'} = \@optionlist;
315	}
316
317
318	sub new {
319	my $class = shift (@_);
320	my $plugin_name = shift (@_);
321	my $self = {};
322	$self->{'plugin_type'} = "BasPlug";
323
324	if (GISBasPlug::has_mapdata()) {
325	push(@$arguments,@$gis_arguments);
326	}
327
328	my $enc = "^(";
329	map {$enc .= "$_\|";} keys %$encodings::encodings;
330	my $denc = $enc . "ascii\|utf8\|unicode)\$";
331	$enc .= "ascii\|utf8\|unicode\|auto)\$";
332
333	$self->{'outhandle'} = STDERR;
334	my $year = (localtime)[5]+1900;
335
336	$self->{'textcat'} = new textcat();
337
338	$self->{'num_processed'} = 0;
339	$self->{'num_not_processed'} = 0;
340	$self->{'num_blocked'} = 0;
341	$self->{'num_archives'} = 0;
342	$self->{'cover_image'} = 1; # cover image is on by default
343
344	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
345	$self->{'option_list'} = [ $options ];
346
347	my $no_cover_image = 0;
348	# general options available to all plugins
349	if (!parsargv::parse(\@_,
350	q^process_exp/.*/^, \$self->{'process_exp'},
351	q^block_exp/.*/^, \$self->{'block_exp'},
352	q^associate_ext/.*/^, \$self->{'associate_ext'},
353	q^extract_language^, \$self->{'extract_language'},
354	q^extract_acronyms^, \$self->{'extract_acronyms'},
355	q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
356	q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
357	qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
358	qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
359	q^extract_email^, \$self->{'extract_email'},
360	q^extract_placenames^, \$self->{'extract_placenames'},
361	q^gazetteer/.*/^, \$self->{'gazetteer'},
362	q^place_list^, \$self->{'place_list'},
363	q^markup_acronyms^, \$self->{'markup_acronyms'},
364	q^default_language/.{2}/en^, \$self->{'default_language'},
365	q^first/.*/^, \$self->{'first'},
366	q^extract_historical_years^, \$self->{'date_extract'},
367	qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
368	q^no_bibliography^, \$self->{'no_biblio'},
369	qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
370	q^no_cover_image^, \$no_cover_image,
371	q^separate_cjk^, \$self->{'separate_cjk'},
372	q^smart_block^, \$self->{'smart_block'},
373	q^smart_block_BN^, \$self->{'smart_block_BN'},
374	"allow_extra_options")) {
375
376	gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
377	bless $self, $class;
378	$self->print_txt_usage(""); # Use default resource bundle
379	die "\n";
380	}
381
382	my $associate_ext = $self->{'associate_ext'};
383	if ((defined $associate_ext) && ($associate_ext ne "")) {
384	my @exts = split(/,/,$associate_ext);
385
386	my %associate_ext_lookup = ();
387	foreach my $e (@exts) {
388	$associate_ext_lookup{$e} = 1;
389	}
390
391	$self->{'associate_ext_lookup'} = \%associate_ext_lookup;
392	}
393
394	$self->{'shared_fileroot'} = {};
395	$self->{'file_blocks'} = {};
396
397	$self->{'cover_image'} = 0 if ($no_cover_image);
398
399	if ($self->{'extract_placenames'}) {
400
401	my $outhandle = $self->{'outhandle'};
402
403	my $places_ref
404	= GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
405
406	if (!defined $places_ref) {
407	print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
408	print $outhandle " No placename extraction will take place.\n";
409	$self->{'extract_placenames'} = undef;
410	}
411	else {
412	$self->{'places'} = $places_ref;
413	}
414	}
415	return bless $self, $class;
416	}
417
418	# initialize BasPlug options
419	# if init() is overridden in a sub-class, remember to call BasPlug::init()
420	sub init {
421	my $self = shift (@_);
422	my ($verbosity, $outhandle, $failhandle) = @_;
423
424	# verbosity is passed through from the processor
425	$self->{'verbosity'} = $verbosity;
426
427	# as are the outhandle and failhandle
428	$self->{'outhandle'} = $outhandle if defined $outhandle;
429	$self->{'failhandle'} = $failhandle;
430
431	# set process_exp and block_exp to defaults unless they were
432	# explicitly set
433
434	if ((!$self->is_recursive()) and
435	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
436
437	$self->{'process_exp'} = $self->get_default_process_exp ();
438	if ($self->{'process_exp'} eq "") {
439	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
440	}
441	}
442
443	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
444	$self->{'block_exp'} = $self->get_default_block_exp ();
445	}
446	}
447
448	sub begin {
449	my $self = shift (@_);
450	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
451	$self->initialise_extractors();
452	}
453
454	sub end {
455	# potentially called at the end of each plugin pass
456	# import.pl only has one plugin pass, but buildcol.pl has multiple ones
457
458	my ($self) = @_;
459	$self->finalise_extractors();
460	}
461
462	sub deinit {
463	# called only once, after all plugin passes have been done
464
465	my ($self) = @_;
466	}
467
468	# this function should be overridden to return 1
469	# in recursive plugins
470	sub is_recursive {
471	my $self = shift (@_);
472
473	return 0;
474	}
475
476	sub get_default_block_exp {
477	my $self = shift (@_);
478
479	return "";
480	}
481
482	sub get_default_process_exp {
483	my $self = shift (@_);
484
485	return "";
486	}
487
488	# default implementation is to do nothing.
489	sub store_block_files
490	{
491	my $self =shift (@_);
492	my ($filename) = @_;
493	return;
494	}
495
496	#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
497	sub block_cover_image
498	{
499	my $self =shift (@_);
500	my ($filename) = @_;
501	if ($self->{'cover_image'}) {
502	my $coverfile = $filename;
503	$coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
504	if (!-e $coverfile) {
505	$coverfile =~ s/jpg$/JPG/;
506	}
507	if (-e $coverfile) {
508	$self->{'file_blocks'}->{$coverfile} = 1;
509	}
510	}
511
512	return;
513	}
514
515	sub metadata_read {
516	my $self = shift (@_);
517	my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
518	# Keep track of filenames with same root but different extensions
519	# Used to support -associate_ext
520
521	my $associate_ext = $self->{'associate_ext'};
522	if ((defined $associate_ext) && ($associate_ext ne "")) {
523
524	my ($file_prefix,$file_ext) = ($file =~ m/^(.)\.(.?)$/);
525	if ((defined $file_prefix) && (defined $file_ext)) {
526
527	my $shared_fileroot = $self->{'shared_fileroot'};
528	if (!defined $shared_fileroot->{$file_prefix}) {
529	my $file_prefix_rec = { 'tie_to' => undef, 'exts' => {} };
530	$shared_fileroot->{$file_prefix} = $file_prefix_rec;
531	}
532
533	my $file_prefix_rec = $shared_fileroot->{$file_prefix};
534
535	my $process_exp = $self->{'process_exp'};
536
537	if ($file =~ m/$self->{'process_exp'}/) {
538	# This is the document the others should be tied to
539	$file_prefix_rec->{'tie_to'} = $file_ext;
540	}
541	else {
542	if (defined $self->{'associate_ext_lookup'}->{$file_ext}) {
543	$file_prefix_rec->{'exts'}->{$file_ext} = 1;
544	}
545	}
546	}
547	}
548
549	# now check whether we are actually processing this
550	my $filename = $file;
551	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
552	if ($self->{'process_exp'} eq "" \|\| $filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
553	return undef; # can't recognise
554	}
555
556	# do smart blocking if appropriate
557	if (defined $self->{'smart_block'}) {
558	$self->block_cover_image($filename);
559	$self->store_block_files($filename);
560	}
561
562	return 1;
563	}
564
565	sub tie_to_filename
566	{
567	my $self = shift (@_);
568
569	my ($file_ext,$file_prefix_rec) = @_;
570
571	if (defined $file_prefix_rec) {
572	my $tie_to = $file_prefix_rec->{'tie_to'};
573
574	if (defined $tie_to) {
575	if ($tie_to eq $file_ext) {
576	return 1;
577	}
578	}
579	}
580
581	return 0;
582	}
583
584	sub tie_to_assoc_file
585	{
586	my $self = shift (@_);
587	my ($file_ext,$file_prefix_rec) = @_;
588
589	if (defined $file_prefix_rec) {
590	my $tie_to = $file_prefix_rec->{'tie_to'};
591	if (defined $tie_to) {
592
593	my $exts = $file_prefix_rec->{'exts'};
594
595	my $has_file_ext = $exts->{$file_ext};
596
597	if ($has_file_ext) {
598	return 1;
599	}
600	}
601	}
602
603	return 0;
604	}
605
606
607	sub associate_with
608	{
609	my $self = shift (@_);
610	my ($file, $filename, $metadata) = @_;
611
612	my $associate_ext = $self->{'associate_ext'};
613
614
615	return 0 if (!$associate_ext);
616
617	# If file, see if matches with "tie_to" doc or is one of the
618	# associated filename extensions.
619
620	my ($file_prefix,$file_ext) = ($file =~ m/^(.)\.(.?)$/);
621	if ((defined $file_prefix) && (defined $file_ext)) {
622
623	my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
624
625	if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
626
627	# Set up gsdlassocfile_tobe
628
629	my $exts = $file_prefix_rec->{'exts'};
630
631	if (!defined $metadata->{'gsdlassocfile_tobe'}) {
632	$metadata->{'gsdlassocfile_tobe'} = [];
633	}
634
635	my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
636
637	my ($full_prefix) = ($filename =~ m/^(.)\..?$/);
638	foreach my $e (keys %$exts) {
639	my $assoc_file = "$full_prefix.$e";
640	my $mime_type = ""; # let system auto detect this
641	push(@$assoc_tobe,"$assoc_file:$mime_type:");
642	}
643	}
644	elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
645	# a form of smart block
646
647	return 1;
648	}
649	}
650
651	return 0;
652	}
653
654
655	# The BasPlug read() function. This function does all the right things
656	# to make general options work for a given plugin. It calls the process()
657	# function which does all the work specific to a plugin (like the old
658	# read functions used to do). Most plugins should define their own
659	# process() function and let this read() function keep control.
660	#
661	# recursive plugins (e.g. RecPlug) and specialized plugins like those
662	# capable of processing many documents within a single file (e.g.
663	# GMLPlug) should normally implement their own version of read()
664	#
665	# Return number of files processed, undef if can't recognise, -1 if can't
666	# process
667	# Note that $base_dir might be "" and that $file might
668	# include directories
669
670	sub read {
671	my $self = shift (@_);
672
673	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
674
675	if ($self->is_recursive()) {
676	gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
677	}
678
679	my $outhandle = $self->{'outhandle'};
680	my $smart_block = $self->{'smart_block'};
681	my $smart_block_BN = $self->{'smart_block_BN'};
682
683	my $filename = $file;
684	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
685
686	if ($self->associate_with($file,$filename,$metadata)) {
687	# a form of smart block
688	$self->{'num_blocked'} ++;
689	return 0; # blocked
690	}
691
692	if ($smart_block \|\| $smart_block_BN) {
693
694	if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
695	$self->{'num_blocked'} ++;
696	return 0; # blocked
697	}
698	} elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
699	$self->{'num_blocked'} ++;
700	return 0; # blocked
701	}
702
703	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
704	return undef; # can't recognise
705	}
706	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
707
708	# Do encoding stuff
709	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
710
711	# create a new document
712	my $doc_obj = new doc ($filename, "indexed_doc");
713	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
714	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
715	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
716	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
717	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
718
719	my ($filemeta) = $file =~ /([^\\\/]+)$/;
720	# how do we know what encoding the filename is in?
721	$doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
722	if ($self->{'cover_image'}) {
723	$self->associate_cover_image($doc_obj, $filename);
724	}
725
726	# read in file ($text will be in utf8)
727	my $text = "";
728	$self->read_file ($filename, $encoding, $language, \$text);
729
730	if (!length ($text)) {
731	my $plugin_name = ref ($self);
732	if ($gli) {
733	print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
734	}
735	gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
736
737	my $failhandle = $self->{'failhandle'};
738	gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
739	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
740	$self->{'num_not_processed'} ++;
741
742	return 0; # what should we return here?? error but don't want to pass it on
743	}
744
745	# include any metadata passed in from previous plugins
746	# note that this metadata is associated with the top level section
747
748	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
749
750	# do plugin specific processing of doc_obj
751	unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
752	$text = '';
753	undef $text;
754	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
755	return -1;
756	}
757	$text='';
758	undef $text;
759
760	# do any automatic metadata extraction
761	$self->auto_extract_metadata ($doc_obj);
762
763	# add an OID
764	# see if there is a plugin-specific set_OID function...
765	if (defined ($self->can('set_OID'))) {
766	# it will need $doc_obj to set the Identifier metadata...
767	$self->set_OID($doc_obj);
768	} else {
769	# use the default set_OID() in doc.pm
770	$doc_obj->set_OID();
771	}
772
773	# process the document
774	$processor->process($doc_obj);
775
776	if(defined($self->{'places_filename'})){
777	&util::rm($self->{'places_filename'});
778	$self->{'places_filename'} = undef;
779	}
780
781	$self->{'num_processed'} ++;
782	undef $doc_obj;
783	return 1; # processed the file
784	}
785
786	# returns undef if file is rejected by the plugin
787	sub process {
788	my $self = shift (@_);
789	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
790
791	gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
792	# die "Basplug::process function must be implemented in sub-class\n";
793
794	return undef; # never gets here
795	}
796
797	# uses the multiread package to read in the entire file pointed to
798	# by filename and loads the resulting text into $$textref. Input text
799	# may be in any of the encodings handled by multiread, output text
800	# will be in utf8
801	sub read_file {
802	my $self = shift (@_);
803	my ($filename, $encoding, $language, $textref) = @_;
804
805	if (!-r $filename)
806	{
807	my $outhandle = $self->{'outhandle'};
808	gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
809	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
810	return;
811	}
812
813	$$textref = "";
814
815	if (!open (FILE, $filename)) {
816	gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
817	die "\n";
818	}
819
820	if ($encoding eq "ascii") {
821	undef $/;
822	$$textref = <FILE>;
823	$/ = "\n";
824	} else {
825	my $reader = new multiread();
826	$reader->set_handle ('BasPlug::FILE');
827	$reader->set_encoding ($encoding);
828	$reader->read_file ($textref);
829
830	#Now segments chinese if the separate_cjk option is set
831	if ($self->{'separate_cjk'}) {
832	# segment the Chinese words
833	$$textref = &cnseg::segment($$textref);
834	}
835	}
836
837	close FILE;
838	}
839
840	sub filename_based_title
841	{
842	my $self = shift (@_);
843	my ($file) = @_;
844
845	my $file_derived_title = $file;
846	$file_derived_title =~ s/_/ /g;
847	$file_derived_title =~ s/\..*?$//;
848
849	return $file_derived_title;
850	}
851
852
853	sub title_fallback
854	{
855	my $self = shift (@_);
856	my ($doc_obj,$section,$file) = @_;
857
858	if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
859
860	my $file_derived_title = $self->filename_based_title($file);
861	$doc_obj->add_metadata ($section, "Title", $file_derived_title);
862	}
863	}
864
865	sub textcat_get_language_encoding {
866	my $self = shift (@_);
867	my ($filename) = @_;
868
869	my ($language, $encoding, $extracted_encoding);
870	if ($self->{'input_encoding'} eq "auto") {
871	# use textcat to automatically work out the input encoding and language
872	($language, $encoding) = $self->get_language_encoding ($filename);
873	} elsif ($self->{'extract_language'}) {
874	# use textcat to get language metadata
875	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
876	$encoding = $self->{'input_encoding'};
877	# don't print this message for english... english in utf8 is identical
878	# to english in iso-8859-1 (except for some punctuation). We don't have
879	# a language model for en_utf8, so textcat always says iso-8859-1!
880	if ($extracted_encoding ne $encoding && $language ne "en"
881	&& $self->{'verbosity'}) {
882	my $plugin_name = ref ($self);
883	my $outhandle = $self->{'outhandle'};
884	gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
885	# print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
886	# print $outhandle "appears to be encoded as $extracted_encoding.\n";
887	}
888	} else {
889	$language = $self->{'default_language'};
890	$encoding = $self->{'input_encoding'};
891	}
892	return ($language, $encoding);
893	}
894
895	# Uses textcat to work out the encoding and language of the text in
896	# $filename. All html tags are removed before processing.
897	# returns an array containing "language" and "encoding"
898	sub get_language_encoding {
899	my $self = shift (@_);
900	my ($filename) = @_;
901	my $outhandle = $self->{'outhandle'};
902	my $unicode_format = "";
903	# read in file
904	open (FILE, $filename) \|\| (gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
905	undef $/;
906	my $text = <FILE>;
907	$/ = "\n";
908	close FILE;
909
910	# check if first few bytes have a Byte Order Marker
911	my $bom=substr($text,0,2); # check 16bit unicode
912	if ($bom eq "\xff\xfe") { # little endian 16bit unicode
913	$unicode_format="unicode";
914	} elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
915	$unicode_format="unicode";
916	} else {
917	$bom=substr($text,0,3); # check utf-8
918	if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
919	$unicode_format="utf8";
920	# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
921	# $unicode_format="utf8";
922	}
923	}
924
925
926	# remove <title>stuff</title> -- as titles tend often to be in English
927	# for foreign language documents
928	$text =~ s/<title>(.\|\n)*?<\/title>//i;
929
930	# remove all HTML tags
931	# XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
932	if (ref($self) eq 'HTMLPlug' \|\|
933	(exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
934	$text =~ s/<[^>]*>//sg;
935	}
936
937	# get the language/encoding
938	my $results = $self->{'textcat'}->classify(\$text);
939
940	# if textcat returns 3 or less possibilities we'll use the
941	# first one in the list - otherwise use the defaults
942	if (scalar @$results > 3) {
943	my $best_encoding="";
944	if ($unicode_format) { # in case the first had a BOM
945	$best_encoding=$unicode_format;
946	} else {
947	my %guessed_encodings = ();
948	foreach my $result (@$results) {
949	$result =~ /([^\-]+)$/;
950	my $enc=$1;
951	if (!defined($guessed_encodings{$enc})) {
952	$guessed_encodings{$enc}=0;
953	}
954	$guessed_encodings{$enc}++;
955	}
956
957	$guessed_encodings{""}=-1; # for default best_encoding of ""
958	foreach my $enc (keys %guessed_encodings) {
959	if ($guessed_encodings{$enc} >
960	$guessed_encodings{$best_encoding}){
961	$best_encoding=$enc;
962	}
963	}
964	}
965
966	if ($self->{'input_encoding'} ne 'auto') {
967	if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
968	gsprintf($outhandle,
969	"BasPlug: {BasPlug.could_not_extract_language}\n",
970	$filename, $self->{'default_language'});
971	}
972	return ($self->{'default_language'}, $self->{'input_encoding'});
973
974	} else {
975	if ($self->{'verbosity'}>2) {
976	gsprintf($outhandle,
977	"BasPlug: {BasPlug.could_not_extract_language}\n",
978	$filename, $self->{'default_language'});
979	}
980	return ($self->{'default_language'}, $best_encoding);
981	}
982	}
983
984	# format language/encoding
985	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
986	if (!defined $language) {
987	if ($self->{'verbosity'}>2) {
988	gsprintf($outhandle,
989	"BasPlug: {BasPlug.could_not_extract_language}\n",
990	$filename, $self->{'default_language'});
991	}
992	$language = $self->{'default_language'};
993	}
994	if (!defined $encoding) {
995	if ($self->{'verbosity'}>2) {
996	gsprintf($outhandle,
997	"BasPlug: {BasPlug.could_not_extract_encoding}\n",
998	$filename, $self->{'default_encoding'});
999	}
1000	$encoding = $self->{'default_encoding'};
1001	}
1002
1003
1004	# check for equivalents where textcat doesn't have some encodings...
1005	# eg MS versions of standard encodings
1006	if ($encoding =~ /^iso_8859_(\d+)/) {
1007	my $iso = $1; # which variant of the iso standard?
1008	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
1009	if ($text =~ /[\x80-\x9f]/) {
1010	# Western Europe
1011	if ($iso == 1 or $iso == 15) { $encoding = 'windows_1252' }
1012	elsif ($iso == 2) { $encoding = 'windows_1250' } # Central Europe
1013	elsif ($iso == 5) { $encoding = 'windows_1251' } # Cyrillic
1014	elsif ($iso == 6) { $encoding = 'windows_1256' } # Arabic
1015	elsif ($iso == 7) { $encoding = 'windows_1253' } # Greek
1016	elsif ($iso == 8) { $encoding = 'windows_1255' } # Hebrew
1017	elsif ($iso == 9) { $encoding = 'windows_1254' } # Turkish
1018	}
1019	}
1020
1021	if ($encoding !~ /^(ascii\|utf8\|unicode)$/ &&
1022	!defined $encodings::encodings->{$encoding}) {
1023	if ($self->{'verbosity'}) {
1024	gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
1025	$filename, $encoding, $self->{'default_encoding'});
1026	}
1027	$encoding = $self->{'default_encoding'};
1028	}
1029
1030	return ($language, $encoding);
1031	}
1032
1033	# add any extra metadata that's been passed around from one
1034	# plugin to another.
1035	# extra_metadata uses add_utf8_metadata so it expects metadata values
1036	# to already be in utf8
1037	sub extra_metadata {
1038	my $self = shift (@_);
1039	my ($doc_obj, $cursection, $metadata) = @_;
1040
1041	foreach my $field (keys(%$metadata)) {
1042	# $metadata->{$field} may be an array reference
1043	if ($field eq "gsdlassocfile_tobe") {
1044	# 'gsdlassocfile_tobe' is artificially introduced metadata
1045	# that is used to signal that certain additional files should
1046	# be tied to this document. Useful in situations where a
1047	# metadata pass in the plugin pipeline works out some files
1048	# need to be associated with a document, but the document hasn't
1049	# been formed yet.
1050
1051	my $equiv_form = "";
1052	foreach my $gaf (@{$metadata->{$field}}) {
1053	my ($full_filename,$mimetype) = ($gaf =~ m/^(.):(.):$/);
1054	my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1055	my $filename = $full_filename;
1056
1057	$doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1058
1059	my ($doc_ext) = ($tail_filename =~ m/^.\.(.)$/);
1060	my $start_doclink = "<a href=\"_httpcollection_/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
1061	my $srcicon = "_icon".$doc_ext."_";
1062	my $end_doclink = "</a>";
1063
1064	$equiv_form .= " $start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1065	}
1066	$doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1067	}
1068	elsif (ref ($metadata->{$field}) eq "ARRAY") {
1069	map {
1070	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
1071	} @{$metadata->{$field}};
1072	} else {
1073	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1074	}
1075	}
1076	}
1077
1078	# initialise metadata extractors
1079	sub initialise_extractors {
1080	my $self = shift (@_);
1081
1082	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
1083	&acronym::initialise_acronyms();
1084	}
1085	}
1086
1087	# finalise metadata extractors
1088	sub finalise_extractors {
1089	my $self = shift (@_);
1090
1091	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
1092	&acronym::finalise_acronyms();
1093	}
1094	}
1095
1096	# FIRSTNNN: extract the first NNN characters as metadata
1097	sub extract_first_NNNN_characters {
1098	my $self = shift (@_);
1099	my ($textref, $doc_obj, $thissection) = @_;
1100
1101	foreach my $size (split /,/, $self->{'first'}) {
1102	my $tmptext = $$textref;
1103	$tmptext =~ s/^\s+//;
1104	$tmptext =~ s/\s+$//;
1105	$tmptext =~ s/\s+/ /gs;
1106	$tmptext = substr ($tmptext, 0, $size);
1107	$tmptext =~ s/\s\S*$/…/;
1108	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1109	}
1110	}
1111
1112	sub extract_email {
1113	my $self = shift (@_);
1114	my ($textref, $doc_obj, $thissection) = @_;
1115	my $outhandle = $self->{'outhandle'};
1116
1117	gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
1118	if ($self->{'verbosity'} > 2);
1119
1120	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
1121	@email = sort @email;
1122
1123	my @email2 = ();
1124	foreach my $address (@email) {
1125	if (!(join(" ",@email2) =~ m/$address/ )) {
1126	push @email2, $address;
1127	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1128	gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1129	if ($self->{'verbosity'} > 3);
1130	}
1131	}
1132	gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
1133	if ($self->{'verbosity'} > 2);
1134	}
1135
1136	# extract metadata
1137	sub auto_extract_metadata {
1138
1139	my $self = shift (@_);
1140	my ($doc_obj) = @_;
1141
1142	if ($self->{'extract_email'}) {
1143	my $thissection = $doc_obj->get_top_section();
1144	while (defined $thissection) {
1145	my $text = $doc_obj->get_text($thissection);
1146	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1147	$thissection = $doc_obj->get_next_section ($thissection);
1148	}
1149	}
1150	if ($self->{'extract_placenames'}) {
1151	my $thissection = $doc_obj->get_top_section();
1152	while (defined $thissection) {
1153	my $text = $doc_obj->get_text($thissection);
1154	$self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1155	$thissection = $doc_obj->get_next_section ($thissection);
1156	}
1157	}
1158
1159	# adding kea keyphrases
1160	if ($self->{'kea'}) {
1161
1162	my $thissection = $doc_obj->get_top_section();
1163	my $text = "";
1164	my $list;
1165
1166	#loop through sections to gather whole doc
1167	while (defined $thissection) {
1168	my $sectiontext = $doc_obj->get_text($thissection);
1169	$text = $text.$sectiontext;
1170	$thissection = $doc_obj->get_next_section ($thissection);
1171	}
1172
1173	if ($self->{'kea_options'}) {
1174	#if kea options flag is set, call Kea with specified options
1175	$list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
1176	} else {
1177	#otherwise call Kea with no options
1178	$list = &Kea::extract_KeyPhrases ($text);
1179	}
1180	if ($list){
1181	# if a list of kea keyphrases was returned (ie not empty)
1182	if ($self->{'verbosity'}) {
1183	gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1184	}
1185
1186	#add metadata to top section
1187	$thissection = $doc_obj->get_top_section();
1188
1189	# add all key phrases as one metadata
1190	$doc_obj->add_metadata($thissection, "Keyphrases", $list);
1191
1192	# add individual key phrases as multiple metadata
1193	foreach my $keyphrase (split(',', $list)) {
1194	$keyphrase =~ s/^\s+\|\s+$//g;
1195	$doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1196	}
1197	}
1198	} #end of kea
1199
1200	if ($self->{'first'}) {
1201	my $thissection = $doc_obj->get_top_section();
1202	while (defined $thissection) {
1203	my $text = $doc_obj->get_text($thissection);
1204	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1205	$thissection = $doc_obj->get_next_section ($thissection);
1206	}
1207	}
1208
1209	if ($self->{'extract_acronyms'}) {
1210	my $thissection = $doc_obj->get_top_section();
1211	while (defined $thissection) {
1212	my $text = $doc_obj->get_text($thissection);
1213	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1214	$thissection = $doc_obj->get_next_section ($thissection);
1215	}
1216	}
1217
1218	if ($self->{'markup_acronyms'}) {
1219	my $thissection = $doc_obj->get_top_section();
1220	while (defined $thissection) {
1221	my $text = $doc_obj->get_text($thissection);
1222	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1223	$doc_obj->delete_text($thissection);
1224	$doc_obj->add_text($thissection, $text);
1225	$thissection = $doc_obj->get_next_section ($thissection);
1226	}
1227	}
1228
1229	if($self->{'date_extract'}) {
1230	my $thissection = $doc_obj->get_top_section();
1231	while (defined $thissection) {
1232
1233	my $text = $doc_obj->get_text($thissection);
1234	&DateExtract::get_date_metadata($text, $doc_obj,
1235	$thissection,
1236	$self->{'no_biblio'},
1237	$self->{'max_year'},
1238	$self->{'max_century'});
1239	$thissection = $doc_obj->get_next_section ($thissection);
1240	}
1241	}
1242	}
1243
1244	# extract acronyms from a section in a document. progress is
1245	# reported to outhandle based on the verbosity. both the Acronym
1246	# and the AcronymKWIC metadata items are created.
1247
1248	sub extract_acronyms {
1249	my $self = shift (@_);
1250	my ($textref, $doc_obj, $thissection) = @_;
1251	my $outhandle = $self->{'outhandle'};
1252
1253	# print $outhandle " extracting acronyms ...\n"
1254	gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
1255	if ($self->{'verbosity'} > 2);
1256
1257	my $acro_array = &acronym::acronyms($textref);
1258
1259	foreach my $acro (@$acro_array) {
1260
1261	#check that this is the first time ...
1262	my $seen_before = "false";
1263	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1264	foreach my $thisAcro (@$previous_data) {
1265	if ($thisAcro eq $acro->to_string()) {
1266	$seen_before = "true";
1267	if ($self->{'verbosity'} >= 4) {
1268	gsprintf($outhandle, " {BasPlug.already_seen} " .
1269	$acro->to_string() . "\n");
1270	}
1271	}
1272	}
1273
1274	if ($seen_before eq "false") {
1275	#write it to the file ...
1276	$acro->write_to_file();
1277
1278	#do the normal acronym
1279	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
1280	gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
1281	if ($self->{'verbosity'} > 3);
1282	}
1283	}
1284
1285	gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
1286	if ($self->{'verbosity'} > 2);
1287	}
1288
1289	sub markup_acronyms {
1290	my $self = shift (@_);
1291	my ($text, $doc_obj, $thissection) = @_;
1292	my $outhandle = $self->{'outhandle'};
1293
1294	gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
1295	if ($self->{'verbosity'} > 2);
1296
1297	#self is passed in to check for verbosity ...
1298	$text = &acronym::markup_acronyms($text, $self);
1299
1300	gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
1301	if ($self->{'verbosity'} > 2);
1302
1303	return $text;
1304	}
1305
1306	sub compile_stats {
1307	my $self = shift(@_);
1308	my ($stats) = @_;
1309
1310	$stats->{'num_processed'} += $self->{'num_processed'};
1311	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
1312	$stats->{'num_archives'} += $self->{'num_archives'};
1313
1314	}
1315
1316	sub associate_cover_image {
1317	my $self = shift(@_);
1318	my ($doc_obj, $filename) = @_;
1319
1320	my $top_section=$doc_obj->get_top_section();
1321
1322	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1323	if (-e $filename) {
1324	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1325	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1326	} else {
1327	$filename =~ s/jpg$/JPG/;
1328	if (-e $filename) {
1329	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1330	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1331	}
1332	}
1333	}
1334
1335	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: