Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 9706

Last change on this file since 9706 was 9703, checked in by mdewsnip, 19 years ago
Improvement to previous change so "file not processed" messages are seen in Expert mode.
Property svn:keywords set to `Author Date Id Revision`
File size: 39.0 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	BEGIN {
29	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30	}
31
32	eval {require bytes};
33
34	# suppress the annoying "subroutine redefined" warning that various
35	# plugins cause under perl 5.6
36	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38	use strict; no strict 'subs';
39
40	use File::Basename;
41
42	use Kea;
43	use parsargv;
44	use multiread;
45	use encodings;
46	use cnseg;
47	use acronym;
48	use textcat;
49	use doc;
50	eval "require diagnostics"; # some perl distros (eg mac) don't have this
51	use DateExtract;
52	use ghtml;
53	use gsprintf 'gsprintf';
54	use printusage;
55
56	use GISBasPlug;
57
58	@BasPlug::ISA = ( GISBasPlug );
59
60	my $unicode_list =
61	[ { 'name' => "auto",
62	'desc' => "{BasPlug.input_encoding.auto}" },
63	{ 'name' => "ascii",
64	'desc' => "{BasPlug.input_encoding.ascii}" },
65	{ 'name' => "utf8",
66	'desc' => "{BasPlug.input_encoding.utf8}" },
67	{ 'name' => "unicode",
68	'desc' => "{BasPlug.input_encoding.unicode}" } ];
69
70	my $arguments =
71	[ { 'name' => "process_exp",
72	'desc' => "{BasPlug.process_exp}",
73	'type' => "regexp",
74	'deft' => "",
75	'reqd' => "no" },
76	{ 'name' => "block_exp",
77	'desc' => "{BasPlug.block_exp}",
78	'type' => "regexp",
79	'deft' => "",
80	'reqd' => "no" },
81	{ 'name' => "smart_block",
82	'desc' => "{BasPlug.smart_block}",
83	'type' => "flag",
84	'reqd' => "no" },
85	{ 'name' => "associate_ext",
86	'desc' => "{BasPlug.associate_ext}",
87	'type' => "string",
88	'reqd' => "no" },
89	{ 'name' => "input_encoding",
90	'desc' => "{BasPlug.input_encoding}",
91	'type' => "enum",
92	'list' => $unicode_list,
93	'reqd' => "no" ,
94	'deft' => "auto" } ,
95	{ 'name' => "default_encoding",
96	'desc' => "{BasPlug.default_encoding}",
97	'type' => "enum",
98	'list' => $unicode_list,
99	'reqd' => "no",
100	'deft' => "utf8" },
101	{ 'name' => "extract_language",
102	'desc' => "{BasPlug.extract_language}",
103	'type' => "flag",
104	'reqd' => "no" },
105	{ 'name' => "default_language",
106	'desc' => "{BasPlug.default_language}",
107	'type' => "language",
108	'deft' => "en",
109	'reqd' => "no" },
110	{ 'name' => "extract_acronyms",
111	'desc' => "{BasPlug.extract_acronyms}",
112	'type' => "flag",
113	'reqd' => "no" },
114	{ 'name' => "markup_acronyms",
115	'desc' => "{BasPlug.markup_acronyms}",
116	'type' => "flag",
117	'reqd' => "no" },
118	{ 'name' => "extract_keyphrases",
119	'desc' => "{BasPlug.extract_keyphrases}",
120	'type' => "flag",
121	'reqd' => "no" },
122	{ 'name' => "extract_keyphrase_options",
123	'desc' => "{BasPlug.extract_keyphrase_options}",
124	'type' => "string",
125	'deft' => "",
126	'reqd' => "no" },
127	{ 'name' => "first",
128	'desc' => "{BasPlug.first}",
129	'type' => "string",
130	'reqd' => "no" },
131	{ 'name' => "extract_email",
132	'desc' => "{BasPlug.extract_email}",
133	'type' => "flag",
134	'reqd' => "no" },
135	{ 'name' => "extract_historical_years",
136	'desc' => "{BasPlug.extract_historical_years}",
137	'type' => "flag",
138	'reqd' => "no" },
139	{ 'name' => "maximum_year",
140	'desc' => "{BasPlug.maximum_year}",
141	'type' => "int",
142	'deft' => (localtime)[5]+1900,
143	'reqd' => "no"},
144	{ 'name' => "maximum_century",
145	'desc' => "{BasPlug.maximum_century}",
146	'type' => "string",
147	'deft' => "",
148	'reqd' => "no" },
149	{ 'name' => "no_bibliography",
150	'desc' => "{BasPlug.no_bibliography}",
151	'type' => "flag",
152	'reqd' => "no"},
153	{ 'name' => "no_cover_image",
154	'desc' => "{BasPlug.no_cover_image}",
155	'type' => "flag",
156	'reqd' => "no" } ];
157
158	my $gis_arguments =
159	[ { 'name' => "extract_placenames",
160	'desc' => "{GISBasPlug.extract_placenames}",
161	'type' => "flag",
162	'reqd' => "no" },
163	{ 'name' => "gazetteer",
164	'desc' => "{GISBasPlug.gazetteer}",
165	'type' => "string",
166	'reqd' => "no" },
167	{ 'name' => "place_list",
168	'desc' => "{GISBasPlug.place_list}",
169	'type' => "flag",
170	'reqd' => "no" } ];
171
172
173	my $options = { 'name' => "BasPlug",
174	'desc' => "{BasPlug.desc}",
175	'abstract' => "yes",
176	'inherits' => "no",
177	'args' => $arguments };
178
179
180
181
182	sub get_arguments
183	{
184	my $self = shift(@_);
185	my $optionlistref = $self->{'option_list'};
186	my @optionlist = @$optionlistref;
187	my $pluginoptions = pop(@$optionlistref);
188	my $pluginarguments = $pluginoptions->{'args'};
189	return $pluginarguments;
190	}
191
192
193	sub print_xml_usage
194	{
195	my $self = shift(@_);
196
197	# XML output is always in UTF-8
198	gsprintf::output_strings_in_UTF8;
199
200	PrintUsage::print_xml_header();
201	$self->print_xml();
202	}
203
204
205	sub print_xml
206	{
207	my $self = shift(@_);
208
209	my $optionlistref = $self->{'option_list'};
210	my @optionlist = @$optionlistref;
211	my $pluginoptions = pop(@$optionlistref);
212	return if (!defined($pluginoptions));
213
214	gsprintf(STDERR, "<PlugInfo>\n");
215	gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
216	my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
217	$desc =~ s/</&lt;/g; # doubly escaped
218	$desc =~ s/>/&gt;/g;
219
220	gsprintf(STDERR, " <Desc>$desc</Desc>\n");
221	gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
222	gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
223	gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} \|\| "no") . "</Explodes>\n");
224	gsprintf(STDERR, " <Arguments>\n");
225	if (defined($pluginoptions->{'args'})) {
226	&PrintUsage::print_options_xml($pluginoptions->{'args'});
227	}
228
229	# Recurse up the plugin hierarchy
230	$self->print_xml();
231
232	gsprintf(STDERR, " </Arguments>\n");
233	gsprintf(STDERR, "</PlugInfo>\n");
234	}
235
236
237	sub print_txt_usage
238	{
239	my $self = shift(@_);
240
241	# Print the usage message for a plugin (recursively)
242	my $descoffset = $self->determine_description_offset(0);
243	$self->print_plugin_usage($descoffset, 1);
244	}
245
246
247	sub determine_description_offset
248	{
249	my $self = shift(@_);
250	my $maxoffset = shift(@_);
251
252	my $optionlistref = $self->{'option_list'};
253	my @optionlist = @$optionlistref;
254	my $pluginoptions = pop(@$optionlistref);
255	return $maxoffset if (!defined($pluginoptions));
256
257	# Find the length of the longest option string of this plugin
258	my $pluginargs = $pluginoptions->{'args'};
259	if (defined($pluginargs)) {
260	my $longest = &PrintUsage::find_longest_option_string($pluginargs);
261	if ($longest > $maxoffset) {
262	$maxoffset = $longest;
263	}
264	}
265
266	# Recurse up the plugin hierarchy
267	$maxoffset = $self->determine_description_offset($maxoffset);
268	$self->{'option_list'} = \@optionlist;
269	return $maxoffset;
270	}
271
272
273	sub print_plugin_usage
274	{
275	my $self = shift(@_);
276	my $descoffset = shift(@_);
277	my $isleafclass = shift(@_);
278
279	my $optionlistref = $self->{'option_list'};
280	my @optionlist = @$optionlistref;
281	my $pluginoptions = pop(@$optionlistref);
282	return if (!defined($pluginoptions));
283
284	my $pluginname = $pluginoptions->{'name'};
285	my $pluginargs = $pluginoptions->{'args'};
286	my $plugindesc = $pluginoptions->{'desc'};
287
288	# Produce the usage information using the data structure above
289	if ($isleafclass) {
290	if (defined($plugindesc)) {
291	gsprintf(STDERR, "$plugindesc\n\n");
292	}
293	gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
294	}
295
296	# Display the plugin options, if there are some
297	if (defined($pluginargs)) {
298	# Calculate the column offset of the option descriptions
299	my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
300
301	if ($isleafclass) {
302	gsprintf(STDERR, " {common.specific_options}:\n");
303	}
304	else {
305	gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
306	}
307
308	# Display the plugin options
309	&PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
310	}
311
312	# Recurse up the plugin hierarchy
313	$self->print_plugin_usage($descoffset, 0);
314	$self->{'option_list'} = \@optionlist;
315	}
316
317
318	sub new {
319	my $class = shift (@_);
320	my $plugin_name = shift (@_);
321	my $self = {};
322	$self->{'plugin_type'} = "BasPlug";
323
324	if (GISBasPlug::has_mapdata()) {
325	push(@$arguments,@$gis_arguments);
326	}
327
328	my $enc = "^(";
329	map {$enc .= "$_\|";} keys %$encodings::encodings;
330	my $denc = $enc . "ascii\|utf8\|unicode)\$";
331	$enc .= "ascii\|utf8\|unicode\|auto)\$";
332
333	$self->{'outhandle'} = STDERR;
334	my $year = (localtime)[5]+1900;
335
336	$self->{'textcat'} = new textcat();
337
338	$self->{'num_processed'} = 0;
339	$self->{'num_not_processed'} = 0;
340	$self->{'num_blocked'} = 0;
341	$self->{'num_archives'} = 0;
342	$self->{'cover_image'} = 1; # cover image is on by default
343
344	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
345	$self->{'option_list'} = [ $options ];
346
347	my $no_cover_image = 0;
348	# general options available to all plugins
349	if (!parsargv::parse(\@_,
350	q^process_exp/.*/^, \$self->{'process_exp'},
351	q^block_exp/.*/^, \$self->{'block_exp'},
352	q^associate_ext/.*/^, \$self->{'associate_ext'},
353	q^extract_language^, \$self->{'extract_language'},
354	q^extract_acronyms^, \$self->{'extract_acronyms'},
355	q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
356	q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
357	qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
358	qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
359	q^extract_email^, \$self->{'extract_email'},
360	q^extract_placenames^, \$self->{'extract_placenames'},
361	q^gazetteer/.*/^, \$self->{'gazetteer'},
362	q^place_list^, \$self->{'place_list'},
363	q^markup_acronyms^, \$self->{'markup_acronyms'},
364	q^default_language/.{2}/en^, \$self->{'default_language'},
365	q^first/.*/^, \$self->{'first'},
366	q^extract_historical_years^, \$self->{'date_extract'},
367	qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
368	q^no_bibliography^, \$self->{'no_biblio'},
369	qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
370	q^no_cover_image^, \$no_cover_image,
371	q^separate_cjk^, \$self->{'separate_cjk'},
372	q^smart_block^, \$self->{'smart_block'},
373	q^smart_block_BN^, \$self->{'smart_block_BN'},
374	"allow_extra_options")) {
375
376	gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
377	bless $self, $class;
378	$self->print_txt_usage(""); # Use default resource bundle
379	die "\n";
380	}
381
382	my $associate_ext = $self->{'associate_ext'};
383	if ((defined $associate_ext) && ($associate_ext ne "")) {
384	my @exts = split(/,/,$associate_ext);
385
386	my %associate_ext_lookup = ();
387	foreach my $e (@exts) {
388	$associate_ext_lookup{$e} = 1;
389	}
390
391	$self->{'associate_ext_lookup'} = \%associate_ext_lookup;
392	}
393
394	$self->{'shared_fileroot'} = {};
395	$self->{'file_blocks'} = {};
396
397	$self->{'cover_image'} = 0 if ($no_cover_image);
398
399	if ($self->{'extract_placenames'}) {
400
401	my $outhandle = $self->{'outhandle'};
402
403	my $places_ref
404	= GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
405
406	if (!defined $places_ref) {
407	print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
408	print $outhandle " No placename extraction will take place.\n";
409	$self->{'extract_placenames'} = undef;
410	}
411	else {
412	$self->{'places'} = $places_ref;
413	}
414	}
415	return bless $self, $class;
416	}
417
418	# initialize BasPlug options
419	# if init() is overridden in a sub-class, remember to call BasPlug::init()
420	sub init {
421	my $self = shift (@_);
422	my ($verbosity, $outhandle, $failhandle) = @_;
423
424	# verbosity is passed through from the processor
425	$self->{'verbosity'} = $verbosity;
426
427	# as are the outhandle and failhandle
428	$self->{'outhandle'} = $outhandle if defined $outhandle;
429	$self->{'failhandle'} = $failhandle;
430
431	# set process_exp and block_exp to defaults unless they were
432	# explicitly set
433
434	if ((!$self->is_recursive()) and
435	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
436
437	$self->{'process_exp'} = $self->get_default_process_exp ();
438	if ($self->{'process_exp'} eq "") {
439	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
440	}
441	}
442
443	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
444	$self->{'block_exp'} = $self->get_default_block_exp ();
445	}
446	}
447
448	sub begin {
449	my $self = shift (@_);
450	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
451	$self->initialise_extractors();
452	}
453
454	sub end {
455	my ($self) = @_;
456	$self->finalise_extractors();
457	}
458
459	# this function should be overridden to return 1
460	# in recursive plugins
461	sub is_recursive {
462	my $self = shift (@_);
463
464	return 0;
465	}
466
467	sub get_default_block_exp {
468	my $self = shift (@_);
469
470	return "";
471	}
472
473	sub get_default_process_exp {
474	my $self = shift (@_);
475
476	return "";
477	}
478
479	# default implementation is to do nothing.
480	sub store_block_files
481	{
482	my $self =shift (@_);
483	my ($filename) = @_;
484	return;
485	}
486
487	#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
488	sub block_cover_image
489	{
490	my $self =shift (@_);
491	my ($filename) = @_;
492	if ($self->{'cover_image'}) {
493	my $coverfile = $filename;
494	$coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
495	if (!-e $coverfile) {
496	$coverfile =~ s/jpg$/JPG/;
497	}
498	if (-e $coverfile) {
499	$self->{'file_blocks'}->{$coverfile} = 1;
500	}
501	}
502
503	return;
504	}
505
506	sub metadata_read {
507	my $self = shift (@_);
508	my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
509	# Keep track of filenames with same root but different extensions
510	# Used to support -associate_ext
511
512	my $associate_ext = $self->{'associate_ext'};
513	if ((defined $associate_ext) && ($associate_ext ne "")) {
514
515	my ($file_prefix,$file_ext) = ($file =~ m/^(.)\.(.?)$/);
516	if ((defined $file_prefix) && (defined $file_ext)) {
517
518	my $shared_fileroot = $self->{'shared_fileroot'};
519	if (!defined $shared_fileroot->{$file_prefix}) {
520	my $file_prefix_rec = { 'tie_to' => undef, 'exts' => {} };
521	$shared_fileroot->{$file_prefix} = $file_prefix_rec;
522	}
523
524	my $file_prefix_rec = $shared_fileroot->{$file_prefix};
525
526	my $process_exp = $self->{'process_exp'};
527
528	if ($file =~ m/$self->{'process_exp'}/) {
529	# This is the document the others should be tied to
530	$file_prefix_rec->{'tie_to'} = $file_ext;
531	}
532	else {
533	if (defined $self->{'associate_ext_lookup'}->{$file_ext}) {
534	$file_prefix_rec->{'exts'}->{$file_ext} = 1;
535	}
536	}
537	}
538	}
539
540	# now check whether we are actually processing this
541	my $filename = $file;
542	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
543	if ($self->{'process_exp'} eq "" \|\| $filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
544	return undef; # can't recognise
545	}
546
547	# do smart blocking if appropriate
548	if (defined $self->{'smart_block'}) {
549	$self->block_cover_image($filename);
550	$self->store_block_files($filename);
551	}
552
553	return 1;
554	}
555
556	sub tie_to_filename
557	{
558	my $self = shift (@_);
559
560	my ($file_ext,$file_prefix_rec) = @_;
561
562	if (defined $file_prefix_rec) {
563	my $tie_to = $file_prefix_rec->{'tie_to'};
564
565	if (defined $tie_to) {
566	if ($tie_to eq $file_ext) {
567	return 1;
568	}
569	}
570	}
571
572	return 0;
573	}
574
575	sub tie_to_assoc_file
576	{
577	my $self = shift (@_);
578	my ($file_ext,$file_prefix_rec) = @_;
579
580	if (defined $file_prefix_rec) {
581	my $tie_to = $file_prefix_rec->{'tie_to'};
582	if (defined $tie_to) {
583
584	my $exts = $file_prefix_rec->{'exts'};
585
586	my $has_file_ext = $exts->{$file_ext};
587
588	if ($has_file_ext) {
589	return 1;
590	}
591	}
592	}
593
594	return 0;
595	}
596
597
598	sub associate_with
599	{
600	my $self = shift (@_);
601	my ($file, $filename, $metadata) = @_;
602
603	my $associate_ext = $self->{'associate_ext'};
604
605
606	return 0 if (!$associate_ext);
607
608	# If file, see if matches with "tie_to" doc or is one of the
609	# associated filename extensions.
610
611	my ($file_prefix,$file_ext) = ($file =~ m/^(.)\.(.?)$/);
612	if ((defined $file_prefix) && (defined $file_ext)) {
613
614	my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
615
616	if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
617
618	# Set up gsdlassocfile_tobe
619
620	my $exts = $file_prefix_rec->{'exts'};
621
622	if (!defined $metadata->{'gsdlassocfile_tobe'}) {
623	$metadata->{'gsdlassocfile_tobe'} = [];
624	}
625
626	my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
627
628	my ($full_prefix) = ($filename =~ m/^(.)\..?$/);
629	foreach my $e (keys %$exts) {
630	my $assoc_file = "$full_prefix.$e";
631	my $mime_type = ""; # let system auto detect this
632	push(@$assoc_tobe,"$assoc_file:$mime_type:");
633	}
634	}
635	elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
636	# a form of smart block
637
638	return 1;
639	}
640	}
641
642	return 0;
643	}
644
645
646	# The BasPlug read() function. This function does all the right things
647	# to make general options work for a given plugin. It calls the process()
648	# function which does all the work specific to a plugin (like the old
649	# read functions used to do). Most plugins should define their own
650	# process() function and let this read() function keep control.
651	#
652	# recursive plugins (e.g. RecPlug) and specialized plugins like those
653	# capable of processing many documents within a single file (e.g.
654	# GMLPlug) should normally implement their own version of read()
655	#
656	# Return number of files processed, undef if can't recognise, -1 if can't
657	# process
658	# Note that $base_dir might be "" and that $file might
659	# include directories
660
661	sub read {
662	my $self = shift (@_);
663
664	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
665
666	if ($self->is_recursive()) {
667	gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
668	}
669
670	my $outhandle = $self->{'outhandle'};
671	my $smart_block = $self->{'smart_block'};
672	my $smart_block_BN = $self->{'smart_block_BN'};
673
674	my $filename = $file;
675	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
676
677	if ($self->associate_with($file,$filename,$metadata)) {
678	# a form of smart block
679	$self->{'num_blocked'} ++;
680	return 0; # blocked
681	}
682
683	if ($smart_block \|\| $smart_block_BN) {
684	if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
685	$self->{'num_blocked'} ++;
686	return 0; # blocked
687	}
688	} elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
689	$self->{'num_blocked'} ++;
690	return 0; # blocked
691	}
692
693	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
694	return undef; # can't recognise
695	}
696	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
697
698	# Do encoding stuff
699	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
700
701	# create a new document
702	my $doc_obj = new doc ($filename, "indexed_doc");
703	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
704	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
705	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
706	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
707	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
708
709	my ($filemeta) = $file =~ /([^\\\/]+)$/;
710	# how do we know what encoding the filename is in?
711	$doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
712	if ($self->{'cover_image'}) {
713	$self->associate_cover_image($doc_obj, $filename);
714	}
715
716	# read in file ($text will be in utf8)
717	my $text = "";
718	$self->read_file ($filename, $encoding, $language, \$text);
719
720	if (!length ($text)) {
721	my $plugin_name = ref ($self);
722	if ($gli) {
723	print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
724	}
725	gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
726
727	my $failhandle = $self->{'failhandle'};
728	gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
729	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
730	$self->{'num_not_processed'} ++;
731
732	return 0; # what should we return here?? error but don't want to pass it on
733	}
734
735	# include any metadata passed in from previous plugins
736	# note that this metadata is associated with the top level section
737
738	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
739
740	# do plugin specific processing of doc_obj
741	unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
742	$text = '';
743	undef $text;
744	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
745	return -1;
746	}
747	$text='';
748	undef $text;
749
750	# do any automatic metadata extraction
751	$self->auto_extract_metadata ($doc_obj);
752
753	# add an OID
754	# see if there is a plugin-specific set_OID function...
755	if (defined ($self->can('set_OID'))) {
756	# it will need $doc_obj to set the Identifier metadata...
757	$self->set_OID($doc_obj);
758	} else {
759	# use the default set_OID() in doc.pm
760	$doc_obj->set_OID();
761	}
762
763	# process the document
764	$processor->process($doc_obj);
765
766	if(defined($self->{'places_filename'})){
767	&util::rm($self->{'places_filename'});
768	$self->{'places_filename'} = undef;
769	}
770
771	$self->{'num_processed'} ++;
772	undef $doc_obj;
773	return 1; # processed the file
774	}
775
776	# returns undef if file is rejected by the plugin
777	sub process {
778	my $self = shift (@_);
779	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
780
781	gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
782	# die "Basplug::process function must be implemented in sub-class\n";
783
784	return undef; # never gets here
785	}
786
787	# uses the multiread package to read in the entire file pointed to
788	# by filename and loads the resulting text into $$textref. Input text
789	# may be in any of the encodings handled by multiread, output text
790	# will be in utf8
791	sub read_file {
792	my $self = shift (@_);
793	my ($filename, $encoding, $language, $textref) = @_;
794
795	if (!-r $filename)
796	{
797	my $outhandle = $self->{'outhandle'};
798	gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
799	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
800	return;
801	}
802
803	$$textref = "";
804
805	if (!open (FILE, $filename)) {
806	gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
807	die "\n";
808	}
809
810	if ($encoding eq "ascii") {
811	undef $/;
812	$$textref = <FILE>;
813	$/ = "\n";
814	} else {
815	my $reader = new multiread();
816	$reader->set_handle ('BasPlug::FILE');
817	$reader->set_encoding ($encoding);
818	$reader->read_file ($textref);
819
820	#Now segments chinese if the separate_cjk option is set
821	if ($self->{'separate_cjk'}) {
822	# segment the Chinese words
823	$$textref = &cnseg::segment($$textref);
824	}
825	}
826
827	close FILE;
828	}
829
830	sub filename_based_title
831	{
832	my $self = shift (@_);
833	my ($file) = @_;
834
835	my $file_derived_title = $file;
836	$file_derived_title =~ s/_/ /g;
837	$file_derived_title =~ s/\..*?$//;
838
839	return $file_derived_title;
840	}
841
842
843	sub title_fallback
844	{
845	my $self = shift (@_);
846	my ($doc_obj,$section,$file) = @_;
847
848	if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
849
850	my $file_derived_title = $self->filename_based_title($file);
851	$doc_obj->add_metadata ($section, "Title", $file_derived_title);
852	}
853	}
854
855	sub textcat_get_language_encoding {
856	my $self = shift (@_);
857	my ($filename) = @_;
858
859	my ($language, $encoding, $extracted_encoding);
860	if ($self->{'input_encoding'} eq "auto") {
861	# use textcat to automatically work out the input encoding and language
862	($language, $encoding) = $self->get_language_encoding ($filename);
863	} elsif ($self->{'extract_language'}) {
864	# use textcat to get language metadata
865	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
866	$encoding = $self->{'input_encoding'};
867	# don't print this message for english... english in utf8 is identical
868	# to english in iso-8859-1 (except for some punctuation). We don't have
869	# a language model for en_utf8, so textcat always says iso-8859-1!
870	if ($extracted_encoding ne $encoding && $language ne "en"
871	&& $self->{'verbosity'}) {
872	my $plugin_name = ref ($self);
873	my $outhandle = $self->{'outhandle'};
874	gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
875	# print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
876	# print $outhandle "appears to be encoded as $extracted_encoding.\n";
877	}
878	} else {
879	$language = $self->{'default_language'};
880	$encoding = $self->{'input_encoding'};
881	}
882	return ($language, $encoding);
883	}
884
885	# Uses textcat to work out the encoding and language of the text in
886	# $filename. All html tags are removed before processing.
887	# returns an array containing "language" and "encoding"
888	sub get_language_encoding {
889	my $self = shift (@_);
890	my ($filename) = @_;
891	my $outhandle = $self->{'outhandle'};
892	my $unicode_format = "";
893	# read in file
894	open (FILE, $filename) \|\| (gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
895	undef $/;
896	my $text = <FILE>;
897	$/ = "\n";
898	close FILE;
899
900	# check if first few bytes have a Byte Order Marker
901	my $bom=substr($text,0,2); # check 16bit unicode
902	if ($bom eq "\xff\xfe") { # little endian 16bit unicode
903	$unicode_format="unicode";
904	} elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
905	$unicode_format="unicode";
906	} else {
907	$bom=substr($text,0,3); # check utf-8
908	if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
909	$unicode_format="utf8";
910	# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
911	# $unicode_format="utf8";
912	}
913	}
914
915
916	# remove <title>stuff</title> -- as titles tend often to be in English
917	# for foreign language documents
918	$text =~ s/<title>(.\|\n)*?<\/title>//i;
919
920	# remove all HTML tags
921	# XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
922	if (ref($self) eq 'HTMLPlug' \|\|
923	(exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
924	$text =~ s/<[^>]*>//sg;
925	}
926
927	# get the language/encoding
928	my $results = $self->{'textcat'}->classify(\$text);
929
930	# if textcat returns 3 or less possibilities we'll use the
931	# first one in the list - otherwise use the defaults
932	if (scalar @$results > 3) {
933	my $best_encoding="";
934	if ($unicode_format) { # in case the first had a BOM
935	$best_encoding=$unicode_format;
936	} else {
937	my %guessed_encodings = ();
938	foreach my $result (@$results) {
939	$result =~ /([^\-]+)$/;
940	my $enc=$1;
941	if (!defined($guessed_encodings{$enc})) {
942	$guessed_encodings{$enc}=0;
943	}
944	$guessed_encodings{$enc}++;
945	}
946
947	$guessed_encodings{""}=-1; # for default best_encoding of ""
948	foreach my $enc (keys %guessed_encodings) {
949	if ($guessed_encodings{$enc} >
950	$guessed_encodings{$best_encoding}){
951	$best_encoding=$enc;
952	}
953	}
954	}
955
956	if ($self->{'input_encoding'} ne 'auto') {
957	if ($self->{'extract_language'} && $self->{'verbosity'}) {
958	gsprintf($outhandle,
959	"BasPlug: {BasPlug.could_not_extract_language}\n",
960	$filename, $self->{'default_language'});
961	}
962	return ($self->{'default_language'}, $self->{'input_encoding'});
963
964	} else {
965	if ($self->{'verbosity'}) {
966	gsprintf($outhandle,
967	"BasPlug: {BasPlug.could_not_extract_language}\n",
968	$filename, $self->{'default_language'});
969	}
970	return ($self->{'default_language'}, $best_encoding);
971	}
972	}
973
974	# format language/encoding
975	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
976	if (!defined $language) {
977	if ($self->{'verbosity'}) {
978	gsprintf($outhandle,
979	"BasPlug: {BasPlug.could_not_extract_language}\n",
980	$filename, $self->{'default_language'});
981	}
982	$language = $self->{'default_language'};
983	}
984	if (!defined $encoding) {
985	if ($self->{'verbosity'}) {
986	gsprintf($outhandle,
987	"BasPlug: {BasPlug.could_not_extract_encoding}\n",
988	$filename, $self->{'default_encoding'});
989	}
990	$encoding = $self->{'default_encoding'};
991	}
992
993
994	# check for equivalents where textcat doesn't have some encodings...
995	# eg MS versions of standard encodings
996	if ($encoding =~ /^iso_8859_(\d+)/) {
997	my $iso = $1; # which variant of the iso standard?
998	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
999	if ($text =~ /[\x80-\x9f]/) {
1000	# Western Europe
1001	if ($iso == 1 or $iso == 15) { $encoding = 'windows_1252' }
1002	elsif ($iso == 2) { $encoding = 'windows_1250' } # Central Europe
1003	elsif ($iso == 5) { $encoding = 'windows_1251' } # Cyrillic
1004	elsif ($iso == 6) { $encoding = 'windows_1256' } # Arabic
1005	elsif ($iso == 7) { $encoding = 'windows_1253' } # Greek
1006	elsif ($iso == 8) { $encoding = 'windows_1255' } # Hebrew
1007	elsif ($iso == 9) { $encoding = 'windows_1254' } # Turkish
1008	}
1009	}
1010
1011	if ($encoding !~ /^(ascii\|utf8\|unicode)$/ &&
1012	!defined $encodings::encodings->{$encoding}) {
1013	if ($self->{'verbosity'}) {
1014	gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
1015	$filename, $encoding, $self->{'default_encoding'});
1016	}
1017	$encoding = $self->{'default_encoding'};
1018	}
1019
1020	return ($language, $encoding);
1021	}
1022
1023	# add any extra metadata that's been passed around from one
1024	# plugin to another.
1025	# extra_metadata uses add_utf8_metadata so it expects metadata values
1026	# to already be in utf8
1027	sub extra_metadata {
1028	my $self = shift (@_);
1029	my ($doc_obj, $cursection, $metadata) = @_;
1030
1031	foreach my $field (keys(%$metadata)) {
1032	# $metadata->{$field} may be an array reference
1033	if ($field eq "gsdlassocfile_tobe") {
1034	# 'gsdlassocfile_tobe' is artificially introduced metadata
1035	# that is used to signal that certain additional files should
1036	# be tied to this document. Useful in situations where a
1037	# metadata pass in the plugin pipeline works out some files
1038	# need to be associated with a document, but the document hasn't
1039	# been formed yet.
1040
1041	my $equiv_form = "";
1042	foreach my $gaf (@{$metadata->{$field}}) {
1043	my ($full_filename,$mimetype) = ($gaf =~ m/^(.):(.):$/);
1044	my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1045	my $filename = $full_filename;
1046
1047	$doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1048
1049	my ($doc_ext) = ($tail_filename =~ m/^.\.(.)$/);
1050	my $start_doclink = "<a href=\"_httpcollection_/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
1051	my $srcicon = "_icon".$doc_ext."_";
1052	my $end_doclink = "</a>";
1053
1054	$equiv_form .= " $start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1055	}
1056	$doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1057	}
1058	elsif (ref ($metadata->{$field}) eq "ARRAY") {
1059	map {
1060	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
1061	} @{$metadata->{$field}};
1062	} else {
1063	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1064	}
1065	}
1066	}
1067
1068	# initialise metadata extractors
1069	sub initialise_extractors {
1070	my $self = shift (@_);
1071
1072	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
1073	&acronym::initialise_acronyms();
1074	}
1075	}
1076
1077	# finalise metadata extractors
1078	sub finalise_extractors {
1079	my $self = shift (@_);
1080
1081	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
1082	&acronym::finalise_acronyms();
1083	}
1084	}
1085
1086	# FIRSTNNN: extract the first NNN characters as metadata
1087	sub extract_first_NNNN_characters {
1088	my $self = shift (@_);
1089	my ($textref, $doc_obj, $thissection) = @_;
1090
1091	foreach my $size (split /,/, $self->{'first'}) {
1092	my $tmptext = $$textref;
1093	$tmptext =~ s/^\s+//;
1094	$tmptext =~ s/\s+$//;
1095	$tmptext =~ s/\s+/ /gs;
1096	$tmptext = substr ($tmptext, 0, $size);
1097	$tmptext =~ s/\s\S*$/…/;
1098	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1099	}
1100	}
1101
1102	sub extract_email {
1103	my $self = shift (@_);
1104	my ($textref, $doc_obj, $thissection) = @_;
1105	my $outhandle = $self->{'outhandle'};
1106
1107	gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
1108	if ($self->{'verbosity'} > 2);
1109
1110	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
1111	@email = sort @email;
1112
1113	my @email2 = ();
1114	foreach my $address (@email) {
1115	if (!(join(" ",@email2) =~ m/$address/ )) {
1116	push @email2, $address;
1117	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1118	gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1119	if ($self->{'verbosity'} > 3);
1120	}
1121	}
1122	gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
1123	if ($self->{'verbosity'} > 2);
1124	}
1125
1126	# extract metadata
1127	sub auto_extract_metadata {
1128
1129	my $self = shift (@_);
1130	my ($doc_obj) = @_;
1131
1132	if ($self->{'extract_email'}) {
1133	my $thissection = $doc_obj->get_top_section();
1134	while (defined $thissection) {
1135	my $text = $doc_obj->get_text($thissection);
1136	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1137	$thissection = $doc_obj->get_next_section ($thissection);
1138	}
1139	}
1140	if ($self->{'extract_placenames'}) {
1141	my $thissection = $doc_obj->get_top_section();
1142	while (defined $thissection) {
1143	my $text = $doc_obj->get_text($thissection);
1144	$self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1145	$thissection = $doc_obj->get_next_section ($thissection);
1146	}
1147	}
1148
1149	# adding kea keyphrases
1150	if ($self->{'kea'}) {
1151
1152	my $thissection = $doc_obj->get_top_section();
1153	my $text = "";
1154	my $list;
1155
1156	#loop through sections to gather whole doc
1157	while (defined $thissection) {
1158	my $sectiontext = $doc_obj->get_text($thissection);
1159	$text = $text.$sectiontext;
1160	$thissection = $doc_obj->get_next_section ($thissection);
1161	}
1162
1163	if ($self->{'kea_options'}) {
1164	#if kea options flag is set, call Kea with specified options
1165	$list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
1166	} else {
1167	#otherwise call Kea with no options
1168	$list = &Kea::extract_KeyPhrases ($text);
1169	}
1170	if ($list){
1171	# if a list of kea keyphrases was returned (ie not empty)
1172	if ($self->{'verbosity'}) {
1173	gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1174	}
1175
1176	#add metadata to top section
1177	$thissection = $doc_obj->get_top_section();
1178
1179	# add all key phrases as one metadata
1180	$doc_obj->add_metadata($thissection, "Keyphrases", $list);
1181
1182	# add individual key phrases as multiple metadata
1183	foreach my $keyphrase (split(',', $list)) {
1184	$keyphrase =~ s/^\s+\|\s+$//g;
1185	$doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1186	}
1187	}
1188	} #end of kea
1189
1190	if ($self->{'first'}) {
1191	my $thissection = $doc_obj->get_top_section();
1192	while (defined $thissection) {
1193	my $text = $doc_obj->get_text($thissection);
1194	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1195	$thissection = $doc_obj->get_next_section ($thissection);
1196	}
1197	}
1198
1199	if ($self->{'extract_acronyms'}) {
1200	my $thissection = $doc_obj->get_top_section();
1201	while (defined $thissection) {
1202	my $text = $doc_obj->get_text($thissection);
1203	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1204	$thissection = $doc_obj->get_next_section ($thissection);
1205	}
1206	}
1207
1208	if ($self->{'markup_acronyms'}) {
1209	my $thissection = $doc_obj->get_top_section();
1210	while (defined $thissection) {
1211	my $text = $doc_obj->get_text($thissection);
1212	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1213	$doc_obj->delete_text($thissection);
1214	$doc_obj->add_text($thissection, $text);
1215	$thissection = $doc_obj->get_next_section ($thissection);
1216	}
1217	}
1218
1219	if($self->{'date_extract'}) {
1220	my $thissection = $doc_obj->get_top_section();
1221	while (defined $thissection) {
1222
1223	my $text = $doc_obj->get_text($thissection);
1224	&DateExtract::get_date_metadata($text, $doc_obj,
1225	$thissection,
1226	$self->{'no_biblio'},
1227	$self->{'max_year'},
1228	$self->{'max_century'});
1229	$thissection = $doc_obj->get_next_section ($thissection);
1230	}
1231	}
1232	}
1233
1234	# extract acronyms from a section in a document. progress is
1235	# reported to outhandle based on the verbosity. both the Acronym
1236	# and the AcronymKWIC metadata items are created.
1237
1238	sub extract_acronyms {
1239	my $self = shift (@_);
1240	my ($textref, $doc_obj, $thissection) = @_;
1241	my $outhandle = $self->{'outhandle'};
1242
1243	# print $outhandle " extracting acronyms ...\n"
1244	gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
1245	if ($self->{'verbosity'} > 2);
1246
1247	my $acro_array = &acronym::acronyms($textref);
1248
1249	foreach my $acro (@$acro_array) {
1250
1251	#check that this is the first time ...
1252	my $seen_before = "false";
1253	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1254	foreach my $thisAcro (@$previous_data) {
1255	if ($thisAcro eq $acro->to_string()) {
1256	$seen_before = "true";
1257	if ($self->{'verbosity'} >= 4) {
1258	gsprintf($outhandle, " {BasPlug.already_seen} " .
1259	$acro->to_string() . "\n");
1260	}
1261	}
1262	}
1263
1264	if ($seen_before eq "false") {
1265	#write it to the file ...
1266	$acro->write_to_file();
1267
1268	#do the normal acronym
1269	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
1270	gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
1271	if ($self->{'verbosity'} > 3);
1272	}
1273	}
1274
1275	gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
1276	if ($self->{'verbosity'} > 2);
1277	}
1278
1279	sub markup_acronyms {
1280	my $self = shift (@_);
1281	my ($text, $doc_obj, $thissection) = @_;
1282	my $outhandle = $self->{'outhandle'};
1283
1284	gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
1285	if ($self->{'verbosity'} > 2);
1286
1287	#self is passed in to check for verbosity ...
1288	$text = &acronym::markup_acronyms($text, $self);
1289
1290	gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
1291	if ($self->{'verbosity'} > 2);
1292
1293	return $text;
1294	}
1295
1296	sub compile_stats {
1297	my $self = shift(@_);
1298	my ($stats) = @_;
1299
1300	$stats->{'num_processed'} += $self->{'num_processed'};
1301	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
1302	$stats->{'num_archives'} += $self->{'num_archives'};
1303
1304	}
1305
1306	sub associate_cover_image {
1307	my $self = shift(@_);
1308	my ($doc_obj, $filename) = @_;
1309
1310	my $top_section=$doc_obj->get_top_section();
1311
1312	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1313	if (-e $filename) {
1314	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1315	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1316	} else {
1317	$filename =~ s/jpg$/JPG/;
1318	if (-e $filename) {
1319	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1320	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1321	}
1322	}
1323	}
1324
1325	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: