Context Navigation

source: gsdl/trunk/perllib/plugins/BasPlug.pm@ 14961

Last change on this file since 14961 was 14961, checked in by davidb, 16 years ago
Setting filename metadata (Source) in BasPlug.pm looks to user its locale, in the first instance, to resolve what character encoding the file system uses.
Property svn:keywords set to `Author Date Id Revision`
File size: 48.6 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	BEGIN {
29	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30	}
31
32	eval {require bytes};
33
34	# suppress the annoying "subroutine redefined" warning that various
35	# plugins cause under perl 5.6
36	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38	use strict;
39	no strict 'subs';
40	no strict 'refs'; # allow filehandles to be variables and viceversa
41
42	use File::Basename;
43
44	use Kea;
45	use multiread;
46	use encodings;
47	use unicode;
48	use cnseg;
49	use acronym;
50	use textcat;
51	use doc;
52	eval "require diagnostics"; # some perl distros (eg mac) don't have this
53	use DateExtract;
54	use ghtml;
55	use gsprintf 'gsprintf';
56	use printusage;
57	use parse2;
58
59
60	use GISBasPlug;
61
62	@BasPlug::ISA = ( GISBasPlug );
63
64	my $unicode_list =
65	[ { 'name' => "ascii",
66	'desc' => "{BasPlug.input_encoding.ascii}" },
67	{ 'name' => "utf8",
68	'desc' => "{BasPlug.input_encoding.utf8}" },
69	{ 'name' => "unicode",
70	'desc' => "{BasPlug.input_encoding.unicode}" } ];
71
72	my $auto_unicode_list =
73	[ { 'name' => "auto",
74	'desc' => "{BasPlug.input_encoding.auto}" } ];
75
76	my $e = $encodings::encodings;
77	foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
78	{
79	my $hashEncode =
80	{'name' => $enc,
81	'desc' => $e->{$enc}->{'name'}};
82
83	push(@{$unicode_list},$hashEncode);
84	}
85
86	push(@{$auto_unicode_list},@{$unicode_list});
87
88	my $arguments =
89	[ { 'name' => "process_exp",
90	'desc' => "{BasPlug.process_exp}",
91	'type' => "regexp",
92	'deft' => "",
93	'reqd' => "no" },
94	{ 'name' => "block_exp",
95	'desc' => "{BasPlug.block_exp}",
96	'type' => "regexp",
97	'deft' => "",
98	'reqd' => "no" },
99	{ 'name' => "smart_block",
100	'desc' => "{BasPlug.smart_block}",
101	'type' => "flag",
102	'reqd' => "no" },
103	{ 'name' => "associate_ext",
104	'desc' => "{BasPlug.associate_ext}",
105	'type' => "string",
106	'reqd' => "no" },
107	{ 'name' => "associate_tail_re",
108	'desc' => "{BasPlug.associate_tail_re}",
109	'type' => "string",
110	'reqd' => "no" },
111	{ 'name' => "input_encoding",
112	'desc' => "{BasPlug.input_encoding}",
113	'type' => "enum",
114	'list' => $auto_unicode_list,
115	'reqd' => "no" ,
116	'deft' => "auto" } ,
117	{ 'name' => "default_encoding",
118	'desc' => "{BasPlug.default_encoding}",
119	'type' => "enum",
120	'list' => $unicode_list,
121	'reqd' => "no",
122	'deft' => "utf8" },
123	{ 'name' => "extract_language",
124	'desc' => "{BasPlug.extract_language}",
125	'type' => "flag",
126	'reqd' => "no" },
127	{ 'name' => "default_language",
128	'desc' => "{BasPlug.default_language}",
129	'type' => "string",
130	'deft' => "en",
131	'reqd' => "no" },
132	{ 'name' => "extract_acronyms",
133	'desc' => "{BasPlug.extract_acronyms}",
134	'type' => "flag",
135	'reqd' => "no" },
136	{ 'name' => "markup_acronyms",
137	'desc' => "{BasPlug.markup_acronyms}",
138	'type' => "flag",
139	'reqd' => "no" },
140	{ 'name' => "extract_keyphrases",
141	'desc' => "{BasPlug.extract_keyphrases}",
142	'type' => "flag",
143	'reqd' => "no" },
144	{ 'name' => "extract_keyphrases_kea4",
145	'desc' => "{BasPlug.extract_keyphrases_kea4}",
146	'type' => "flag",
147	'reqd' => "no" },
148	{ 'name' => "extract_keyphrase_options",
149	'desc' => "{BasPlug.extract_keyphrase_options}",
150	'type' => "string",
151	'deft' => "",
152	'reqd' => "no" },
153	{ 'name' => "first",
154	'desc' => "{BasPlug.first}",
155	'type' => "string",
156	'reqd' => "no" },
157	{ 'name' => "extract_email",
158	'desc' => "{BasPlug.extract_email}",
159	'type' => "flag",
160	'reqd' => "no" },
161	{ 'name' => "extract_historical_years",
162	'desc' => "{BasPlug.extract_historical_years}",
163	'type' => "flag",
164	'reqd' => "no" },
165	{ 'name' => "maximum_year",
166	'desc' => "{BasPlug.maximum_year}",
167	'type' => "int",
168	'deft' => (localtime)[5]+1900,
169	'char_length' => "4",
170	#'range' => "2,100",
171	'reqd' => "no"},
172	{ 'name' => "maximum_century",
173	'desc' => "{BasPlug.maximum_century}",
174	'type' => "string",
175	'deft' => "-1",
176	'reqd' => "no" },
177	{ 'name' => "no_bibliography",
178	'desc' => "{BasPlug.no_bibliography}",
179	'type' => "flag",
180	'reqd' => "no"},
181	{ 'name' => "no_cover_image",
182	'desc' => "{BasPlug.no_cover_image}",
183	'type' => "flag",
184	'reqd' => "no" },
185	{ 'name' => "separate_cjk",
186	'desc' => "{BasPlug.separate_cjk}",
187	'type' => "flag",
188	'reqd' => "no",
189	'hiddengli' => "yes" },
190	{ 'name' => "new_extract_email",
191	'desc' => "",
192	'type' => "flag",
193	'reqd' => "no",
194	'hiddengli' => "yes" } ];
195
196	my $gis_arguments =
197	[ { 'name' => "extract_placenames",
198	'desc' => "{GISBasPlug.extract_placenames}",
199	'type' => "flag",
200	'reqd' => "no" },
201	{ 'name' => "gazetteer",
202	'desc' => "{GISBasPlug.gazetteer}",
203	'type' => "string",
204	'reqd' => "no" },
205	{ 'name' => "place_list",
206	'desc' => "{GISBasPlug.place_list}",
207	'type' => "flag",
208	'reqd' => "no" } ];
209
210
211	my $options = { 'name' => "BasPlug",
212	'desc' => "{BasPlug.desc}",
213	'abstract' => "yes",
214	'inherits' => "no",
215	'args' => $arguments };
216
217
218	sub set_incremental {
219	my $self = shift(@_);
220	my ($incremental) = @_;
221
222	$self->{'incremental'} = $incremental;
223	}
224
225	sub get_arguments
226	{
227	my $self = shift(@_);
228	my $optionlistref = $self->{'option_list'};
229	my @optionlist = @$optionlistref;
230	my $pluginoptions = pop(@$optionlistref);
231	my $pluginarguments = $pluginoptions->{'args'};
232	return $pluginarguments;
233	}
234
235
236	sub print_xml_usage
237	{
238	my $self = shift(@_);
239	my $header = shift(@_);
240	my $high_level_information_only = shift(@_);
241
242	# XML output is always in UTF-8
243	gsprintf::output_strings_in_UTF8;
244
245	if ($header) {
246	&PrintUsage::print_xml_header("plugin");
247	}
248	$self->print_xml($high_level_information_only);
249	}
250
251
252	sub print_xml
253	{
254	my $self = shift(@_);
255	my $high_level_information_only = shift(@_);
256
257	my $optionlistref = $self->{'option_list'};
258	my @optionlist = @$optionlistref;
259	my $pluginoptions = shift(@$optionlistref);
260	return if (!defined($pluginoptions));
261
262	# Find the process and block default expressions in the plugin arguments
263	my $process_exp = "";
264	my $block_exp = "";
265	if (defined($pluginoptions->{'args'})) {
266	foreach my $option (@{$pluginoptions->{'args'}}) {
267	if ($option->{'name'} eq "process_exp") {
268	$process_exp = $option->{'deft'};
269	}
270	if ($option->{'name'} eq "block_exp") {
271	$block_exp = $option->{'deft'};
272	}
273	}
274	}
275
276	gsprintf(STDERR, "<PlugInfo>\n");
277	gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
278	my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
279	$desc =~ s/</&lt;/g; # doubly escaped
280	$desc =~ s/>/&gt;/g;
281	gsprintf(STDERR, " <Desc>$desc</Desc>\n");
282	gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
283	gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
284	gsprintf(STDERR, " <Processes>$process_exp</Processes>\n");
285	gsprintf(STDERR, " <Blocks>$block_exp</Blocks>\n");
286	gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} \|\| "no") . "</Explodes>\n");
287	unless (defined($high_level_information_only)) {
288	gsprintf(STDERR, " <Arguments>\n");
289	if (defined($pluginoptions->{'args'})) {
290	&PrintUsage::print_options_xml($pluginoptions->{'args'});
291	}
292	gsprintf(STDERR, " </Arguments>\n");
293
294	# Recurse up the plugin hierarchy
295	$self->print_xml();
296	}
297	gsprintf(STDERR, "</PlugInfo>\n");
298	}
299
300
301	sub print_txt_usage
302	{
303	my $self = shift(@_);
304	# Print the usage message for a plugin (recursively)
305	my $descoffset = $self->determine_description_offset(0);
306	$self->print_plugin_usage($descoffset, 1);
307	}
308
309
310	sub determine_description_offset
311	{
312	my $self = shift(@_);
313	my $maxoffset = shift(@_);
314
315	my $optionlistref = $self->{'option_list'};
316	my @optionlist = @$optionlistref;
317	my $pluginoptions = shift(@$optionlistref);
318	return $maxoffset if (!defined($pluginoptions));
319
320	# Find the length of the longest option string of this plugin
321	my $pluginargs = $pluginoptions->{'args'};
322	if (defined($pluginargs)) {
323	my $longest = &PrintUsage::find_longest_option_string($pluginargs);
324	if ($longest > $maxoffset) {
325	$maxoffset = $longest;
326	}
327	}
328
329	# Recurse up the plugin hierarchy
330	$maxoffset = $self->determine_description_offset($maxoffset);
331	$self->{'option_list'} = \@optionlist;
332	return $maxoffset;
333	}
334
335
336	sub print_plugin_usage
337	{
338	my $self = shift(@_);
339	my $descoffset = shift(@_);
340	my $isleafclass = shift(@_);
341
342	my $optionlistref = $self->{'option_list'};
343	my @optionlist = @$optionlistref;
344	my $pluginoptions = shift(@$optionlistref);
345	return if (!defined($pluginoptions));
346
347	my $pluginname = $pluginoptions->{'name'};
348	my $pluginargs = $pluginoptions->{'args'};
349	my $plugindesc = $pluginoptions->{'desc'};
350
351	# Produce the usage information using the data structure above
352	if ($isleafclass) {
353	if (defined($plugindesc)) {
354	gsprintf(STDERR, "$plugindesc\n\n");
355	}
356	gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
357	}
358
359	# Display the plugin options, if there are some
360	if (defined($pluginargs)) {
361	# Calculate the column offset of the option descriptions
362	my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
363
364	if ($isleafclass) {
365	gsprintf(STDERR, " {common.specific_options}:\n");
366	}
367	else {
368	gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
369	}
370
371	# Display the plugin options
372	&PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
373	}
374
375	# Recurse up the plugin hierarchy
376	$self->print_plugin_usage($descoffset, 0);
377	$self->{'option_list'} = \@optionlist;
378	}
379
380
381	sub new {
382	# Set Encodings to the list!!
383
384
385	# Start the BasPlug Constructor
386	my $class = shift (@_);
387	my ($pluginlist,$args,$hashArgOptLists) = @_;
388	push(@$pluginlist, $class);
389	my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
390
391	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
392	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
393
394	if (GISBasPlug::has_mapdata()) {
395	push(@$arguments,@$gis_arguments);
396	}
397
398	my $self = {};
399	$self->{'outhandle'} = STDERR;
400	$self->{'option_list'} = $hashArgOptLists->{"OptList"};
401	$self->{"info_only"} = 0;
402
403	# Check if gsdlinfo is in the argument list or not - if it is, don't parse
404	# the args, just return the object.
405	foreach my $strArg (@{$args})
406	{
407	if($strArg eq "-gsdlinfo")
408	{
409	$self->{"info_only"} = 1;
410	return bless $self, $class;
411	}
412	}
413
414	if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
415	{
416	my $classTempClass = bless $self, $class;
417	print STDERR "<BadPlugin p=$plugin_name>\n";
418	&gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
419	$classTempClass->print_txt_usage(""); # Use default resource bundle
420	die "\n";
421	}
422
423
424	delete $self->{"info_only"};
425	# else parsing was successful.
426
427	$self->{'plugin_type'} = $plugin_name;
428	#$self->{'outhandle'} = STDERR;
429	$self->{'num_processed'} = 0;
430	$self->{'num_not_processed'} = 0;
431	$self->{'num_blocked'} = 0;
432	$self->{'num_archives'} = 0;
433	$self->{'cover_image'} = 1; # cover image is on by default
434	$self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
435	#$self->{'option_list'} = $hashArgOptLists->{"OptList"};
436
437	my $associate_ext = $self->{'associate_ext'};
438	if ((defined $associate_ext) && ($associate_ext ne "")) {
439
440	my $associate_tail_re = $self->{'associate_tail_re'};
441	if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
442	my $outhandle = $self->{'outhandle'};
443	print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
444	print $outhandle " defaulting to 'associate_tail_re'\n";
445	}
446	else {
447	my @exts = split(/,/,$associate_ext);
448
449	my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
450	my $associate_tail_re = join("\|",@exts_bracketed);
451	$self->{'associate_tail_re'} = $associate_tail_re;
452	}
453
454	delete $self->{'associate_ext'};
455	}
456
457	$self->{'shared_fileroot'} = {};
458	$self->{'file_blocks'} = {};
459
460	if ($self->{'extract_placenames'}) {
461
462	my $outhandle = $self->{'outhandle'};
463
464	my $places_ref
465	= GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
466
467	if (!defined $places_ref) {
468	print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
469	print $outhandle " No placename extraction will take place.\n";
470	$self->{'extract_placenames'} = undef;
471	}
472	else {
473	$self->{'places'} = $places_ref;
474	}
475	}
476
477	return bless $self, $class;
478
479	}
480
481	# initialize BasPlug options
482	# if init() is overridden in a sub-class, remember to call BasPlug::init()
483	sub init {
484	my $self = shift (@_);
485	my ($verbosity, $outhandle, $failhandle) = @_;
486
487	# verbosity is passed through from the processor
488	$self->{'verbosity'} = $verbosity;
489
490	# as are the outhandle and failhandle
491	$self->{'outhandle'} = $outhandle if defined $outhandle;
492	$self->{'failhandle'} = $failhandle;
493
494	# set process_exp and block_exp to defaults unless they were
495	# explicitly set
496
497	if ((!$self->is_recursive()) and
498	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
499
500	$self->{'process_exp'} = $self->get_default_process_exp ();
501	if ($self->{'process_exp'} eq "") {
502	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
503	}
504	}
505
506	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
507	$self->{'block_exp'} = $self->get_default_block_exp ();
508	}
509
510	}
511
512	sub begin {
513	my $self = shift (@_);
514	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
515	$self->initialise_extractors();
516	}
517
518	sub end {
519	# potentially called at the end of each plugin pass
520	# import.pl only has one plugin pass, but buildcol.pl has multiple ones
521
522	my ($self) = @_;
523	$self->finalise_extractors();
524	}
525
526	sub deinit {
527	# called only once, after all plugin passes have been done
528
529	my ($self) = @_;
530	}
531
532	# this function should be overridden to return 1
533	# in recursive plugins
534	sub is_recursive {
535	my $self = shift (@_);
536
537	return 0;
538	}
539
540	sub get_default_block_exp {
541	my $self = shift (@_);
542
543	return "";
544	}
545
546	sub get_default_process_exp {
547	my $self = shift (@_);
548
549	return "";
550	}
551
552	# default implementation is to do nothing.
553	sub store_block_files
554	{
555	my $self =shift (@_);
556	my ($filename) = @_;
557	return;
558	}
559
560	#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
561	sub block_cover_image
562	{
563	my $self =shift;
564	my $filename = shift;
565
566	if ($self->{'cover_image'}) {
567	my $coverfile = $filename;
568	$coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
569	if (!-e $coverfile) {
570	$coverfile =~ s/jpg$/JPG/;
571	}
572	if (-e $coverfile) {
573	$self->{'file_blocks'}->{$coverfile} = 1;
574	}
575	}
576
577	return;
578	}
579
580	sub root_ext_split
581	{
582	my $self = shift (@_);
583	my ($filename,$tail_re) = @_;
584
585	my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
586
587	if ((!defined $file_prefix) \|\| (!defined $file_ext)) {
588	($file_prefix,$file_ext) = ($filename =~ m/^(.)(\..?)$/);
589	}
590
591	return ($file_prefix,$file_ext);
592	}
593
594	sub metadata_read {
595	my $self = shift (@_);
596	my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
597	# Keep track of filenames with same root but different extensions
598	# Used to support -associate_ext and the more generalised
599	# -associate_tail_re
600
601	my $associate_tail_re = $self->{'associate_tail_re'};
602	if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
603
604	my ($file_prefix,$file_ext)
605	= $self->root_ext_split($file,$associate_tail_re);
606
607	if ((defined $file_prefix) && (defined $file_ext)) {
608
609	my $shared_fileroot = $self->{'shared_fileroot'};
610	if (!defined $shared_fileroot->{$file_prefix}) {
611	my $file_prefix_rec = { 'tie_to' => undef,
612	'exts' => {} };
613	$shared_fileroot->{$file_prefix} = $file_prefix_rec;
614	}
615
616	my $file_prefix_rec = $shared_fileroot->{$file_prefix};
617
618	my $process_exp = $self->{'process_exp'};
619
620	if ($file =~ m/$process_exp/) {
621	# This is the document the others should be tied to
622	$file_prefix_rec->{'tie_to'} = $file_ext;
623	}
624	else {
625	if ($file_ext =~ m/$associate_tail_re$/) {
626	$file_prefix_rec->{'exts'}->{$file_ext} = 1;
627	}
628	}
629
630	}
631	}
632
633	# now check whether we are actually processing this
634	my $filename = $file;
635	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
636	if ($self->{'process_exp'} eq "" \|\| $filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
637	return undef; # can't recognise
638	}
639
640	# do smart blocking if appropriate
641	if ($self->{'smart_block'}) {
642	$self->store_block_files($filename);
643	}
644	# block the cover image if there is one
645	if ($self->{'cover_image'}) {
646	$self->block_cover_image($filename);
647	}
648
649	return 1;
650	}
651
652	sub tie_to_filename
653	{
654	my $self = shift (@_);
655
656	my ($file_ext,$file_prefix_rec) = @_;
657
658	if (defined $file_prefix_rec) {
659	my $tie_to = $file_prefix_rec->{'tie_to'};
660
661	if (defined $tie_to) {
662	if ($tie_to eq $file_ext) {
663	return 1;
664	}
665	}
666	}
667
668	return 0;
669	}
670
671	sub tie_to_assoc_file
672	{
673	my $self = shift (@_);
674	my ($file_ext,$file_prefix_rec) = @_;
675
676	if (defined $file_prefix_rec) {
677	my $tie_to = $file_prefix_rec->{'tie_to'};
678	if (defined $tie_to) {
679
680	my $exts = $file_prefix_rec->{'exts'};
681
682	my $has_file_ext = $exts->{$file_ext};
683
684	if ($has_file_ext) {
685	return 1;
686	}
687	}
688	}
689
690	return 0;
691	}
692
693
694	sub associate_with
695	{
696	my $self = shift (@_);
697	my ($file, $filename, $metadata) = @_;
698
699	my $associate_tail_re = $self->{'associate_tail_re'};
700	return 0 if (!$associate_tail_re);
701
702	# If file, see if matches with "tie_to" doc or is one of the
703	# associated filename extensions.
704
705	my ($file_prefix,$file_ext) = $self->root_ext_split($file,$associate_tail_re);
706
707	if ((defined $file_prefix) && (defined $file_ext)) {
708
709	my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
710
711	if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
712
713	# Set up gsdlassocfile_tobe
714
715	my $exts = $file_prefix_rec->{'exts'};
716
717	if (!defined $metadata->{'gsdlassocfile_tobe'}) {
718	$metadata->{'gsdlassocfile_tobe'} = [];
719	}
720
721	my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
722
723	my ($full_prefix) = ($filename =~ m/^(.)\..?$/);
724	foreach my $e (keys %$exts) {
725	my $assoc_file = "$full_prefix$e";
726	print STDERR " $self->{'plugin_type'}: Associating $file_prefix$e with $file_prefix_rec->{'tie_to'} version\n";
727	my $mime_type = ""; # let system auto detect this
728	push(@$assoc_tobe,"$assoc_file:$mime_type:");
729	}
730
731	}
732	elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
733
734
735	# a form of smart block
736	return 1;
737	}
738	}
739
740	return 0;
741	}
742
743
744	sub read_block {
745	my $self = shift (@_);
746
747	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
748
749
750	my $filename = $file;
751	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
752
753	if ($self->associate_with($file,$filename,$metadata)) {
754	# a form of smart block
755	$self->{'num_blocked'} ++;
756	return (0,undef); # blocked
757	}
758
759	my $smart_block = $self->{'smart_block'};
760	my $smart_block_BN = $self->{'smart_block_BN'};
761
762	if ($smart_block \|\| $smart_block_BN) {
763	if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
764	$self->{'num_blocked'} ++;
765	return (0,undef); # blocked
766	}
767	} else {
768	if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
769	$self->{'num_blocked'} ++;
770	return (0,undef); # blocked
771	}
772	if ($self->{'cover_image'}) {
773	if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
774	$self->{'num_blocked'} ++;
775	return (0,undef); # blocked
776	}
777	}
778	}
779
780	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
781	return (undef,undef); # can't recognise
782	}
783
784	return (1,$filename);
785	}
786
787	sub read_tidy_file {
788
789	my $self = shift (@_);
790
791	my ($file) = @_;
792
793	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
794
795	return $file;
796	}
797
798
799	sub filename_to_metadata
800	{
801	my $self = shift (@_);
802	my ($file, $encoding) = @_;
803
804	my $outhandle = $self->{'outhandle'};
805
806	my $filesystem_encoding = undef;
807
808	eval {
809	use POSIX qw(locale_h);
810
811	# With only one parameter, setlocale retrieves the current value
812	my $current_locale = setlocale(LC_CTYPE);
813
814	if ($current_locale =~ m/^.\.(.?)$/) {
815	my $char_encoding = lc($1);
816	$char_encoding =~ s/-/_/g;
817	$char_encoding =~ s/^utf_8$/utf8/;
818
819	if ($char_encoding =~ m/^\d+$/) {
820	if (defined $encodings::encoding->{"windows_$char_encoding"}) {
821	$char_encoding = "windows_$char_encoding";
822	}
823	elsif (defined $encodings::encoding->{"dos_$char_encoding"}) {
824	$char_encoding = "dos_$char_encoding";
825	}
826	}
827
828	if (($char_encoding =~ m/(?:ascii\|utf8\|unicode)/)
829	\|\| (defined $encodings::encoding->{$char_encoding})) {
830	$filesystem_encoding = $char_encoding;
831	}
832	else {
833	print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
834	}
835	}
836
837
838	};
839	if ($@) {
840	print $outhandle "$@\n";
841	print $outhandle "Warning: Unable to establish locale. Will assume filesytem is UTF-8\n";
842
843	}
844
845	my ($filemeta) = $file =~ /([^\\\/]+)$/;
846
847	# how do we know what encoding the filename is in?
848	# => one answer is to check the locale
849
850	if (defined $filesystem_encoding) {
851	if ($filesystem_encoding !~ /(?:ascii\|utf8\|unicode)/) {
852	$filemeta = unicode::unicode2utf8(
853	unicode::convert2unicode($filesystem_encoding, \$filemeta)
854	);
855	}
856	}
857	# assume it is in the same encoding as its contents
858	elsif ((defined $encoding) && ($encoding !~ /(?:ascii\|utf8\|unicode)/)) {
859	$filemeta = unicode::unicode2utf8(
860	unicode::convert2unicode($encoding, \$filemeta)
861	);
862	}
863
864	my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
865
866	return $dmsafe_filemeta;
867	}
868
869
870	# The BasPlug read_into_doc_obj() function. This function does all the
871	# right things to make general options work for a given plugin. It reads in
872	# a file and sets up a slew of metadata all saved in doc_obj, which
873	# it then returns as part of a tuple (process_status,doc_obj)
874	#
875	# Much of this functionality used to reside in read, but it was broken
876	# down into a supporting routine to make the code more flexible.
877	#
878	# recursive plugins (e.g. RecPlug) and specialized plugins like those
879	# capable of processing many documents within a single file (e.g.
880	# GMLPlug) will normally want to implement their own version of
881	# read_into_doc_obj()
882	#
883	# Note that $base_dir might be "" and that $file might
884	# include directories
885	sub read_into_doc_obj {
886	my $self = shift (@_);
887	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
888
889	if ($self->is_recursive()) {
890	gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
891	}
892
893	my $outhandle = $self->{'outhandle'};
894
895	my ($block_status,$filename) = $self->read_block(@_);
896	return $block_status if ((!defined $block_status) \|\| ($block_status==0));
897	$file = $self->read_tidy_file($file);
898
899	# Do encoding stuff
900	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
901	if ($self->{'verbosity'} > 2) {
902	print $outhandle "BasPlug: reading $file as ($encoding,$language)\n";
903	}
904
905	# create a new document
906	my $doc_obj = new doc ($filename, "indexed_doc");
907	my $top_section = $doc_obj->get_top_section();
908
909	$doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
910	$doc_obj->add_utf8_metadata($top_section, "Language", $language);
911	$doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
912	$doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
913	$doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename));
914
915	my $filemeta = $self->filename_to_metadata($file,$encoding);
916	$doc_obj->add_utf8_metadata($top_section, "Source", $filemeta);
917	if ($self->{'cover_image'}) {
918	$self->associate_cover_image($doc_obj, $filename);
919	}
920
921	# read in file ($text will be in utf8)
922	my $text = "";
923	$self->read_file ($filename, $encoding, $language, \$text);
924
925	if (!length ($text)) {
926	my $plugin_name = ref ($self);
927	if ($gli) {
928	print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
929	}
930	gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
931
932	my $failhandle = $self->{'failhandle'};
933	gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
934	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
935	$self->{'num_not_processed'} ++;
936
937	return (0,undef); # what should we return here?? error but don't want to pass it on
938	}
939
940	# include any metadata passed in from previous plugins
941	# note that this metadata is associated with the top level section
942
943	my $associate_tail_re = $self->{'associate_tail_re'};
944
945	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
946
947	# do plugin specific processing of doc_obj
948	unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
949	$text = '';
950	undef $text;
951	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
952	return (-1,undef);
953	}
954	$text='';
955	undef $text;
956
957	# do any automatic metadata extraction
958	$self->auto_extract_metadata ($doc_obj);
959
960	# add an OID
961	# see if there is a plugin-specific set_OID function...
962	if (defined ($self->can('set_OID'))) {
963	# it will need $doc_obj to set the Identifier metadata...
964	$self->set_OID($doc_obj);
965	} else {
966	# use the default set_OID() in doc.pm
967	$doc_obj->set_OID();
968	}
969
970	return (1,$doc_obj);
971	}
972
973
974	# The BasPlug read() function. This function calls read_into_doc_obj()
975	# to ensure all the right things to make general options work for a
976	# given plugin are done. It then calls the process() function which
977	# does all the work specific to a plugin (like the old read functions
978	# used to do). Most plugins should define their own process() function
979	# and let this read() function keep control.
980	#
981	# recursive plugins (e.g. RecPlug) and specialized plugins like those
982	# capable of processing many documents within a single file (e.g.
983	# GMLPlug) might want to implement their own version of read(), but
984	# more likely need to implement their own version of read_into_doc_obj()
985	#
986	# Return number of files processed, undef if can't recognise, -1 if can't
987	# process
988
989	sub read {
990	my $self = shift (@_);
991	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
992
993	my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
994
995	if ((defined $process_status) && ($process_status == 1)) {
996	# process the document
997	$processor->process($doc_obj);
998
999	if(defined($self->{'places_filename'})){
1000	&util::rm($self->{'places_filename'});
1001	$self->{'places_filename'} = undef;
1002	}
1003
1004	$self->{'num_processed'} ++;
1005	undef $doc_obj;
1006	}
1007
1008	# if process_status == 1, then the file has been processed.
1009	return $process_status;
1010
1011	}
1012
1013	# returns undef if file is rejected by the plugin
1014	sub process {
1015	my $self = shift (@_);
1016	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1017
1018	gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
1019	# die "Basplug::process function must be implemented in sub-class\n";
1020
1021	return undef; # never gets here
1022	}
1023
1024	# uses the multiread package to read in the entire file pointed to
1025	# by filename and loads the resulting text into $$textref. Input text
1026	# may be in any of the encodings handled by multiread, output text
1027	# will be in utf8
1028	sub read_file {
1029	my $self = shift (@_);
1030	my ($filename, $encoding, $language, $textref) = @_;
1031
1032	if (!-r $filename)
1033	{
1034	my $outhandle = $self->{'outhandle'};
1035	gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
1036	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
1037	return;
1038	}
1039	$$textref = "";
1040	if (!open (FILE, $filename)) {
1041	gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
1042	die "\n";
1043	}
1044
1045	if ($encoding eq "ascii") {
1046	undef $/;
1047	$$textref = <FILE>;
1048	$/ = "\n";
1049	} else {
1050	my $reader = new multiread();
1051	$reader->set_handle ('BasPlug::FILE');
1052	$reader->set_encoding ($encoding);
1053	$reader->read_file ($textref);
1054	#Now segments chinese if the separate_cjk option is set
1055	if ($self->{'separate_cjk'}) {
1056	# segment the Chinese words
1057	$$textref = &cnseg::segment($$textref);
1058	}
1059	}
1060	close FILE;
1061	}
1062
1063	# write_file -- used by ConvertToPlug, for example in post processing
1064	#
1065	sub utf8_write_file {
1066	my $self = shift (@_);
1067	my ($textref, $filename) = @_;
1068
1069	if (!open (FILE, ">$filename")) {
1070	gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1071	die "\n";
1072	}
1073	print FILE $$textref;
1074
1075	close FILE;
1076	}
1077
1078
1079	sub filename_based_title
1080	{
1081	my $self = shift (@_);
1082	my ($file) = @_;
1083
1084	my $file_derived_title = $file;
1085	$file_derived_title =~ s/_/ /g;
1086	$file_derived_title =~ s/\..*?$//;
1087
1088	return $file_derived_title;
1089	}
1090
1091
1092	sub title_fallback
1093	{
1094	my $self = shift (@_);
1095	my ($doc_obj,$section,$file) = @_;
1096
1097	if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1098
1099	my $file_derived_title = $self->filename_based_title($file);
1100	$doc_obj->add_metadata ($section, "Title", $file_derived_title);
1101	}
1102	}
1103
1104	sub textcat_get_language_encoding {
1105	my $self = shift (@_);
1106	my ($filename) = @_;
1107
1108
1109	my ($language, $encoding, $extracted_encoding);
1110	if ($self->{'input_encoding'} eq "auto") {
1111	# use textcat to automatically work out the input encoding and language
1112	($language, $encoding) = $self->get_language_encoding ($filename);
1113	} elsif ($self->{'extract_language'}) {
1114	# use textcat to get language metadata
1115	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
1116	$encoding = $self->{'input_encoding'};
1117	# don't print this message for english... english in utf8 is identical
1118	# to english in iso-8859-1 (except for some punctuation). We don't have
1119	# a language model for en_utf8, so textcat always says iso-8859-1!
1120	if ($extracted_encoding ne $encoding && $language ne "en"
1121	&& $self->{'verbosity'}) {
1122	my $plugin_name = ref ($self);
1123	my $outhandle = $self->{'outhandle'};
1124	gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
1125	}
1126	} else {
1127	$language = $self->{'default_language'};
1128	$encoding = $self->{'input_encoding'};
1129	}
1130
1131	return ($language, $encoding);
1132	}
1133
1134	# Uses textcat to work out the encoding and language of the text in
1135	# $filename. All html tags are removed before processing.
1136	# returns an array containing "language" and "encoding"
1137	sub get_language_encoding {
1138	my $self = shift (@_);
1139	my ($filename) = @_;
1140	my $outhandle = $self->{'outhandle'};
1141	my $unicode_format = "";
1142	my $best_language = "";
1143	my $best_encoding = "";
1144
1145	# read in file
1146	if (!open (FILE, $filename)) {
1147	gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
1148	# this is a pretty bad error, but try to continue anyway
1149	return ($self->{'default_language'}, $self->{'input_encoding'});
1150	}
1151	undef $/;
1152	my $text = <FILE>;
1153	$/ = "\n";
1154	close FILE;
1155
1156	# check if first few bytes have a Byte Order Marker
1157	my $bom=substr($text,0,2); # check 16bit unicode
1158	if ($bom eq "\xff\xfe") { # little endian 16bit unicode
1159	$unicode_format="unicode";
1160	} elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
1161	$unicode_format="unicode";
1162	} else {
1163	$bom=substr($text,0,3); # check utf-8
1164	if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
1165	$unicode_format="utf8";
1166	# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
1167	# $unicode_format="utf8";
1168	}
1169	}
1170
1171
1172	# handle html files specially
1173	# XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
1174	if (ref($self) eq 'HTMLPlug' \|\|
1175	(exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
1176
1177	# remove <title>stuff</title> -- as titles tend often to be in English
1178	# for foreign language documents
1179	$text =~ s!<title>.*?</title>!!si;
1180
1181	# see if this html file specifies its encoding
1182	if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
1183	$best_encoding = $1;
1184	} elsif ($text =~ /<meta http-equiv.content-type.charset=(.+?)"/i) {#"
1185	$best_encoding = $1;
1186	}
1187	if ($best_encoding) { # we extracted an encoding
1188	$best_encoding =~ s/-+/_/g;
1189	$best_encoding = lc($best_encoding); # lowercase
1190	if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
1191	$self->{'input_encoding'} = $best_encoding;
1192	}
1193
1194	# remove all HTML tags
1195	$text =~ s/<[^>]*>//sg;
1196	}
1197
1198	# get the language/encoding
1199	$self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
1200	my $results = $self->{'textcat'}->classify(\$text);
1201
1202	# if textcat returns 3 or less possibilities we'll use the
1203	# first one in the list - otherwise use the defaults
1204	if (scalar @$results > 3) {
1205	if ($unicode_format) { # in case the first had a BOM
1206	$best_encoding=$unicode_format;
1207	} else {
1208	my %guessed_encodings = ();
1209	foreach my $result (@$results) {
1210	$result =~ /([^\-]+)$/;
1211	my $enc=$1;
1212	if (!defined($guessed_encodings{$enc})) {
1213	$guessed_encodings{$enc}=0;
1214	}
1215	$guessed_encodings{$enc}++;
1216	}
1217
1218	$guessed_encodings{""}=-1; # for default best_encoding of ""
1219	foreach my $enc (keys %guessed_encodings) {
1220	if ($guessed_encodings{$enc} >
1221	$guessed_encodings{$best_encoding}){
1222	$best_encoding=$enc;
1223	}
1224	}
1225	}
1226
1227	if ($self->{'input_encoding'} ne 'auto') {
1228	if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
1229	gsprintf($outhandle,
1230	"BasPlug: {BasPlug.could_not_extract_language}\n",
1231	$filename, $self->{'default_language'});
1232	}
1233	$best_language = $self->{'default_language'};
1234	$best_encoding = $self->{'input_encoding'};
1235
1236	} else {
1237	if ($self->{'verbosity'}>2) {
1238	gsprintf($outhandle,
1239	"BasPlug: {BasPlug.could_not_extract_language}\n",
1240	$filename, $self->{'default_language'});
1241	}
1242	$best_language = $self->{'default_language'};
1243	}
1244	} else { # <= 3 suggestions
1245	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
1246	if (!defined $language) {
1247	if ($self->{'verbosity'}>2) {
1248	gsprintf($outhandle,
1249	"BasPlug: {BasPlug.could_not_extract_language}\n",
1250	$filename, $self->{'default_language'});
1251	}
1252	$language = $self->{'default_language'};
1253	}
1254	if (!defined $encoding) {
1255	if ($self->{'verbosity'}>2) {
1256	gsprintf($outhandle,
1257	"BasPlug: {BasPlug.could_not_extract_encoding}\n",
1258	$filename, $self->{'default_encoding'});
1259	}
1260	$encoding = $self->{'default_encoding'};
1261	}
1262	$best_language = $language;
1263	if (! $best_encoding ) { # may already be set... eg from html meta tag
1264	$best_encoding = $encoding;
1265	}
1266	}
1267
1268	my $text_copy = $text;
1269	if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) {
1270	# the text is valid utf8, so assume that's the real encoding
1271	# (since textcat is based on probabilities)
1272	$best_encoding = 'utf8';
1273	}
1274
1275	# check for equivalents where textcat doesn't have some encodings...
1276	# eg MS versions of standard encodings
1277	if ($best_encoding =~ /^iso_8859_(\d+)/) {
1278	my $iso = $1; # which variant of the iso standard?
1279	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
1280	if ($text =~ /[\x80-\x9f]/) {
1281	# Western Europe
1282	if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
1283	elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
1284	elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
1285	elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
1286	elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
1287	elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
1288	elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
1289	}
1290	}
1291
1292	if ($best_encoding !~ /^(ascii\|utf8\|unicode)$/ &&
1293	!defined $encodings::encodings->{$best_encoding}) {
1294	if ($self->{'verbosity'}) {
1295	gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
1296	$filename, $best_encoding, $self->{'default_encoding'});
1297	}
1298	$best_encoding = $self->{'default_encoding'};
1299	}
1300
1301	return ($best_language, $best_encoding);
1302	}
1303
1304	# add any extra metadata that's been passed around from one
1305	# plugin to another.
1306	# extra_metadata uses add_utf8_metadata so it expects metadata values
1307	# to already be in utf8
1308	sub extra_metadata {
1309	my $self = shift (@_);
1310	my ($doc_obj, $cursection, $metadata) = @_;
1311
1312	my $associate_tail_re = $self->{'associate_tail_re'};
1313
1314	foreach my $field (keys(%$metadata)) {
1315	# $metadata->{$field} may be an array reference
1316	if ($field eq "gsdlassocfile_tobe") {
1317	# 'gsdlassocfile_tobe' is artificially introduced metadata
1318	# that is used to signal that certain additional files should
1319	# be tied to this document. Useful in situations where a
1320	# metadata pass in the plugin pipeline works out some files
1321	# need to be associated with a document, but the document hasn't
1322	# been formed yet.
1323
1324	my $equiv_form = "";
1325	foreach my $gaf (@{$metadata->{$field}}) {
1326	my ($full_filename,$mimetype) = ($gaf =~ m/^(.):(.):$/);
1327	my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1328	my $filename = $full_filename;
1329
1330	$doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1331
1332	# work out extended tail extension (i.e. matching tail re)
1333
1334	my ($file_prefix,$file_extended_ext)
1335	= $self->root_ext_split($tail_filename,$associate_tail_re);
1336	my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.)\..$/);
1337
1338	my ($doc_ext) = ($tail_filename =~ m/^.\.(.)$/);
1339	my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
1340	my $srcicon = "_icon".$doc_ext."_";
1341	my $end_doclink = "</a>";
1342
1343	my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1344
1345	if (defined $pre_doc_ext) {
1346	# for metadata such as [mp3._edited] [mp3._full] ...
1347	$doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1348	}
1349
1350	# for multiple metadata such as [mp3.assoclink]
1351	$doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1352
1353	$equiv_form .= " $assoc_form";
1354	}
1355	$doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1356	}
1357	elsif (ref ($metadata->{$field}) eq "ARRAY") {
1358	map {
1359	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
1360	} @{$metadata->{$field}};
1361	} else {
1362	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1363	}
1364	}
1365	}
1366
1367	# initialise metadata extractors
1368	sub initialise_extractors {
1369	my $self = shift (@_);
1370
1371	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
1372	&acronym::initialise_acronyms();
1373	}
1374	}
1375
1376	# finalise metadata extractors
1377	sub finalise_extractors {
1378	my $self = shift (@_);
1379
1380	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
1381	&acronym::finalise_acronyms();
1382	}
1383	}
1384
1385	# FIRSTNNN: extract the first NNN characters as metadata
1386	sub extract_first_NNNN_characters {
1387	my $self = shift (@_);
1388	my ($textref, $doc_obj, $thissection) = @_;
1389
1390	foreach my $size (split /,/, $self->{'first'}) {
1391	my $tmptext = $$textref;
1392	$tmptext =~ s/^\s+//;
1393	$tmptext =~ s/\s+$//;
1394	$tmptext =~ s/\s+/ /gs;
1395	$tmptext = substr ($tmptext, 0, $size);
1396	$tmptext =~ s/\s\S*$/…/;
1397	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1398	}
1399	}
1400
1401	sub extract_email {
1402	my $self = shift (@_);
1403	my ($textref, $doc_obj, $thissection) = @_;
1404	my $outhandle = $self->{'outhandle'};
1405
1406	gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
1407	if ($self->{'verbosity'} > 2);
1408
1409	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
1410	@email = sort @email;
1411
1412	# if($self->{"new_extract_email"} == 0)
1413	# {
1414	# my @email2 = ();
1415	# foreach my $address (@email)
1416	# {
1417	# if (!(join(" ",@email2) =~ m/(^\| )$address( \|$)/ ))
1418	# {
1419	# push @email2, $address;
1420	# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1421	# # print $outhandle " extracting $address\n"
1422	# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1423	# if ($self->{'verbosity'} > 3);
1424	# }
1425	# }
1426	# }
1427	# else
1428	# {
1429	my $hashExistMail = {};
1430	foreach my $address (@email) {
1431	if (!(defined $hashExistMail->{$address}))
1432	{
1433	$hashExistMail->{$address} = 1;
1434	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1435	gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1436	if ($self->{'verbosity'} > 3);
1437	}
1438	}
1439	gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
1440	if ($self->{'verbosity'} > 2);
1441	}
1442
1443	# extract metadata
1444	sub auto_extract_metadata {
1445
1446	my $self = shift (@_);
1447	my ($doc_obj) = @_;
1448
1449	if ($self->{'extract_email'}) {
1450	my $thissection = $doc_obj->get_top_section();
1451	while (defined $thissection) {
1452	my $text = $doc_obj->get_text($thissection);
1453	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1454	$thissection = $doc_obj->get_next_section ($thissection);
1455	}
1456	}
1457	if ($self->{'extract_placenames'}) {
1458	my $thissection = $doc_obj->get_top_section();
1459	while (defined $thissection) {
1460	my $text = $doc_obj->get_text($thissection);
1461	$self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1462	$thissection = $doc_obj->get_next_section ($thissection);
1463	}
1464	}
1465
1466	if ($self->{'extract_keyphrases'} \|\| $self->{'extract_keyphrases_kea4'}) {
1467	$self->extract_keyphrases($doc_obj);
1468	}
1469
1470	if ($self->{'first'}) {
1471	my $thissection = $doc_obj->get_top_section();
1472	while (defined $thissection) {
1473	my $text = $doc_obj->get_text($thissection);
1474	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1475	$thissection = $doc_obj->get_next_section ($thissection);
1476	}
1477	}
1478
1479	if ($self->{'extract_acronyms'}) {
1480	my $thissection = $doc_obj->get_top_section();
1481	while (defined $thissection) {
1482	my $text = $doc_obj->get_text($thissection);
1483	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1484	$thissection = $doc_obj->get_next_section ($thissection);
1485	}
1486	}
1487
1488	if ($self->{'markup_acronyms'}) {
1489	my $thissection = $doc_obj->get_top_section();
1490	while (defined $thissection) {
1491	my $text = $doc_obj->get_text($thissection);
1492	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1493	$doc_obj->delete_text($thissection);
1494	$doc_obj->add_text($thissection, $text);
1495	$thissection = $doc_obj->get_next_section ($thissection);
1496	}
1497	}
1498
1499	if($self->{'extract_historical_years'}) {
1500	my $thissection = $doc_obj->get_top_section();
1501	while (defined $thissection) {
1502
1503	my $text = $doc_obj->get_text($thissection);
1504	&DateExtract::get_date_metadata($text, $doc_obj,
1505	$thissection,
1506	$self->{'no_bibliography'},
1507	$self->{'maximum_year'},
1508	$self->{'maximum_century'});
1509	$thissection = $doc_obj->get_next_section ($thissection);
1510	}
1511	}
1512	}
1513
1514
1515	#adding kea keyphrases
1516	sub extract_keyphrases
1517	{
1518	my $self = shift(@_);
1519	my $doc_obj = shift(@_);
1520
1521	# Use Kea 3.0 unless 4.0 has been specified
1522	my $kea_version = "3.0";
1523	if ($self->{'extract_keyphrases_kea4'}) {
1524	$kea_version = "4.0";
1525	}
1526
1527	# Check that Kea exists, and tell the user where to get it if not
1528	my $keahome = &Kea::get_Kea_directory($kea_version);
1529	if (!-e $keahome) {
1530	gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version);
1531	return;
1532	}
1533
1534	my $thissection = $doc_obj->get_top_section();
1535	my $text = "";
1536	my $list;
1537
1538	#loop through sections to gather whole doc
1539	while (defined $thissection) {
1540	my $sectiontext = $doc_obj->get_text($thissection);
1541	$text = $text.$sectiontext;
1542	$thissection = $doc_obj->get_next_section ($thissection);
1543	}
1544
1545	if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
1546	$list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});
1547	} else { #otherwise call Kea with no options
1548	$list = &Kea::extract_KeyPhrases ($kea_version, $text);
1549	}
1550
1551	if ($list){
1552	# if a list of kea keyphrases was returned (ie not empty)
1553	if ($self->{'verbosity'}) {
1554	gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1555	}
1556
1557	#add metadata to top section
1558	$thissection = $doc_obj->get_top_section();
1559
1560	# add all key phrases as one metadata
1561	$doc_obj->add_metadata($thissection, "Keyphrases", $list);
1562
1563	# add individual key phrases as multiple metadata
1564	foreach my $keyphrase (split(',', $list)) {
1565	$keyphrase =~ s/^\s+\|\s+$//g;
1566	$doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1567	}
1568	}
1569	}
1570
1571
1572	# extract acronyms from a section in a document. progress is
1573	# reported to outhandle based on the verbosity. both the Acronym
1574	# and the AcronymKWIC metadata items are created.
1575
1576	sub extract_acronyms {
1577	my $self = shift (@_);
1578	my ($textref, $doc_obj, $thissection) = @_;
1579	my $outhandle = $self->{'outhandle'};
1580
1581	# print $outhandle " extracting acronyms ...\n"
1582	gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
1583	if ($self->{'verbosity'} > 2);
1584
1585	my $acro_array = &acronym::acronyms($textref);
1586
1587	foreach my $acro (@$acro_array) {
1588
1589	#check that this is the first time ...
1590	my $seen_before = "false";
1591	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1592	foreach my $thisAcro (@$previous_data) {
1593	if ($thisAcro eq $acro->to_string()) {
1594	$seen_before = "true";
1595	if ($self->{'verbosity'} >= 4) {
1596	gsprintf($outhandle, " {BasPlug.already_seen} " .
1597	$acro->to_string() . "\n");
1598	}
1599	}
1600	}
1601
1602	if ($seen_before eq "false") {
1603	#write it to the file ...
1604	$acro->write_to_file();
1605
1606	#do the normal acronym
1607	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
1608	gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
1609	if ($self->{'verbosity'} > 3);
1610	}
1611	}
1612
1613	gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
1614	if ($self->{'verbosity'} > 2);
1615	}
1616
1617	sub markup_acronyms {
1618	my $self = shift (@_);
1619	my ($text, $doc_obj, $thissection) = @_;
1620	my $outhandle = $self->{'outhandle'};
1621
1622	gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
1623	if ($self->{'verbosity'} > 2);
1624
1625	#self is passed in to check for verbosity ...
1626	$text = &acronym::markup_acronyms($text, $self);
1627
1628	gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
1629	if ($self->{'verbosity'} > 2);
1630
1631	return $text;
1632	}
1633
1634	sub compile_stats {
1635	my $self = shift(@_);
1636	my ($stats) = @_;
1637
1638	$stats->{'num_processed'} += $self->{'num_processed'};
1639	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
1640	$stats->{'num_archives'} += $self->{'num_archives'};
1641
1642	}
1643
1644	sub associate_cover_image {
1645	my $self = shift;
1646	my ($doc_obj, $filename) = @_;
1647
1648	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1649	if (exists $self->{'covers_missing_cache'}->{$filename}) {
1650	# don't stat() for existence eg for multiple document input files
1651	# (eg SplitPlug)
1652	return;
1653	}
1654
1655	my $top_section=$doc_obj->get_top_section();
1656
1657	if (-e $filename) {
1658	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1659	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1660	} else {
1661	my $upper_filename = $filename;
1662	$upper_filename =~ s/jpg$/JPG/;
1663	if (-e $upper_filename) {
1664	$doc_obj->associate_file($upper_filename, "cover.jpg",
1665	"image/jpeg");
1666	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1667	} else {
1668	# file doesn't exist, so record the fact that it's missing so
1669	# we don't stat() again (stat is slow)
1670	$self->{'covers_missing_cache'}->{$filename} = 1;
1671	}
1672	}
1673
1674	}
1675
1676
1677	# Overridden by exploding plugins (eg. ISISPlug)
1678	sub clean_up_after_exploding
1679	{
1680	my $self = shift(@_);
1681	}
1682
1683
1684	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: