Context Navigation

source: gsdl/trunk/perllib/plugins/BasPlug.pm@ 15018

Last change on this file since 15018 was 15018, checked in by davidb, 16 years ago
Marc mapping upgraded to support richer set of operations, including subfields, multiple fields in one line (separated by comma), and the removal of rules, e.g. -245 at the start of a line. A Marc to Qualified Dublin Core crosswalk from the Library of congress has been added as "etc/marc2qdc.txt". A collection can then choose to, for example, top up the mapping with its own version of the file stored in its local "etc" folder, specifying only the rules that are different. This is where a rule like "-245" might be used to override a more general rule from the main file that has all subfields in 245 mapping to one metadata item (Title). If the user specifies a different different filename -- through a plugin option -- then they are free to divise a mapping from scratch and store it in the collections local "etc" folder.
Property svn:keywords set to `Author Date Id Revision`
File size: 49.4 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	BEGIN {
29	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30	}
31
32	eval {require bytes};
33
34	# suppress the annoying "subroutine redefined" warning that various
35	# plugins cause under perl 5.6
36	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38	use strict;
39	no strict 'subs';
40	no strict 'refs'; # allow filehandles to be variables and viceversa
41
42	use File::Basename;
43
44	use Kea;
45	use multiread;
46	use encodings;
47	use unicode;
48	use cnseg;
49	use acronym;
50	use textcat;
51	use doc;
52	eval "require diagnostics"; # some perl distros (eg mac) don't have this
53	use DateExtract;
54	use ghtml;
55	use gsprintf 'gsprintf';
56	use printusage;
57	use parse2;
58
59
60	use GISBasPlug;
61
62	@BasPlug::ISA = ( GISBasPlug );
63
64	my $unicode_list =
65	[ { 'name' => "ascii",
66	'desc' => "{BasPlug.input_encoding.ascii}" },
67	{ 'name' => "utf8",
68	'desc' => "{BasPlug.input_encoding.utf8}" },
69	{ 'name' => "unicode",
70	'desc' => "{BasPlug.input_encoding.unicode}" } ];
71
72	my $auto_unicode_list =
73	[ { 'name' => "auto",
74	'desc' => "{BasPlug.input_encoding.auto}" } ];
75
76	my $e = $encodings::encodings;
77	foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
78	{
79	my $hashEncode =
80	{'name' => $enc,
81	'desc' => $e->{$enc}->{'name'}};
82
83	push(@{$unicode_list},$hashEncode);
84	}
85
86	push(@{$auto_unicode_list},@{$unicode_list});
87
88	my $arguments =
89	[ { 'name' => "process_exp",
90	'desc' => "{BasPlug.process_exp}",
91	'type' => "regexp",
92	'deft' => "",
93	'reqd' => "no" },
94	{ 'name' => "block_exp",
95	'desc' => "{BasPlug.block_exp}",
96	'type' => "regexp",
97	'deft' => "",
98	'reqd' => "no" },
99	{ 'name' => "smart_block",
100	'desc' => "{BasPlug.smart_block}",
101	'type' => "flag",
102	'reqd' => "no" },
103	{ 'name' => "associate_ext",
104	'desc' => "{BasPlug.associate_ext}",
105	'type' => "string",
106	'reqd' => "no" },
107	{ 'name' => "associate_tail_re",
108	'desc' => "{BasPlug.associate_tail_re}",
109	'type' => "string",
110	'reqd' => "no" },
111	{ 'name' => "use_as_doc_identifier",
112	'desc' => "{BasPlug.use_as_doc_identifier}",
113	'type' => "string",
114	'reqd' => "no" ,
115	'deft' => "" } ,
116	{ 'name' => "input_encoding",
117	'desc' => "{BasPlug.input_encoding}",
118	'type' => "enum",
119	'list' => $auto_unicode_list,
120	'reqd' => "no" ,
121	'deft' => "auto" } ,
122	{ 'name' => "default_encoding",
123	'desc' => "{BasPlug.default_encoding}",
124	'type' => "enum",
125	'list' => $unicode_list,
126	'reqd' => "no",
127	'deft' => "utf8" },
128	{ 'name' => "extract_language",
129	'desc' => "{BasPlug.extract_language}",
130	'type' => "flag",
131	'reqd' => "no" },
132	{ 'name' => "default_language",
133	'desc' => "{BasPlug.default_language}",
134	'type' => "string",
135	'deft' => "en",
136	'reqd' => "no" },
137	{ 'name' => "extract_acronyms",
138	'desc' => "{BasPlug.extract_acronyms}",
139	'type' => "flag",
140	'reqd' => "no" },
141	{ 'name' => "markup_acronyms",
142	'desc' => "{BasPlug.markup_acronyms}",
143	'type' => "flag",
144	'reqd' => "no" },
145	{ 'name' => "extract_keyphrases",
146	'desc' => "{BasPlug.extract_keyphrases}",
147	'type' => "flag",
148	'reqd' => "no" },
149	{ 'name' => "extract_keyphrases_kea4",
150	'desc' => "{BasPlug.extract_keyphrases_kea4}",
151	'type' => "flag",
152	'reqd' => "no" },
153	{ 'name' => "extract_keyphrase_options",
154	'desc' => "{BasPlug.extract_keyphrase_options}",
155	'type' => "string",
156	'deft' => "",
157	'reqd' => "no" },
158	{ 'name' => "first",
159	'desc' => "{BasPlug.first}",
160	'type' => "string",
161	'reqd' => "no" },
162	{ 'name' => "extract_email",
163	'desc' => "{BasPlug.extract_email}",
164	'type' => "flag",
165	'reqd' => "no" },
166	{ 'name' => "extract_historical_years",
167	'desc' => "{BasPlug.extract_historical_years}",
168	'type' => "flag",
169	'reqd' => "no" },
170	{ 'name' => "maximum_year",
171	'desc' => "{BasPlug.maximum_year}",
172	'type' => "int",
173	'deft' => (localtime)[5]+1900,
174	'char_length' => "4",
175	#'range' => "2,100",
176	'reqd' => "no"},
177	{ 'name' => "maximum_century",
178	'desc' => "{BasPlug.maximum_century}",
179	'type' => "string",
180	'deft' => "-1",
181	'reqd' => "no" },
182	{ 'name' => "no_bibliography",
183	'desc' => "{BasPlug.no_bibliography}",
184	'type' => "flag",
185	'reqd' => "no"},
186	{ 'name' => "no_cover_image",
187	'desc' => "{BasPlug.no_cover_image}",
188	'type' => "flag",
189	'reqd' => "no" },
190	{ 'name' => "separate_cjk",
191	'desc' => "{BasPlug.separate_cjk}",
192	'type' => "flag",
193	'reqd' => "no",
194	'hiddengli' => "yes" },
195	{ 'name' => "new_extract_email",
196	'desc' => "",
197	'type' => "flag",
198	'reqd' => "no",
199	'hiddengli' => "yes" } ];
200
201	my $gis_arguments =
202	[ { 'name' => "extract_placenames",
203	'desc' => "{GISBasPlug.extract_placenames}",
204	'type' => "flag",
205	'reqd' => "no" },
206	{ 'name' => "gazetteer",
207	'desc' => "{GISBasPlug.gazetteer}",
208	'type' => "string",
209	'reqd' => "no" },
210	{ 'name' => "place_list",
211	'desc' => "{GISBasPlug.place_list}",
212	'type' => "flag",
213	'reqd' => "no" } ];
214
215
216	my $options = { 'name' => "BasPlug",
217	'desc' => "{BasPlug.desc}",
218	'abstract' => "yes",
219	'inherits' => "no",
220	'args' => $arguments };
221
222
223	sub set_incremental {
224	my $self = shift(@_);
225	my ($incremental) = @_;
226
227	$self->{'incremental'} = $incremental;
228	}
229
230	sub get_arguments
231	{
232	my $self = shift(@_);
233	my $optionlistref = $self->{'option_list'};
234	my @optionlist = @$optionlistref;
235	my $pluginoptions = pop(@$optionlistref);
236	my $pluginarguments = $pluginoptions->{'args'};
237	return $pluginarguments;
238	}
239
240
241	sub print_xml_usage
242	{
243	my $self = shift(@_);
244	my $header = shift(@_);
245	my $high_level_information_only = shift(@_);
246
247	# XML output is always in UTF-8
248	gsprintf::output_strings_in_UTF8;
249
250	if ($header) {
251	&PrintUsage::print_xml_header("plugin");
252	}
253	$self->print_xml($high_level_information_only);
254	}
255
256
257	sub print_xml
258	{
259	my $self = shift(@_);
260	my $high_level_information_only = shift(@_);
261
262	my $optionlistref = $self->{'option_list'};
263	my @optionlist = @$optionlistref;
264	my $pluginoptions = shift(@$optionlistref);
265	return if (!defined($pluginoptions));
266
267	# Find the process and block default expressions in the plugin arguments
268	my $process_exp = "";
269	my $block_exp = "";
270	if (defined($pluginoptions->{'args'})) {
271	foreach my $option (@{$pluginoptions->{'args'}}) {
272	if ($option->{'name'} eq "process_exp") {
273	$process_exp = $option->{'deft'};
274	}
275	if ($option->{'name'} eq "block_exp") {
276	$block_exp = $option->{'deft'};
277	}
278	}
279	}
280
281	gsprintf(STDERR, "<PlugInfo>\n");
282	gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
283	my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
284	$desc =~ s/</&lt;/g; # doubly escaped
285	$desc =~ s/>/&gt;/g;
286	gsprintf(STDERR, " <Desc>$desc</Desc>\n");
287	gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
288	gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
289	gsprintf(STDERR, " <Processes>$process_exp</Processes>\n");
290	gsprintf(STDERR, " <Blocks>$block_exp</Blocks>\n");
291	gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} \|\| "no") . "</Explodes>\n");
292	unless (defined($high_level_information_only)) {
293	gsprintf(STDERR, " <Arguments>\n");
294	if (defined($pluginoptions->{'args'})) {
295	&PrintUsage::print_options_xml($pluginoptions->{'args'});
296	}
297	gsprintf(STDERR, " </Arguments>\n");
298
299	# Recurse up the plugin hierarchy
300	$self->print_xml();
301	}
302	gsprintf(STDERR, "</PlugInfo>\n");
303	}
304
305
306	sub print_txt_usage
307	{
308	my $self = shift(@_);
309	# Print the usage message for a plugin (recursively)
310	my $descoffset = $self->determine_description_offset(0);
311	$self->print_plugin_usage($descoffset, 1);
312	}
313
314
315	sub determine_description_offset
316	{
317	my $self = shift(@_);
318	my $maxoffset = shift(@_);
319
320	my $optionlistref = $self->{'option_list'};
321	my @optionlist = @$optionlistref;
322	my $pluginoptions = shift(@$optionlistref);
323	return $maxoffset if (!defined($pluginoptions));
324
325	# Find the length of the longest option string of this plugin
326	my $pluginargs = $pluginoptions->{'args'};
327	if (defined($pluginargs)) {
328	my $longest = &PrintUsage::find_longest_option_string($pluginargs);
329	if ($longest > $maxoffset) {
330	$maxoffset = $longest;
331	}
332	}
333
334	# Recurse up the plugin hierarchy
335	$maxoffset = $self->determine_description_offset($maxoffset);
336	$self->{'option_list'} = \@optionlist;
337	return $maxoffset;
338	}
339
340
341	sub print_plugin_usage
342	{
343	my $self = shift(@_);
344	my $descoffset = shift(@_);
345	my $isleafclass = shift(@_);
346
347	my $optionlistref = $self->{'option_list'};
348	my @optionlist = @$optionlistref;
349	my $pluginoptions = shift(@$optionlistref);
350	return if (!defined($pluginoptions));
351
352	my $pluginname = $pluginoptions->{'name'};
353	my $pluginargs = $pluginoptions->{'args'};
354	my $plugindesc = $pluginoptions->{'desc'};
355
356	# Produce the usage information using the data structure above
357	if ($isleafclass) {
358	if (defined($plugindesc)) {
359	gsprintf(STDERR, "$plugindesc\n\n");
360	}
361	gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
362	}
363
364	# Display the plugin options, if there are some
365	if (defined($pluginargs)) {
366	# Calculate the column offset of the option descriptions
367	my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
368
369	if ($isleafclass) {
370	gsprintf(STDERR, " {common.specific_options}:\n");
371	}
372	else {
373	gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
374	}
375
376	# Display the plugin options
377	&PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
378	}
379
380	# Recurse up the plugin hierarchy
381	$self->print_plugin_usage($descoffset, 0);
382	$self->{'option_list'} = \@optionlist;
383	}
384
385
386	sub new {
387	# Set Encodings to the list!!
388
389
390	# Start the BasPlug Constructor
391	my $class = shift (@_);
392	my ($pluginlist,$args,$hashArgOptLists) = @_;
393	push(@$pluginlist, $class);
394	my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
395
396	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
397	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
398
399	if (GISBasPlug::has_mapdata()) {
400	push(@$arguments,@$gis_arguments);
401	}
402
403	my $self = {};
404	$self->{'outhandle'} = STDERR;
405	$self->{'option_list'} = $hashArgOptLists->{"OptList"};
406	$self->{"info_only"} = 0;
407
408	# Check if gsdlinfo is in the argument list or not - if it is, don't parse
409	# the args, just return the object.
410	foreach my $strArg (@{$args})
411	{
412	if($strArg eq "-gsdlinfo")
413	{
414	$self->{"info_only"} = 1;
415	return bless $self, $class;
416	}
417	}
418
419	if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
420	{
421	my $classTempClass = bless $self, $class;
422	print STDERR "<BadPlugin p=$plugin_name>\n";
423	&gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
424	$classTempClass->print_txt_usage(""); # Use default resource bundle
425	die "\n";
426	}
427
428
429	delete $self->{"info_only"};
430	# else parsing was successful.
431
432	$self->{'plugin_type'} = $plugin_name;
433	#$self->{'outhandle'} = STDERR;
434	$self->{'num_processed'} = 0;
435	$self->{'num_not_processed'} = 0;
436	$self->{'num_blocked'} = 0;
437	$self->{'num_archives'} = 0;
438	$self->{'cover_image'} = 1; # cover image is on by default
439	$self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
440	#$self->{'option_list'} = $hashArgOptLists->{"OptList"};
441
442	my $associate_ext = $self->{'associate_ext'};
443	if ((defined $associate_ext) && ($associate_ext ne "")) {
444
445	my $associate_tail_re = $self->{'associate_tail_re'};
446	if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
447	my $outhandle = $self->{'outhandle'};
448	print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
449	print $outhandle " defaulting to 'associate_tail_re'\n";
450	}
451	else {
452	my @exts = split(/,/,$associate_ext);
453
454	my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
455	my $associate_tail_re = join("\|",@exts_bracketed);
456	$self->{'associate_tail_re'} = $associate_tail_re;
457	}
458
459	delete $self->{'associate_ext'};
460	}
461
462	$self->{'shared_fileroot'} = {};
463	$self->{'file_blocks'} = {};
464
465	if ($self->{'extract_placenames'}) {
466
467	my $outhandle = $self->{'outhandle'};
468
469	my $places_ref
470	= GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
471
472	if (!defined $places_ref) {
473	print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
474	print $outhandle " No placename extraction will take place.\n";
475	$self->{'extract_placenames'} = undef;
476	}
477	else {
478	$self->{'places'} = $places_ref;
479	}
480	}
481
482	return bless $self, $class;
483
484	}
485
486	# initialize BasPlug options
487	# if init() is overridden in a sub-class, remember to call BasPlug::init()
488	sub init {
489	my $self = shift (@_);
490	my ($verbosity, $outhandle, $failhandle) = @_;
491
492	# verbosity is passed through from the processor
493	$self->{'verbosity'} = $verbosity;
494
495	# as are the outhandle and failhandle
496	$self->{'outhandle'} = $outhandle if defined $outhandle;
497	$self->{'failhandle'} = $failhandle;
498
499	# set process_exp and block_exp to defaults unless they were
500	# explicitly set
501
502	if ((!$self->is_recursive()) and
503	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
504
505	$self->{'process_exp'} = $self->get_default_process_exp ();
506	if ($self->{'process_exp'} eq "") {
507	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
508	}
509	}
510
511	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
512	$self->{'block_exp'} = $self->get_default_block_exp ();
513	}
514
515	}
516
517	sub begin {
518	my $self = shift (@_);
519	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
520	$self->initialise_extractors();
521	}
522
523	sub end {
524	# potentially called at the end of each plugin pass
525	# import.pl only has one plugin pass, but buildcol.pl has multiple ones
526
527	my ($self) = @_;
528	$self->finalise_extractors();
529	}
530
531	sub deinit {
532	# called only once, after all plugin passes have been done
533
534	my ($self) = @_;
535	}
536
537	# this function should be overridden to return 1
538	# in recursive plugins
539	sub is_recursive {
540	my $self = shift (@_);
541
542	return 0;
543	}
544
545	sub get_default_block_exp {
546	my $self = shift (@_);
547
548	return "";
549	}
550
551	sub get_default_process_exp {
552	my $self = shift (@_);
553
554	return "";
555	}
556
557	# default implementation is to do nothing.
558	sub store_block_files
559	{
560	my $self =shift (@_);
561	my ($filename) = @_;
562	return;
563	}
564
565	#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
566	sub block_cover_image
567	{
568	my $self =shift;
569	my $filename = shift;
570
571	if ($self->{'cover_image'}) {
572	my $coverfile = $filename;
573	$coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
574	if (!-e $coverfile) {
575	$coverfile =~ s/jpg$/JPG/;
576	}
577	if (-e $coverfile) {
578	$self->{'file_blocks'}->{$coverfile} = 1;
579	}
580	}
581
582	return;
583	}
584
585	sub root_ext_split
586	{
587	my $self = shift (@_);
588	my ($filename,$tail_re) = @_;
589
590	my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
591
592	if ((!defined $file_prefix) \|\| (!defined $file_ext)) {
593	($file_prefix,$file_ext) = ($filename =~ m/^(.)(\..?)$/);
594	}
595
596	return ($file_prefix,$file_ext);
597	}
598
599	sub metadata_read {
600	my $self = shift (@_);
601	my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
602	# Keep track of filenames with same root but different extensions
603	# Used to support -associate_ext and the more generalised
604	# -associate_tail_re
605
606	my $associate_tail_re = $self->{'associate_tail_re'};
607	if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
608
609	my ($file_prefix,$file_ext)
610	= $self->root_ext_split($file,$associate_tail_re);
611
612	if ((defined $file_prefix) && (defined $file_ext)) {
613
614	my $shared_fileroot = $self->{'shared_fileroot'};
615	if (!defined $shared_fileroot->{$file_prefix}) {
616	my $file_prefix_rec = { 'tie_to' => undef,
617	'exts' => {} };
618	$shared_fileroot->{$file_prefix} = $file_prefix_rec;
619	}
620
621	my $file_prefix_rec = $shared_fileroot->{$file_prefix};
622
623	my $process_exp = $self->{'process_exp'};
624
625	if ($file =~ m/$process_exp/) {
626	# This is the document the others should be tied to
627	$file_prefix_rec->{'tie_to'} = $file_ext;
628	}
629	else {
630	if ($file_ext =~ m/$associate_tail_re$/) {
631	$file_prefix_rec->{'exts'}->{$file_ext} = 1;
632	}
633	}
634
635	}
636	}
637
638	# now check whether we are actually processing this
639	my $filename = $file;
640	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
641	if ($self->{'process_exp'} eq "" \|\| $filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
642	return undef; # can't recognise
643	}
644
645	# do smart blocking if appropriate
646	if ($self->{'smart_block'}) {
647	$self->store_block_files($filename);
648	}
649	# block the cover image if there is one
650	if ($self->{'cover_image'}) {
651	$self->block_cover_image($filename);
652	}
653
654	return 1;
655	}
656
657	sub tie_to_filename
658	{
659	my $self = shift (@_);
660
661	my ($file_ext,$file_prefix_rec) = @_;
662
663	if (defined $file_prefix_rec) {
664	my $tie_to = $file_prefix_rec->{'tie_to'};
665
666	if (defined $tie_to) {
667	if ($tie_to eq $file_ext) {
668	return 1;
669	}
670	}
671	}
672
673	return 0;
674	}
675
676	sub tie_to_assoc_file
677	{
678	my $self = shift (@_);
679	my ($file_ext,$file_prefix_rec) = @_;
680
681	if (defined $file_prefix_rec) {
682	my $tie_to = $file_prefix_rec->{'tie_to'};
683	if (defined $tie_to) {
684
685	my $exts = $file_prefix_rec->{'exts'};
686
687	my $has_file_ext = $exts->{$file_ext};
688
689	if ($has_file_ext) {
690	return 1;
691	}
692	}
693	}
694
695	return 0;
696	}
697
698
699	sub associate_with
700	{
701	my $self = shift (@_);
702	my ($file, $filename, $metadata) = @_;
703
704	my $associate_tail_re = $self->{'associate_tail_re'};
705	return 0 if (!$associate_tail_re);
706
707	# If file, see if matches with "tie_to" doc or is one of the
708	# associated filename extensions.
709
710	my ($file_prefix,$file_ext) = $self->root_ext_split($file,$associate_tail_re);
711
712	if ((defined $file_prefix) && (defined $file_ext)) {
713
714	my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
715
716	if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
717
718	# Set up gsdlassocfile_tobe
719
720	my $exts = $file_prefix_rec->{'exts'};
721
722	if (!defined $metadata->{'gsdlassocfile_tobe'}) {
723	$metadata->{'gsdlassocfile_tobe'} = [];
724	}
725
726	my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
727
728	my ($full_prefix) = ($filename =~ m/^(.)\..?$/);
729	foreach my $e (keys %$exts) {
730	my $assoc_file = "$full_prefix$e";
731	print STDERR " $self->{'plugin_type'}: Associating $file_prefix$e with $file_prefix_rec->{'tie_to'} version\n";
732	my $mime_type = ""; # let system auto detect this
733	push(@$assoc_tobe,"$assoc_file:$mime_type:");
734	}
735
736	}
737	elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
738
739
740	# a form of smart block
741	return 1;
742	}
743	}
744
745	return 0;
746	}
747
748
749	sub read_block {
750	my $self = shift (@_);
751
752	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
753
754
755	my $filename = $file;
756	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
757
758	if ($self->associate_with($file,$filename,$metadata)) {
759	# a form of smart block
760	$self->{'num_blocked'} ++;
761	return (0,undef); # blocked
762	}
763
764	my $smart_block = $self->{'smart_block'};
765	my $smart_block_BN = $self->{'smart_block_BN'};
766
767	if ($smart_block \|\| $smart_block_BN) {
768	if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
769	$self->{'num_blocked'} ++;
770	return (0,undef); # blocked
771	}
772	} else {
773	if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
774	$self->{'num_blocked'} ++;
775	return (0,undef); # blocked
776	}
777	if ($self->{'cover_image'}) {
778	if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
779	$self->{'num_blocked'} ++;
780	return (0,undef); # blocked
781	}
782	}
783	}
784
785	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
786	return (undef,undef); # can't recognise
787	}
788
789	return (1,$filename);
790	}
791
792	sub read_tidy_file {
793
794	my $self = shift (@_);
795
796	my ($file) = @_;
797
798	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
799
800	return $file;
801	}
802
803
804	sub filename_to_metadata
805	{
806	my $self = shift (@_);
807	my ($file, $encoding) = @_;
808
809	my $outhandle = $self->{'outhandle'};
810
811	my $filesystem_encoding = undef;
812
813	eval {
814	use POSIX qw(locale_h);
815
816	# With only one parameter, setlocale retrieves the current value
817	my $current_locale = setlocale(LC_CTYPE);
818
819	if ($current_locale =~ m/^.\.(.?)$/) {
820	my $char_encoding = lc($1);
821	$char_encoding =~ s/-/_/g;
822	$char_encoding =~ s/^utf_8$/utf8/;
823
824	if ($char_encoding =~ m/^\d+$/) {
825	if (defined $encodings::encoding->{"windows_$char_encoding"}) {
826	$char_encoding = "windows_$char_encoding";
827	}
828	elsif (defined $encodings::encoding->{"dos_$char_encoding"}) {
829	$char_encoding = "dos_$char_encoding";
830	}
831	}
832
833	if (($char_encoding =~ m/(?:ascii\|utf8\|unicode)/)
834	\|\| (defined $encodings::encoding->{$char_encoding})) {
835	$filesystem_encoding = $char_encoding;
836	}
837	else {
838	print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
839	}
840	}
841
842
843	};
844	if ($@) {
845	print $outhandle "$@\n";
846	print $outhandle "Warning: Unable to establish locale. Will assume filesytem is UTF-8\n";
847
848	}
849
850	my ($filemeta) = $file =~ /([^\\\/]+)$/;
851
852	# how do we know what encoding the filename is in?
853	# => one answer is to check the locale
854
855	if (defined $filesystem_encoding) {
856	if ($filesystem_encoding !~ /(?:ascii\|utf8\|unicode)/) {
857	$filemeta = unicode::unicode2utf8(
858	unicode::convert2unicode($filesystem_encoding, \$filemeta)
859	);
860	}
861	}
862	# assume it is in the same encoding as its contents
863	elsif ((defined $encoding) && ($encoding !~ /(?:ascii\|utf8\|unicode)/)) {
864	$filemeta = unicode::unicode2utf8(
865	unicode::convert2unicode($encoding, \$filemeta)
866	);
867	}
868
869	my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
870
871	return $dmsafe_filemeta;
872	}
873
874
875	sub add_OID
876	{
877	my $self = shift (@_);
878	my ($doc_obj) = @_;
879
880	# See if a metadata field is specified as the field
881	if ((defined $self->{'use_as_doc_identifier'}) && ($self->{'use_as_doc_identifier'} ne "")) {
882	my $metadata_doc_id = $self->{'use_as_doc_identifier'};
883
884	# Consider "tidying" up metadata_doc_id to be something
885	# suitable in a URL
886	# Could even support a user specified plugin RE for this.
887
888	my $top_section = $doc_obj->get_top_section();
889	my $oid = $doc_obj->get_metadata_element($top_section,$metadata_doc_id);
890	## print STDERR "**** oid = $oid\n";
891	$doc_obj->set_OID($oid);
892	}
893	# See if there is a plugin-specific set_OID function...
894	elsif (defined ($self->can('set_OID'))) {
895	# it will need $doc_obj to set the Identifier metadata...
896	$self->set_OID(@_); # pass through any extra arguments supplied
897	} else {
898	# use the default set_OID() in doc.pm
899	$doc_obj->set_OID();
900	}
901	}
902
903	# The BasPlug read_into_doc_obj() function. This function does all the
904	# right things to make general options work for a given plugin. It reads in
905	# a file and sets up a slew of metadata all saved in doc_obj, which
906	# it then returns as part of a tuple (process_status,doc_obj)
907	#
908	# Much of this functionality used to reside in read, but it was broken
909	# down into a supporting routine to make the code more flexible.
910	#
911	# recursive plugins (e.g. RecPlug) and specialized plugins like those
912	# capable of processing many documents within a single file (e.g.
913	# GMLPlug) will normally want to implement their own version of
914	# read_into_doc_obj()
915	#
916	# Note that $base_dir might be "" and that $file might
917	# include directories
918	sub read_into_doc_obj {
919	my $self = shift (@_);
920	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
921
922	if ($self->is_recursive()) {
923	gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
924	}
925
926	my $outhandle = $self->{'outhandle'};
927
928	my ($block_status,$filename) = $self->read_block(@_);
929	return $block_status if ((!defined $block_status) \|\| ($block_status==0));
930	$file = $self->read_tidy_file($file);
931
932	# Do encoding stuff
933	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
934	if ($self->{'verbosity'} > 2) {
935	print $outhandle "BasPlug: reading $file as ($encoding,$language)\n";
936	}
937
938	# create a new document
939	my $doc_obj = new doc ($filename, "indexed_doc");
940	my $top_section = $doc_obj->get_top_section();
941
942	$doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
943	$doc_obj->add_utf8_metadata($top_section, "Language", $language);
944	$doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
945	$doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
946	$doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename));
947
948	my $filemeta = $self->filename_to_metadata($file,$encoding);
949	$doc_obj->add_utf8_metadata($top_section, "Source", $filemeta);
950	if ($self->{'cover_image'}) {
951	$self->associate_cover_image($doc_obj, $filename);
952	}
953
954	# read in file ($text will be in utf8)
955	my $text = "";
956	$self->read_file ($filename, $encoding, $language, \$text);
957
958	if (!length ($text)) {
959	my $plugin_name = ref ($self);
960	if ($gli) {
961	print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
962	}
963	gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
964
965	my $failhandle = $self->{'failhandle'};
966	gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
967	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
968	$self->{'num_not_processed'} ++;
969
970	return (0,undef); # what should we return here?? error but don't want to pass it on
971	}
972
973	# include any metadata passed in from previous plugins
974	# note that this metadata is associated with the top level section
975
976	my $associate_tail_re = $self->{'associate_tail_re'};
977
978	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
979
980	# do plugin specific processing of doc_obj
981	unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
982	$text = '';
983	undef $text;
984	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
985	return (-1,undef);
986	}
987	$text='';
988	undef $text;
989
990	# do any automatic metadata extraction
991	$self->auto_extract_metadata ($doc_obj);
992
993	$self->add_OID($doc_obj);
994
995	return (1,$doc_obj);
996	}
997
998
999	# The BasPlug read() function. This function calls read_into_doc_obj()
1000	# to ensure all the right things to make general options work for a
1001	# given plugin are done. It then calls the process() function which
1002	# does all the work specific to a plugin (like the old read functions
1003	# used to do). Most plugins should define their own process() function
1004	# and let this read() function keep control.
1005	#
1006	# recursive plugins (e.g. RecPlug) and specialized plugins like those
1007	# capable of processing many documents within a single file (e.g.
1008	# GMLPlug) might want to implement their own version of read(), but
1009	# more likely need to implement their own version of read_into_doc_obj()
1010	#
1011	# Return number of files processed, undef if can't recognise, -1 if can't
1012	# process
1013
1014	sub read {
1015	my $self = shift (@_);
1016	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1017
1018	my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1019
1020	if ((defined $process_status) && ($process_status == 1)) {
1021	# process the document
1022	$processor->process($doc_obj);
1023
1024	if(defined($self->{'places_filename'})){
1025	&util::rm($self->{'places_filename'});
1026	$self->{'places_filename'} = undef;
1027	}
1028
1029	$self->{'num_processed'} ++;
1030	undef $doc_obj;
1031	}
1032
1033	# if process_status == 1, then the file has been processed.
1034	return $process_status;
1035
1036	}
1037
1038	# returns undef if file is rejected by the plugin
1039	sub process {
1040	my $self = shift (@_);
1041	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1042
1043	gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
1044	# die "Basplug::process function must be implemented in sub-class\n";
1045
1046	return undef; # never gets here
1047	}
1048
1049	# uses the multiread package to read in the entire file pointed to
1050	# by filename and loads the resulting text into $$textref. Input text
1051	# may be in any of the encodings handled by multiread, output text
1052	# will be in utf8
1053	sub read_file {
1054	my $self = shift (@_);
1055	my ($filename, $encoding, $language, $textref) = @_;
1056
1057	if (!-r $filename)
1058	{
1059	my $outhandle = $self->{'outhandle'};
1060	gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
1061	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
1062	return;
1063	}
1064	$$textref = "";
1065	if (!open (FILE, $filename)) {
1066	gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
1067	die "\n";
1068	}
1069
1070	if ($encoding eq "ascii") {
1071	undef $/;
1072	$$textref = <FILE>;
1073	$/ = "\n";
1074	} else {
1075	my $reader = new multiread();
1076	$reader->set_handle ('BasPlug::FILE');
1077	$reader->set_encoding ($encoding);
1078	$reader->read_file ($textref);
1079	#Now segments chinese if the separate_cjk option is set
1080	if ($self->{'separate_cjk'}) {
1081	# segment the Chinese words
1082	$$textref = &cnseg::segment($$textref);
1083	}
1084	}
1085	close FILE;
1086	}
1087
1088	# write_file -- used by ConvertToPlug, for example in post processing
1089	#
1090	sub utf8_write_file {
1091	my $self = shift (@_);
1092	my ($textref, $filename) = @_;
1093
1094	if (!open (FILE, ">$filename")) {
1095	gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1096	die "\n";
1097	}
1098	print FILE $$textref;
1099
1100	close FILE;
1101	}
1102
1103
1104	sub filename_based_title
1105	{
1106	my $self = shift (@_);
1107	my ($file) = @_;
1108
1109	my $file_derived_title = $file;
1110	$file_derived_title =~ s/_/ /g;
1111	$file_derived_title =~ s/\..*?$//;
1112
1113	return $file_derived_title;
1114	}
1115
1116
1117	sub title_fallback
1118	{
1119	my $self = shift (@_);
1120	my ($doc_obj,$section,$file) = @_;
1121
1122	if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1123
1124	my $file_derived_title = $self->filename_based_title($file);
1125
1126	$doc_obj->add_utf8_metadata ($section, "Title", $self->filename_to_metadata($file_derived_title));
1127	}
1128	}
1129
1130	sub textcat_get_language_encoding {
1131	my $self = shift (@_);
1132	my ($filename) = @_;
1133
1134
1135	my ($language, $encoding, $extracted_encoding);
1136	if ($self->{'input_encoding'} eq "auto") {
1137	# use textcat to automatically work out the input encoding and language
1138	($language, $encoding) = $self->get_language_encoding ($filename);
1139	} elsif ($self->{'extract_language'}) {
1140	# use textcat to get language metadata
1141	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
1142	$encoding = $self->{'input_encoding'};
1143	# don't print this message for english... english in utf8 is identical
1144	# to english in iso-8859-1 (except for some punctuation). We don't have
1145	# a language model for en_utf8, so textcat always says iso-8859-1!
1146	if ($extracted_encoding ne $encoding && $language ne "en"
1147	&& $self->{'verbosity'}) {
1148	my $plugin_name = ref ($self);
1149	my $outhandle = $self->{'outhandle'};
1150	gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
1151	}
1152	} else {
1153	$language = $self->{'default_language'};
1154	$encoding = $self->{'input_encoding'};
1155	}
1156
1157	return ($language, $encoding);
1158	}
1159
1160	# Uses textcat to work out the encoding and language of the text in
1161	# $filename. All html tags are removed before processing.
1162	# returns an array containing "language" and "encoding"
1163	sub get_language_encoding {
1164	my $self = shift (@_);
1165	my ($filename) = @_;
1166	my $outhandle = $self->{'outhandle'};
1167	my $unicode_format = "";
1168	my $best_language = "";
1169	my $best_encoding = "";
1170
1171	# read in file
1172	if (!open (FILE, $filename)) {
1173	gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
1174	# this is a pretty bad error, but try to continue anyway
1175	return ($self->{'default_language'}, $self->{'input_encoding'});
1176	}
1177	undef $/;
1178	my $text = <FILE>;
1179	$/ = "\n";
1180	close FILE;
1181
1182	# check if first few bytes have a Byte Order Marker
1183	my $bom=substr($text,0,2); # check 16bit unicode
1184	if ($bom eq "\xff\xfe") { # little endian 16bit unicode
1185	$unicode_format="unicode";
1186	} elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
1187	$unicode_format="unicode";
1188	} else {
1189	$bom=substr($text,0,3); # check utf-8
1190	if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
1191	$unicode_format="utf8";
1192	# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
1193	# $unicode_format="utf8";
1194	}
1195	}
1196
1197
1198	# handle html files specially
1199	# XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
1200	if (ref($self) eq 'HTMLPlug' \|\|
1201	(exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
1202
1203	# remove <title>stuff</title> -- as titles tend often to be in English
1204	# for foreign language documents
1205	$text =~ s!<title>.*?</title>!!si;
1206
1207	# see if this html file specifies its encoding
1208	if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
1209	$best_encoding = $1;
1210	} elsif ($text =~ /<meta http-equiv.content-type.charset=(.+?)"/i) {#"
1211	$best_encoding = $1;
1212	}
1213	if ($best_encoding) { # we extracted an encoding
1214	$best_encoding =~ s/-+/_/g;
1215	$best_encoding = lc($best_encoding); # lowercase
1216	if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
1217	$self->{'input_encoding'} = $best_encoding;
1218	}
1219
1220	# remove all HTML tags
1221	$text =~ s/<[^>]*>//sg;
1222	}
1223
1224	# get the language/encoding
1225	$self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
1226	my $results = $self->{'textcat'}->classify(\$text);
1227
1228	# if textcat returns 3 or less possibilities we'll use the
1229	# first one in the list - otherwise use the defaults
1230	if (scalar @$results > 3) {
1231	if ($unicode_format) { # in case the first had a BOM
1232	$best_encoding=$unicode_format;
1233	} else {
1234	my %guessed_encodings = ();
1235	foreach my $result (@$results) {
1236	$result =~ /([^\-]+)$/;
1237	my $enc=$1;
1238	if (!defined($guessed_encodings{$enc})) {
1239	$guessed_encodings{$enc}=0;
1240	}
1241	$guessed_encodings{$enc}++;
1242	}
1243
1244	$guessed_encodings{""}=-1; # for default best_encoding of ""
1245	foreach my $enc (keys %guessed_encodings) {
1246	if ($guessed_encodings{$enc} >
1247	$guessed_encodings{$best_encoding}){
1248	$best_encoding=$enc;
1249	}
1250	}
1251	}
1252
1253	if ($self->{'input_encoding'} ne 'auto') {
1254	if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
1255	gsprintf($outhandle,
1256	"BasPlug: {BasPlug.could_not_extract_language}\n",
1257	$filename, $self->{'default_language'});
1258	}
1259	$best_language = $self->{'default_language'};
1260	$best_encoding = $self->{'input_encoding'};
1261
1262	} else {
1263	if ($self->{'verbosity'}>2) {
1264	gsprintf($outhandle,
1265	"BasPlug: {BasPlug.could_not_extract_language}\n",
1266	$filename, $self->{'default_language'});
1267	}
1268	$best_language = $self->{'default_language'};
1269	}
1270	} else { # <= 3 suggestions
1271	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
1272	if (!defined $language) {
1273	if ($self->{'verbosity'}>2) {
1274	gsprintf($outhandle,
1275	"BasPlug: {BasPlug.could_not_extract_language}\n",
1276	$filename, $self->{'default_language'});
1277	}
1278	$language = $self->{'default_language'};
1279	}
1280	if (!defined $encoding) {
1281	if ($self->{'verbosity'}>2) {
1282	gsprintf($outhandle,
1283	"BasPlug: {BasPlug.could_not_extract_encoding}\n",
1284	$filename, $self->{'default_encoding'});
1285	}
1286	$encoding = $self->{'default_encoding'};
1287	}
1288	$best_language = $language;
1289	if (! $best_encoding ) { # may already be set... eg from html meta tag
1290	$best_encoding = $encoding;
1291	}
1292	}
1293
1294	my $text_copy = $text;
1295	if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) {
1296	# the text is valid utf8, so assume that's the real encoding
1297	# (since textcat is based on probabilities)
1298	$best_encoding = 'utf8';
1299	}
1300
1301	# check for equivalents where textcat doesn't have some encodings...
1302	# eg MS versions of standard encodings
1303	if ($best_encoding =~ /^iso_8859_(\d+)/) {
1304	my $iso = $1; # which variant of the iso standard?
1305	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
1306	if ($text =~ /[\x80-\x9f]/) {
1307	# Western Europe
1308	if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
1309	elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
1310	elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
1311	elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
1312	elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
1313	elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
1314	elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
1315	}
1316	}
1317
1318	if ($best_encoding !~ /^(ascii\|utf8\|unicode)$/ &&
1319	!defined $encodings::encodings->{$best_encoding}) {
1320	if ($self->{'verbosity'}) {
1321	gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
1322	$filename, $best_encoding, $self->{'default_encoding'});
1323	}
1324	$best_encoding = $self->{'default_encoding'};
1325	}
1326
1327	return ($best_language, $best_encoding);
1328	}
1329
1330	# add any extra metadata that's been passed around from one
1331	# plugin to another.
1332	# extra_metadata uses add_utf8_metadata so it expects metadata values
1333	# to already be in utf8
1334	sub extra_metadata {
1335	my $self = shift (@_);
1336	my ($doc_obj, $cursection, $metadata) = @_;
1337
1338	my $associate_tail_re = $self->{'associate_tail_re'};
1339
1340	foreach my $field (keys(%$metadata)) {
1341	# $metadata->{$field} may be an array reference
1342	if ($field eq "gsdlassocfile_tobe") {
1343	# 'gsdlassocfile_tobe' is artificially introduced metadata
1344	# that is used to signal that certain additional files should
1345	# be tied to this document. Useful in situations where a
1346	# metadata pass in the plugin pipeline works out some files
1347	# need to be associated with a document, but the document hasn't
1348	# been formed yet.
1349
1350	my $equiv_form = "";
1351	foreach my $gaf (@{$metadata->{$field}}) {
1352	my ($full_filename,$mimetype) = ($gaf =~ m/^(.):(.):$/);
1353	my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1354	my $filename = $full_filename;
1355
1356	$doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1357
1358	# work out extended tail extension (i.e. matching tail re)
1359
1360	my ($file_prefix,$file_extended_ext)
1361	= $self->root_ext_split($tail_filename,$associate_tail_re);
1362	my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.)\..$/);
1363
1364	my ($doc_ext) = ($tail_filename =~ m/^.\.(.)$/);
1365	my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
1366	my $srcicon = "_icon".$doc_ext."_";
1367	my $end_doclink = "</a>";
1368
1369	my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1370
1371	if (defined $pre_doc_ext) {
1372	# for metadata such as [mp3._edited] [mp3._full] ...
1373	$doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1374	}
1375
1376	# for multiple metadata such as [mp3.assoclink]
1377	$doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1378
1379	$equiv_form .= " $assoc_form";
1380	}
1381	$doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1382	}
1383	elsif (ref ($metadata->{$field}) eq "ARRAY") {
1384	map {
1385	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
1386	} @{$metadata->{$field}};
1387	} else {
1388	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1389	}
1390	}
1391	}
1392
1393	# initialise metadata extractors
1394	sub initialise_extractors {
1395	my $self = shift (@_);
1396
1397	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
1398	&acronym::initialise_acronyms();
1399	}
1400	}
1401
1402	# finalise metadata extractors
1403	sub finalise_extractors {
1404	my $self = shift (@_);
1405
1406	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
1407	&acronym::finalise_acronyms();
1408	}
1409	}
1410
1411	# FIRSTNNN: extract the first NNN characters as metadata
1412	sub extract_first_NNNN_characters {
1413	my $self = shift (@_);
1414	my ($textref, $doc_obj, $thissection) = @_;
1415
1416	foreach my $size (split /,/, $self->{'first'}) {
1417	my $tmptext = $$textref;
1418	$tmptext =~ s/^\s+//;
1419	$tmptext =~ s/\s+$//;
1420	$tmptext =~ s/\s+/ /gs;
1421	$tmptext = substr ($tmptext, 0, $size);
1422	$tmptext =~ s/\s\S*$/…/;
1423	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1424	}
1425	}
1426
1427	sub extract_email {
1428	my $self = shift (@_);
1429	my ($textref, $doc_obj, $thissection) = @_;
1430	my $outhandle = $self->{'outhandle'};
1431
1432	gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
1433	if ($self->{'verbosity'} > 2);
1434
1435	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
1436	@email = sort @email;
1437
1438	# if($self->{"new_extract_email"} == 0)
1439	# {
1440	# my @email2 = ();
1441	# foreach my $address (@email)
1442	# {
1443	# if (!(join(" ",@email2) =~ m/(^\| )$address( \|$)/ ))
1444	# {
1445	# push @email2, $address;
1446	# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1447	# # print $outhandle " extracting $address\n"
1448	# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1449	# if ($self->{'verbosity'} > 3);
1450	# }
1451	# }
1452	# }
1453	# else
1454	# {
1455	my $hashExistMail = {};
1456	foreach my $address (@email) {
1457	if (!(defined $hashExistMail->{$address}))
1458	{
1459	$hashExistMail->{$address} = 1;
1460	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1461	gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1462	if ($self->{'verbosity'} > 3);
1463	}
1464	}
1465	gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
1466	if ($self->{'verbosity'} > 2);
1467	}
1468
1469	# extract metadata
1470	sub auto_extract_metadata {
1471
1472	my $self = shift (@_);
1473	my ($doc_obj) = @_;
1474
1475	if ($self->{'extract_email'}) {
1476	my $thissection = $doc_obj->get_top_section();
1477	while (defined $thissection) {
1478	my $text = $doc_obj->get_text($thissection);
1479	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1480	$thissection = $doc_obj->get_next_section ($thissection);
1481	}
1482	}
1483	if ($self->{'extract_placenames'}) {
1484	my $thissection = $doc_obj->get_top_section();
1485	while (defined $thissection) {
1486	my $text = $doc_obj->get_text($thissection);
1487	$self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1488	$thissection = $doc_obj->get_next_section ($thissection);
1489	}
1490	}
1491
1492	if ($self->{'extract_keyphrases'} \|\| $self->{'extract_keyphrases_kea4'}) {
1493	$self->extract_keyphrases($doc_obj);
1494	}
1495
1496	if ($self->{'first'}) {
1497	my $thissection = $doc_obj->get_top_section();
1498	while (defined $thissection) {
1499	my $text = $doc_obj->get_text($thissection);
1500	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1501	$thissection = $doc_obj->get_next_section ($thissection);
1502	}
1503	}
1504
1505	if ($self->{'extract_acronyms'}) {
1506	my $thissection = $doc_obj->get_top_section();
1507	while (defined $thissection) {
1508	my $text = $doc_obj->get_text($thissection);
1509	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1510	$thissection = $doc_obj->get_next_section ($thissection);
1511	}
1512	}
1513
1514	if ($self->{'markup_acronyms'}) {
1515	my $thissection = $doc_obj->get_top_section();
1516	while (defined $thissection) {
1517	my $text = $doc_obj->get_text($thissection);
1518	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1519	$doc_obj->delete_text($thissection);
1520	$doc_obj->add_text($thissection, $text);
1521	$thissection = $doc_obj->get_next_section ($thissection);
1522	}
1523	}
1524
1525	if($self->{'extract_historical_years'}) {
1526	my $thissection = $doc_obj->get_top_section();
1527	while (defined $thissection) {
1528
1529	my $text = $doc_obj->get_text($thissection);
1530	&DateExtract::get_date_metadata($text, $doc_obj,
1531	$thissection,
1532	$self->{'no_bibliography'},
1533	$self->{'maximum_year'},
1534	$self->{'maximum_century'});
1535	$thissection = $doc_obj->get_next_section ($thissection);
1536	}
1537	}
1538	}
1539
1540
1541	#adding kea keyphrases
1542	sub extract_keyphrases
1543	{
1544	my $self = shift(@_);
1545	my $doc_obj = shift(@_);
1546
1547	# Use Kea 3.0 unless 4.0 has been specified
1548	my $kea_version = "3.0";
1549	if ($self->{'extract_keyphrases_kea4'}) {
1550	$kea_version = "4.0";
1551	}
1552
1553	# Check that Kea exists, and tell the user where to get it if not
1554	my $keahome = &Kea::get_Kea_directory($kea_version);
1555	if (!-e $keahome) {
1556	gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version);
1557	return;
1558	}
1559
1560	my $thissection = $doc_obj->get_top_section();
1561	my $text = "";
1562	my $list;
1563
1564	#loop through sections to gather whole doc
1565	while (defined $thissection) {
1566	my $sectiontext = $doc_obj->get_text($thissection);
1567	$text = $text.$sectiontext;
1568	$thissection = $doc_obj->get_next_section ($thissection);
1569	}
1570
1571	if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
1572	$list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});
1573	} else { #otherwise call Kea with no options
1574	$list = &Kea::extract_KeyPhrases ($kea_version, $text);
1575	}
1576
1577	if ($list){
1578	# if a list of kea keyphrases was returned (ie not empty)
1579	if ($self->{'verbosity'}) {
1580	gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1581	}
1582
1583	#add metadata to top section
1584	$thissection = $doc_obj->get_top_section();
1585
1586	# add all key phrases as one metadata
1587	$doc_obj->add_metadata($thissection, "Keyphrases", $list);
1588
1589	# add individual key phrases as multiple metadata
1590	foreach my $keyphrase (split(',', $list)) {
1591	$keyphrase =~ s/^\s+\|\s+$//g;
1592	$doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1593	}
1594	}
1595	}
1596
1597
1598	# extract acronyms from a section in a document. progress is
1599	# reported to outhandle based on the verbosity. both the Acronym
1600	# and the AcronymKWIC metadata items are created.
1601
1602	sub extract_acronyms {
1603	my $self = shift (@_);
1604	my ($textref, $doc_obj, $thissection) = @_;
1605	my $outhandle = $self->{'outhandle'};
1606
1607	# print $outhandle " extracting acronyms ...\n"
1608	gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
1609	if ($self->{'verbosity'} > 2);
1610
1611	my $acro_array = &acronym::acronyms($textref);
1612
1613	foreach my $acro (@$acro_array) {
1614
1615	#check that this is the first time ...
1616	my $seen_before = "false";
1617	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1618	foreach my $thisAcro (@$previous_data) {
1619	if ($thisAcro eq $acro->to_string()) {
1620	$seen_before = "true";
1621	if ($self->{'verbosity'} >= 4) {
1622	gsprintf($outhandle, " {BasPlug.already_seen} " .
1623	$acro->to_string() . "\n");
1624	}
1625	}
1626	}
1627
1628	if ($seen_before eq "false") {
1629	#write it to the file ...
1630	$acro->write_to_file();
1631
1632	#do the normal acronym
1633	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
1634	gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
1635	if ($self->{'verbosity'} > 3);
1636	}
1637	}
1638
1639	gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
1640	if ($self->{'verbosity'} > 2);
1641	}
1642
1643	sub markup_acronyms {
1644	my $self = shift (@_);
1645	my ($text, $doc_obj, $thissection) = @_;
1646	my $outhandle = $self->{'outhandle'};
1647
1648	gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
1649	if ($self->{'verbosity'} > 2);
1650
1651	#self is passed in to check for verbosity ...
1652	$text = &acronym::markup_acronyms($text, $self);
1653
1654	gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
1655	if ($self->{'verbosity'} > 2);
1656
1657	return $text;
1658	}
1659
1660	sub compile_stats {
1661	my $self = shift(@_);
1662	my ($stats) = @_;
1663
1664	$stats->{'num_processed'} += $self->{'num_processed'};
1665	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
1666	$stats->{'num_archives'} += $self->{'num_archives'};
1667
1668	}
1669
1670	sub associate_cover_image {
1671	my $self = shift;
1672	my ($doc_obj, $filename) = @_;
1673
1674	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1675	if (exists $self->{'covers_missing_cache'}->{$filename}) {
1676	# don't stat() for existence eg for multiple document input files
1677	# (eg SplitPlug)
1678	return;
1679	}
1680
1681	my $top_section=$doc_obj->get_top_section();
1682
1683	if (-e $filename) {
1684	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1685	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1686	} else {
1687	my $upper_filename = $filename;
1688	$upper_filename =~ s/jpg$/JPG/;
1689	if (-e $upper_filename) {
1690	$doc_obj->associate_file($upper_filename, "cover.jpg",
1691	"image/jpeg");
1692	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1693	} else {
1694	# file doesn't exist, so record the fact that it's missing so
1695	# we don't stat() again (stat is slow)
1696	$self->{'covers_missing_cache'}->{$filename} = 1;
1697	}
1698	}
1699
1700	}
1701
1702
1703	# Overridden by exploding plugins (eg. ISISPlug)
1704	sub clean_up_after_exploding
1705	{
1706	my $self = shift(@_);
1707	}
1708
1709
1710	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: