source: gsdl/trunk/perllib/plugins/BasPlug.pm@ 15018

Last change on this file since 15018 was 15018, checked in by davidb, 14 years ago

Marc mapping upgraded to support richer set of operations, including subfields, multiple fields in one line (separated by comma), and the removal of rules, e.g. -245 at the start of a line. A Marc to Qualified Dublin Core crosswalk from the Library of congress has been added as "etc/marc2qdc.txt". A collection can then choose to, for example, top up the mapping with its own version of the file stored in its local "etc" folder, specifying only the rules that are different. This is where a rule like "-245" might be used to override a more general rule from the main file that has all subfields in 245 mapping to one metadata item (Title). If the user specifies a different different filename -- through a plugin option -- then they are free to divise a mapping from scratch and store it in the collections local "etc" folder.

  • Property svn:keywords set to Author Date Id Revision
File size: 49.4 KB
Line 
1###########################################################################
2#
3# BasPlug.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2005 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package BasPlug;
27
28BEGIN {
29 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30}
31
32eval {require bytes};
33
34# suppress the annoying "subroutine redefined" warning that various
35# plugins cause under perl 5.6
36$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38use strict;
39no strict 'subs';
40no strict 'refs'; # allow filehandles to be variables and viceversa
41
42use File::Basename;
43
44use Kea;
45use multiread;
46use encodings;
47use unicode;
48use cnseg;
49use acronym;
50use textcat;
51use doc;
52eval "require diagnostics"; # some perl distros (eg mac) don't have this
53use DateExtract;
54use ghtml;
55use gsprintf 'gsprintf';
56use printusage;
57use parse2;
58
59
60use GISBasPlug;
61
62@BasPlug::ISA = ( GISBasPlug );
63
64my $unicode_list =
65 [ { 'name' => "ascii",
66 'desc' => "{BasPlug.input_encoding.ascii}" },
67 { 'name' => "utf8",
68 'desc' => "{BasPlug.input_encoding.utf8}" },
69 { 'name' => "unicode",
70 'desc' => "{BasPlug.input_encoding.unicode}" } ];
71
72my $auto_unicode_list =
73 [ { 'name' => "auto",
74 'desc' => "{BasPlug.input_encoding.auto}" } ];
75
76my $e = $encodings::encodings;
77foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
78{
79 my $hashEncode =
80 {'name' => $enc,
81 'desc' => $e->{$enc}->{'name'}};
82
83 push(@{$unicode_list},$hashEncode);
84}
85
86push(@{$auto_unicode_list},@{$unicode_list});
87
88my $arguments =
89 [ { 'name' => "process_exp",
90 'desc' => "{BasPlug.process_exp}",
91 'type' => "regexp",
92 'deft' => "",
93 'reqd' => "no" },
94 { 'name' => "block_exp",
95 'desc' => "{BasPlug.block_exp}",
96 'type' => "regexp",
97 'deft' => "",
98 'reqd' => "no" },
99 { 'name' => "smart_block",
100 'desc' => "{BasPlug.smart_block}",
101 'type' => "flag",
102 'reqd' => "no" },
103 { 'name' => "associate_ext",
104 'desc' => "{BasPlug.associate_ext}",
105 'type' => "string",
106 'reqd' => "no" },
107 { 'name' => "associate_tail_re",
108 'desc' => "{BasPlug.associate_tail_re}",
109 'type' => "string",
110 'reqd' => "no" },
111 { 'name' => "use_as_doc_identifier",
112 'desc' => "{BasPlug.use_as_doc_identifier}",
113 'type' => "string",
114 'reqd' => "no" ,
115 'deft' => "" } ,
116 { 'name' => "input_encoding",
117 'desc' => "{BasPlug.input_encoding}",
118 'type' => "enum",
119 'list' => $auto_unicode_list,
120 'reqd' => "no" ,
121 'deft' => "auto" } ,
122 { 'name' => "default_encoding",
123 'desc' => "{BasPlug.default_encoding}",
124 'type' => "enum",
125 'list' => $unicode_list,
126 'reqd' => "no",
127 'deft' => "utf8" },
128 { 'name' => "extract_language",
129 'desc' => "{BasPlug.extract_language}",
130 'type' => "flag",
131 'reqd' => "no" },
132 { 'name' => "default_language",
133 'desc' => "{BasPlug.default_language}",
134 'type' => "string",
135 'deft' => "en",
136 'reqd' => "no" },
137 { 'name' => "extract_acronyms",
138 'desc' => "{BasPlug.extract_acronyms}",
139 'type' => "flag",
140 'reqd' => "no" },
141 { 'name' => "markup_acronyms",
142 'desc' => "{BasPlug.markup_acronyms}",
143 'type' => "flag",
144 'reqd' => "no" },
145 { 'name' => "extract_keyphrases",
146 'desc' => "{BasPlug.extract_keyphrases}",
147 'type' => "flag",
148 'reqd' => "no" },
149 { 'name' => "extract_keyphrases_kea4",
150 'desc' => "{BasPlug.extract_keyphrases_kea4}",
151 'type' => "flag",
152 'reqd' => "no" },
153 { 'name' => "extract_keyphrase_options",
154 'desc' => "{BasPlug.extract_keyphrase_options}",
155 'type' => "string",
156 'deft' => "",
157 'reqd' => "no" },
158 { 'name' => "first",
159 'desc' => "{BasPlug.first}",
160 'type' => "string",
161 'reqd' => "no" },
162 { 'name' => "extract_email",
163 'desc' => "{BasPlug.extract_email}",
164 'type' => "flag",
165 'reqd' => "no" },
166 { 'name' => "extract_historical_years",
167 'desc' => "{BasPlug.extract_historical_years}",
168 'type' => "flag",
169 'reqd' => "no" },
170 { 'name' => "maximum_year",
171 'desc' => "{BasPlug.maximum_year}",
172 'type' => "int",
173 'deft' => (localtime)[5]+1900,
174 'char_length' => "4",
175 #'range' => "2,100",
176 'reqd' => "no"},
177 { 'name' => "maximum_century",
178 'desc' => "{BasPlug.maximum_century}",
179 'type' => "string",
180 'deft' => "-1",
181 'reqd' => "no" },
182 { 'name' => "no_bibliography",
183 'desc' => "{BasPlug.no_bibliography}",
184 'type' => "flag",
185 'reqd' => "no"},
186 { 'name' => "no_cover_image",
187 'desc' => "{BasPlug.no_cover_image}",
188 'type' => "flag",
189 'reqd' => "no" },
190 { 'name' => "separate_cjk",
191 'desc' => "{BasPlug.separate_cjk}",
192 'type' => "flag",
193 'reqd' => "no",
194 'hiddengli' => "yes" },
195 { 'name' => "new_extract_email",
196 'desc' => "",
197 'type' => "flag",
198 'reqd' => "no",
199 'hiddengli' => "yes" } ];
200
201my $gis_arguments =
202 [ { 'name' => "extract_placenames",
203 'desc' => "{GISBasPlug.extract_placenames}",
204 'type' => "flag",
205 'reqd' => "no" },
206 { 'name' => "gazetteer",
207 'desc' => "{GISBasPlug.gazetteer}",
208 'type' => "string",
209 'reqd' => "no" },
210 { 'name' => "place_list",
211 'desc' => "{GISBasPlug.place_list}",
212 'type' => "flag",
213 'reqd' => "no" } ];
214
215
216my $options = { 'name' => "BasPlug",
217 'desc' => "{BasPlug.desc}",
218 'abstract' => "yes",
219 'inherits' => "no",
220 'args' => $arguments };
221
222
223sub set_incremental {
224 my $self = shift(@_);
225 my ($incremental) = @_;
226
227 $self->{'incremental'} = $incremental;
228}
229
230sub get_arguments
231{
232 my $self = shift(@_);
233 my $optionlistref = $self->{'option_list'};
234 my @optionlist = @$optionlistref;
235 my $pluginoptions = pop(@$optionlistref);
236 my $pluginarguments = $pluginoptions->{'args'};
237 return $pluginarguments;
238}
239
240
241sub print_xml_usage
242{
243 my $self = shift(@_);
244 my $header = shift(@_);
245 my $high_level_information_only = shift(@_);
246
247 # XML output is always in UTF-8
248 gsprintf::output_strings_in_UTF8;
249
250 if ($header) {
251 &PrintUsage::print_xml_header("plugin");
252 }
253 $self->print_xml($high_level_information_only);
254}
255
256
257sub print_xml
258{
259 my $self = shift(@_);
260 my $high_level_information_only = shift(@_);
261
262 my $optionlistref = $self->{'option_list'};
263 my @optionlist = @$optionlistref;
264 my $pluginoptions = shift(@$optionlistref);
265 return if (!defined($pluginoptions));
266
267 # Find the process and block default expressions in the plugin arguments
268 my $process_exp = "";
269 my $block_exp = "";
270 if (defined($pluginoptions->{'args'})) {
271 foreach my $option (@{$pluginoptions->{'args'}}) {
272 if ($option->{'name'} eq "process_exp") {
273 $process_exp = $option->{'deft'};
274 }
275 if ($option->{'name'} eq "block_exp") {
276 $block_exp = $option->{'deft'};
277 }
278 }
279 }
280
281 gsprintf(STDERR, "<PlugInfo>\n");
282 gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
283 my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
284 $desc =~ s/</&amp;lt;/g; # doubly escaped
285 $desc =~ s/>/&amp;gt;/g;
286 gsprintf(STDERR, " <Desc>$desc</Desc>\n");
287 gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
288 gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
289 gsprintf(STDERR, " <Processes>$process_exp</Processes>\n");
290 gsprintf(STDERR, " <Blocks>$block_exp</Blocks>\n");
291 gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
292 unless (defined($high_level_information_only)) {
293 gsprintf(STDERR, " <Arguments>\n");
294 if (defined($pluginoptions->{'args'})) {
295 &PrintUsage::print_options_xml($pluginoptions->{'args'});
296 }
297 gsprintf(STDERR, " </Arguments>\n");
298
299 # Recurse up the plugin hierarchy
300 $self->print_xml();
301 }
302 gsprintf(STDERR, "</PlugInfo>\n");
303}
304
305
306sub print_txt_usage
307{
308 my $self = shift(@_);
309 # Print the usage message for a plugin (recursively)
310 my $descoffset = $self->determine_description_offset(0);
311 $self->print_plugin_usage($descoffset, 1);
312}
313
314
315sub determine_description_offset
316{
317 my $self = shift(@_);
318 my $maxoffset = shift(@_);
319
320 my $optionlistref = $self->{'option_list'};
321 my @optionlist = @$optionlistref;
322 my $pluginoptions = shift(@$optionlistref);
323 return $maxoffset if (!defined($pluginoptions));
324
325 # Find the length of the longest option string of this plugin
326 my $pluginargs = $pluginoptions->{'args'};
327 if (defined($pluginargs)) {
328 my $longest = &PrintUsage::find_longest_option_string($pluginargs);
329 if ($longest > $maxoffset) {
330 $maxoffset = $longest;
331 }
332 }
333
334 # Recurse up the plugin hierarchy
335 $maxoffset = $self->determine_description_offset($maxoffset);
336 $self->{'option_list'} = \@optionlist;
337 return $maxoffset;
338}
339
340
341sub print_plugin_usage
342{
343 my $self = shift(@_);
344 my $descoffset = shift(@_);
345 my $isleafclass = shift(@_);
346
347 my $optionlistref = $self->{'option_list'};
348 my @optionlist = @$optionlistref;
349 my $pluginoptions = shift(@$optionlistref);
350 return if (!defined($pluginoptions));
351
352 my $pluginname = $pluginoptions->{'name'};
353 my $pluginargs = $pluginoptions->{'args'};
354 my $plugindesc = $pluginoptions->{'desc'};
355
356 # Produce the usage information using the data structure above
357 if ($isleafclass) {
358 if (defined($plugindesc)) {
359 gsprintf(STDERR, "$plugindesc\n\n");
360 }
361 gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
362 }
363
364 # Display the plugin options, if there are some
365 if (defined($pluginargs)) {
366 # Calculate the column offset of the option descriptions
367 my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
368
369 if ($isleafclass) {
370 gsprintf(STDERR, " {common.specific_options}:\n");
371 }
372 else {
373 gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
374 }
375
376 # Display the plugin options
377 &PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
378 }
379
380 # Recurse up the plugin hierarchy
381 $self->print_plugin_usage($descoffset, 0);
382 $self->{'option_list'} = \@optionlist;
383}
384
385
386sub new {
387 # Set Encodings to the list!!
388
389
390 # Start the BasPlug Constructor
391 my $class = shift (@_);
392 my ($pluginlist,$args,$hashArgOptLists) = @_;
393 push(@$pluginlist, $class);
394 my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
395
396 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
397 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
398
399 if (GISBasPlug::has_mapdata()) {
400 push(@$arguments,@$gis_arguments);
401 }
402
403 my $self = {};
404 $self->{'outhandle'} = STDERR;
405 $self->{'option_list'} = $hashArgOptLists->{"OptList"};
406 $self->{"info_only"} = 0;
407
408 # Check if gsdlinfo is in the argument list or not - if it is, don't parse
409 # the args, just return the object.
410 foreach my $strArg (@{$args})
411 {
412 if($strArg eq "-gsdlinfo")
413 {
414 $self->{"info_only"} = 1;
415 return bless $self, $class;
416 }
417 }
418
419 if(parse2::parse($args,$hashArgOptLists->{"ArgList"},$self) == -1)
420 {
421 my $classTempClass = bless $self, $class;
422 print STDERR "<BadPlugin p=$plugin_name>\n";
423 &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
424 $classTempClass->print_txt_usage(""); # Use default resource bundle
425 die "\n";
426 }
427
428
429 delete $self->{"info_only"};
430 # else parsing was successful.
431
432 $self->{'plugin_type'} = $plugin_name;
433 #$self->{'outhandle'} = STDERR;
434 $self->{'num_processed'} = 0;
435 $self->{'num_not_processed'} = 0;
436 $self->{'num_blocked'} = 0;
437 $self->{'num_archives'} = 0;
438 $self->{'cover_image'} = 1; # cover image is on by default
439 $self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
440 #$self->{'option_list'} = $hashArgOptLists->{"OptList"};
441
442 my $associate_ext = $self->{'associate_ext'};
443 if ((defined $associate_ext) && ($associate_ext ne "")) {
444
445 my $associate_tail_re = $self->{'associate_tail_re'};
446 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
447 my $outhandle = $self->{'outhandle'};
448 print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
449 print $outhandle " defaulting to 'associate_tail_re'\n";
450 }
451 else {
452 my @exts = split(/,/,$associate_ext);
453
454 my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
455 my $associate_tail_re = join("|",@exts_bracketed);
456 $self->{'associate_tail_re'} = $associate_tail_re;
457 }
458
459 delete $self->{'associate_ext'};
460 }
461
462 $self->{'shared_fileroot'} = {};
463 $self->{'file_blocks'} = {};
464
465 if ($self->{'extract_placenames'}) {
466
467 my $outhandle = $self->{'outhandle'};
468
469 my $places_ref
470 = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
471
472 if (!defined $places_ref) {
473 print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
474 print $outhandle " No placename extraction will take place.\n";
475 $self->{'extract_placenames'} = undef;
476 }
477 else {
478 $self->{'places'} = $places_ref;
479 }
480 }
481
482 return bless $self, $class;
483
484}
485
486# initialize BasPlug options
487# if init() is overridden in a sub-class, remember to call BasPlug::init()
488sub init {
489 my $self = shift (@_);
490 my ($verbosity, $outhandle, $failhandle) = @_;
491
492 # verbosity is passed through from the processor
493 $self->{'verbosity'} = $verbosity;
494
495 # as are the outhandle and failhandle
496 $self->{'outhandle'} = $outhandle if defined $outhandle;
497 $self->{'failhandle'} = $failhandle;
498
499 # set process_exp and block_exp to defaults unless they were
500 # explicitly set
501
502 if ((!$self->is_recursive()) and
503 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
504
505 $self->{'process_exp'} = $self->get_default_process_exp ();
506 if ($self->{'process_exp'} eq "") {
507 warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
508 }
509 }
510
511 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
512 $self->{'block_exp'} = $self->get_default_block_exp ();
513 }
514
515}
516
517sub begin {
518 my $self = shift (@_);
519 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
520 $self->initialise_extractors();
521}
522
523sub end {
524 # potentially called at the end of each plugin pass
525 # import.pl only has one plugin pass, but buildcol.pl has multiple ones
526
527 my ($self) = @_;
528 $self->finalise_extractors();
529}
530
531sub deinit {
532 # called only once, after all plugin passes have been done
533
534 my ($self) = @_;
535}
536
537# this function should be overridden to return 1
538# in recursive plugins
539sub is_recursive {
540 my $self = shift (@_);
541
542 return 0;
543}
544
545sub get_default_block_exp {
546 my $self = shift (@_);
547
548 return "";
549}
550
551sub get_default_process_exp {
552 my $self = shift (@_);
553
554 return "";
555}
556
557# default implementation is to do nothing.
558sub store_block_files
559{
560 my $self =shift (@_);
561 my ($filename) = @_;
562 return;
563}
564
565#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
566sub block_cover_image
567{
568 my $self =shift;
569 my $filename = shift;
570
571 if ($self->{'cover_image'}) {
572 my $coverfile = $filename;
573 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
574 if (!-e $coverfile) {
575 $coverfile =~ s/jpg$/JPG/;
576 }
577 if (-e $coverfile) {
578 $self->{'file_blocks'}->{$coverfile} = 1;
579 }
580 }
581
582 return;
583}
584
585sub root_ext_split
586{
587 my $self = shift (@_);
588 my ($filename,$tail_re) = @_;
589
590 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
591
592 if ((!defined $file_prefix) || (!defined $file_ext)) {
593 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
594 }
595
596 return ($file_prefix,$file_ext);
597}
598
599sub metadata_read {
600 my $self = shift (@_);
601 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
602 # Keep track of filenames with same root but different extensions
603 # Used to support -associate_ext and the more generalised
604 # -associate_tail_re
605
606 my $associate_tail_re = $self->{'associate_tail_re'};
607 if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
608
609 my ($file_prefix,$file_ext)
610 = $self->root_ext_split($file,$associate_tail_re);
611
612 if ((defined $file_prefix) && (defined $file_ext)) {
613
614 my $shared_fileroot = $self->{'shared_fileroot'};
615 if (!defined $shared_fileroot->{$file_prefix}) {
616 my $file_prefix_rec = { 'tie_to' => undef,
617 'exts' => {} };
618 $shared_fileroot->{$file_prefix} = $file_prefix_rec;
619 }
620
621 my $file_prefix_rec = $shared_fileroot->{$file_prefix};
622
623 my $process_exp = $self->{'process_exp'};
624
625 if ($file =~ m/$process_exp/) {
626 # This is the document the others should be tied to
627 $file_prefix_rec->{'tie_to'} = $file_ext;
628 }
629 else {
630 if ($file_ext =~ m/$associate_tail_re$/) {
631 $file_prefix_rec->{'exts'}->{$file_ext} = 1;
632 }
633 }
634
635 }
636 }
637
638 # now check whether we are actually processing this
639 my $filename = $file;
640 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
641 if ($self->{'process_exp'} eq "" || $filename !~ /$self->{'process_exp'}/ || !-f $filename) {
642 return undef; # can't recognise
643 }
644
645 # do smart blocking if appropriate
646 if ($self->{'smart_block'}) {
647 $self->store_block_files($filename);
648 }
649 # block the cover image if there is one
650 if ($self->{'cover_image'}) {
651 $self->block_cover_image($filename);
652 }
653
654 return 1;
655}
656
657sub tie_to_filename
658{
659 my $self = shift (@_);
660
661 my ($file_ext,$file_prefix_rec) = @_;
662
663 if (defined $file_prefix_rec) {
664 my $tie_to = $file_prefix_rec->{'tie_to'};
665
666 if (defined $tie_to) {
667 if ($tie_to eq $file_ext) {
668 return 1;
669 }
670 }
671 }
672
673 return 0;
674}
675
676sub tie_to_assoc_file
677{
678 my $self = shift (@_);
679 my ($file_ext,$file_prefix_rec) = @_;
680
681 if (defined $file_prefix_rec) {
682 my $tie_to = $file_prefix_rec->{'tie_to'};
683 if (defined $tie_to) {
684
685 my $exts = $file_prefix_rec->{'exts'};
686
687 my $has_file_ext = $exts->{$file_ext};
688
689 if ($has_file_ext) {
690 return 1;
691 }
692 }
693 }
694
695 return 0;
696}
697
698
699sub associate_with
700{
701 my $self = shift (@_);
702 my ($file, $filename, $metadata) = @_;
703
704 my $associate_tail_re = $self->{'associate_tail_re'};
705 return 0 if (!$associate_tail_re);
706
707 # If file, see if matches with "tie_to" doc or is one of the
708 # associated filename extensions.
709
710 my ($file_prefix,$file_ext) = $self->root_ext_split($file,$associate_tail_re);
711
712 if ((defined $file_prefix) && (defined $file_ext)) {
713
714 my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
715
716 if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
717
718 # Set up gsdlassocfile_tobe
719
720 my $exts = $file_prefix_rec->{'exts'};
721
722 if (!defined $metadata->{'gsdlassocfile_tobe'}) {
723 $metadata->{'gsdlassocfile_tobe'} = [];
724 }
725
726 my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
727
728 my ($full_prefix) = ($filename =~ m/^(.*)\..*?$/);
729 foreach my $e (keys %$exts) {
730 my $assoc_file = "$full_prefix$e";
731 print STDERR " $self->{'plugin_type'}: Associating $file_prefix$e with $file_prefix_rec->{'tie_to'} version\n";
732 my $mime_type = ""; # let system auto detect this
733 push(@$assoc_tobe,"$assoc_file:$mime_type:");
734 }
735
736 }
737 elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
738
739
740 # a form of smart block
741 return 1;
742 }
743 }
744
745 return 0;
746}
747
748
749sub read_block {
750 my $self = shift (@_);
751
752 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
753
754
755 my $filename = $file;
756 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
757
758 if ($self->associate_with($file,$filename,$metadata)) {
759 # a form of smart block
760 $self->{'num_blocked'} ++;
761 return (0,undef); # blocked
762 }
763
764 my $smart_block = $self->{'smart_block'};
765 my $smart_block_BN = $self->{'smart_block_BN'};
766
767 if ($smart_block || $smart_block_BN) {
768 if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
769 $self->{'num_blocked'} ++;
770 return (0,undef); # blocked
771 }
772 } else {
773 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
774 $self->{'num_blocked'} ++;
775 return (0,undef); # blocked
776 }
777 if ($self->{'cover_image'}) {
778 if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
779 $self->{'num_blocked'} ++;
780 return (0,undef); # blocked
781 }
782 }
783 }
784
785 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
786 return (undef,undef); # can't recognise
787 }
788
789 return (1,$filename);
790}
791
792sub read_tidy_file {
793
794 my $self = shift (@_);
795
796 my ($file) = @_;
797
798 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
799
800 return $file;
801}
802
803
804sub filename_to_metadata
805{
806 my $self = shift (@_);
807 my ($file, $encoding) = @_;
808
809 my $outhandle = $self->{'outhandle'};
810
811 my $filesystem_encoding = undef;
812
813 eval {
814 use POSIX qw(locale_h);
815
816 # With only one parameter, setlocale retrieves the current value
817 my $current_locale = setlocale(LC_CTYPE);
818
819 if ($current_locale =~ m/^.*\.(.*?)$/) {
820 my $char_encoding = lc($1);
821 $char_encoding =~ s/-/_/g;
822 $char_encoding =~ s/^utf_8$/utf8/;
823
824 if ($char_encoding =~ m/^\d+$/) {
825 if (defined $encodings::encoding->{"windows_$char_encoding"}) {
826 $char_encoding = "windows_$char_encoding";
827 }
828 elsif (defined $encodings::encoding->{"dos_$char_encoding"}) {
829 $char_encoding = "dos_$char_encoding";
830 }
831 }
832
833 if (($char_encoding =~ m/(?:ascii|utf8|unicode)/)
834 || (defined $encodings::encoding->{$char_encoding})) {
835 $filesystem_encoding = $char_encoding;
836 }
837 else {
838 print $outhandle "Warning: Unsupported character encoding '$char_encoding' from locale '$current_locale'\n";
839 }
840 }
841
842
843 };
844 if ($@) {
845 print $outhandle "$@\n";
846 print $outhandle "Warning: Unable to establish locale. Will assume filesytem is UTF-8\n";
847
848 }
849
850 my ($filemeta) = $file =~ /([^\\\/]+)$/;
851
852 # how do we know what encoding the filename is in?
853 # => one answer is to check the locale
854
855 if (defined $filesystem_encoding) {
856 if ($filesystem_encoding !~ /(?:ascii|utf8|unicode)/) {
857 $filemeta = unicode::unicode2utf8(
858 unicode::convert2unicode($filesystem_encoding, \$filemeta)
859 );
860 }
861 }
862 # assume it is in the same encoding as its contents
863 elsif ((defined $encoding) && ($encoding !~ /(?:ascii|utf8|unicode)/)) {
864 $filemeta = unicode::unicode2utf8(
865 unicode::convert2unicode($encoding, \$filemeta)
866 );
867 }
868
869 my $dmsafe_filemeta = &ghtml::dmsafe($filemeta);
870
871 return $dmsafe_filemeta;
872}
873
874
875sub add_OID
876{
877 my $self = shift (@_);
878 my ($doc_obj) = @_;
879
880 # See if a metadata field is specified as the field
881 if ((defined $self->{'use_as_doc_identifier'}) && ($self->{'use_as_doc_identifier'} ne "")) {
882 my $metadata_doc_id = $self->{'use_as_doc_identifier'};
883
884 # Consider "tidying" up metadata_doc_id to be something
885 # suitable in a URL
886 # Could even support a user specified plugin RE for this.
887
888 my $top_section = $doc_obj->get_top_section();
889 my $oid = $doc_obj->get_metadata_element($top_section,$metadata_doc_id);
890## print STDERR "**** oid = $oid\n";
891 $doc_obj->set_OID($oid);
892 }
893 # See if there is a plugin-specific set_OID function...
894 elsif (defined ($self->can('set_OID'))) {
895 # it will need $doc_obj to set the Identifier metadata...
896 $self->set_OID(@_); # pass through any extra arguments supplied
897 } else {
898 # use the default set_OID() in doc.pm
899 $doc_obj->set_OID();
900 }
901}
902
903# The BasPlug read_into_doc_obj() function. This function does all the
904# right things to make general options work for a given plugin. It reads in
905# a file and sets up a slew of metadata all saved in doc_obj, which
906# it then returns as part of a tuple (process_status,doc_obj)
907#
908# Much of this functionality used to reside in read, but it was broken
909# down into a supporting routine to make the code more flexible.
910#
911# recursive plugins (e.g. RecPlug) and specialized plugins like those
912# capable of processing many documents within a single file (e.g.
913# GMLPlug) will normally want to implement their own version of
914# read_into_doc_obj()
915#
916# Note that $base_dir might be "" and that $file might
917# include directories
918sub read_into_doc_obj {
919 my $self = shift (@_);
920 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
921
922 if ($self->is_recursive()) {
923 gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
924 }
925
926 my $outhandle = $self->{'outhandle'};
927
928 my ($block_status,$filename) = $self->read_block(@_);
929 return $block_status if ((!defined $block_status) || ($block_status==0));
930 $file = $self->read_tidy_file($file);
931
932 # Do encoding stuff
933 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
934 if ($self->{'verbosity'} > 2) {
935 print $outhandle "BasPlug: reading $file as ($encoding,$language)\n";
936 }
937
938 # create a new document
939 my $doc_obj = new doc ($filename, "indexed_doc");
940 my $top_section = $doc_obj->get_top_section();
941
942 $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
943 $doc_obj->add_utf8_metadata($top_section, "Language", $language);
944 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
945 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
946 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename));
947
948 my $filemeta = $self->filename_to_metadata($file,$encoding);
949 $doc_obj->add_utf8_metadata($top_section, "Source", $filemeta);
950 if ($self->{'cover_image'}) {
951 $self->associate_cover_image($doc_obj, $filename);
952 }
953
954 # read in file ($text will be in utf8)
955 my $text = "";
956 $self->read_file ($filename, $encoding, $language, \$text);
957
958 if (!length ($text)) {
959 my $plugin_name = ref ($self);
960 if ($gli) {
961 print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
962 }
963 gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
964
965 my $failhandle = $self->{'failhandle'};
966 gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
967 # print $failhandle "$file: " . ref($self) . ": file contains no text\n";
968 $self->{'num_not_processed'} ++;
969
970 return (0,undef); # what should we return here?? error but don't want to pass it on
971 }
972
973 # include any metadata passed in from previous plugins
974 # note that this metadata is associated with the top level section
975
976 my $associate_tail_re = $self->{'associate_tail_re'};
977
978 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
979
980 # do plugin specific processing of doc_obj
981 unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
982 $text = '';
983 undef $text;
984 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
985 return (-1,undef);
986 }
987 $text='';
988 undef $text;
989
990 # do any automatic metadata extraction
991 $self->auto_extract_metadata ($doc_obj);
992
993 $self->add_OID($doc_obj);
994
995 return (1,$doc_obj);
996}
997
998
999# The BasPlug read() function. This function calls read_into_doc_obj()
1000# to ensure all the right things to make general options work for a
1001# given plugin are done. It then calls the process() function which
1002# does all the work specific to a plugin (like the old read functions
1003# used to do). Most plugins should define their own process() function
1004# and let this read() function keep control.
1005#
1006# recursive plugins (e.g. RecPlug) and specialized plugins like those
1007# capable of processing many documents within a single file (e.g.
1008# GMLPlug) might want to implement their own version of read(), but
1009# more likely need to implement their own version of read_into_doc_obj()
1010#
1011# Return number of files processed, undef if can't recognise, -1 if can't
1012# process
1013
1014sub read {
1015 my $self = shift (@_);
1016 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
1017
1018 my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
1019
1020 if ((defined $process_status) && ($process_status == 1)) {
1021 # process the document
1022 $processor->process($doc_obj);
1023
1024 if(defined($self->{'places_filename'})){
1025 &util::rm($self->{'places_filename'});
1026 $self->{'places_filename'} = undef;
1027 }
1028
1029 $self->{'num_processed'} ++;
1030 undef $doc_obj;
1031 }
1032
1033 # if process_status == 1, then the file has been processed.
1034 return $process_status;
1035
1036}
1037
1038# returns undef if file is rejected by the plugin
1039sub process {
1040 my $self = shift (@_);
1041 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
1042
1043 gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
1044 # die "Basplug::process function must be implemented in sub-class\n";
1045
1046 return undef; # never gets here
1047}
1048
1049# uses the multiread package to read in the entire file pointed to
1050# by filename and loads the resulting text into $$textref. Input text
1051# may be in any of the encodings handled by multiread, output text
1052# will be in utf8
1053sub read_file {
1054 my $self = shift (@_);
1055 my ($filename, $encoding, $language, $textref) = @_;
1056
1057 if (!-r $filename)
1058 {
1059 my $outhandle = $self->{'outhandle'};
1060 gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
1061 # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
1062 return;
1063 }
1064 $$textref = "";
1065 if (!open (FILE, $filename)) {
1066 gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
1067 die "\n";
1068 }
1069
1070 if ($encoding eq "ascii") {
1071 undef $/;
1072 $$textref = <FILE>;
1073 $/ = "\n";
1074 } else {
1075 my $reader = new multiread();
1076 $reader->set_handle ('BasPlug::FILE');
1077 $reader->set_encoding ($encoding);
1078 $reader->read_file ($textref);
1079 #Now segments chinese if the separate_cjk option is set
1080 if ($self->{'separate_cjk'}) {
1081 # segment the Chinese words
1082 $$textref = &cnseg::segment($$textref);
1083 }
1084 }
1085 close FILE;
1086}
1087
1088# write_file -- used by ConvertToPlug, for example in post processing
1089#
1090sub utf8_write_file {
1091 my $self = shift (@_);
1092 my ($textref, $filename) = @_;
1093
1094 if (!open (FILE, ">$filename")) {
1095 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
1096 die "\n";
1097 }
1098 print FILE $$textref;
1099
1100 close FILE;
1101}
1102
1103
1104sub filename_based_title
1105{
1106 my $self = shift (@_);
1107 my ($file) = @_;
1108
1109 my $file_derived_title = $file;
1110 $file_derived_title =~ s/_/ /g;
1111 $file_derived_title =~ s/\..*?$//;
1112
1113 return $file_derived_title;
1114}
1115
1116
1117sub title_fallback
1118{
1119 my $self = shift (@_);
1120 my ($doc_obj,$section,$file) = @_;
1121
1122 if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1123
1124 my $file_derived_title = $self->filename_based_title($file);
1125
1126 $doc_obj->add_utf8_metadata ($section, "Title", $self->filename_to_metadata($file_derived_title));
1127 }
1128}
1129
1130sub textcat_get_language_encoding {
1131 my $self = shift (@_);
1132 my ($filename) = @_;
1133
1134
1135 my ($language, $encoding, $extracted_encoding);
1136 if ($self->{'input_encoding'} eq "auto") {
1137 # use textcat to automatically work out the input encoding and language
1138 ($language, $encoding) = $self->get_language_encoding ($filename);
1139 } elsif ($self->{'extract_language'}) {
1140 # use textcat to get language metadata
1141 ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
1142 $encoding = $self->{'input_encoding'};
1143 # don't print this message for english... english in utf8 is identical
1144 # to english in iso-8859-1 (except for some punctuation). We don't have
1145 # a language model for en_utf8, so textcat always says iso-8859-1!
1146 if ($extracted_encoding ne $encoding && $language ne "en"
1147 && $self->{'verbosity'}) {
1148 my $plugin_name = ref ($self);
1149 my $outhandle = $self->{'outhandle'};
1150 gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
1151 }
1152 } else {
1153 $language = $self->{'default_language'};
1154 $encoding = $self->{'input_encoding'};
1155 }
1156
1157 return ($language, $encoding);
1158}
1159
1160# Uses textcat to work out the encoding and language of the text in
1161# $filename. All html tags are removed before processing.
1162# returns an array containing "language" and "encoding"
1163sub get_language_encoding {
1164 my $self = shift (@_);
1165 my ($filename) = @_;
1166 my $outhandle = $self->{'outhandle'};
1167 my $unicode_format = "";
1168 my $best_language = "";
1169 my $best_encoding = "";
1170
1171 # read in file
1172 if (!open (FILE, $filename)) {
1173 gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
1174 # this is a pretty bad error, but try to continue anyway
1175 return ($self->{'default_language'}, $self->{'input_encoding'});
1176 }
1177 undef $/;
1178 my $text = <FILE>;
1179 $/ = "\n";
1180 close FILE;
1181
1182 # check if first few bytes have a Byte Order Marker
1183 my $bom=substr($text,0,2); # check 16bit unicode
1184 if ($bom eq "\xff\xfe") { # little endian 16bit unicode
1185 $unicode_format="unicode";
1186 } elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
1187 $unicode_format="unicode";
1188 } else {
1189 $bom=substr($text,0,3); # check utf-8
1190 if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
1191 $unicode_format="utf8";
1192# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
1193# $unicode_format="utf8";
1194 }
1195 }
1196
1197
1198 # handle html files specially
1199 # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
1200 if (ref($self) eq 'HTMLPlug' ||
1201 (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
1202
1203 # remove <title>stuff</title> -- as titles tend often to be in English
1204 # for foreign language documents
1205 $text =~ s!<title>.*?</title>!!si;
1206
1207 # see if this html file specifies its encoding
1208 if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
1209 $best_encoding = $1;
1210 } elsif ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)"/i) {#"
1211 $best_encoding = $1;
1212 }
1213 if ($best_encoding) { # we extracted an encoding
1214 $best_encoding =~ s/-+/_/g;
1215 $best_encoding = lc($best_encoding); # lowercase
1216 if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
1217 $self->{'input_encoding'} = $best_encoding;
1218 }
1219
1220 # remove all HTML tags
1221 $text =~ s/<[^>]*>//sg;
1222 }
1223
1224 # get the language/encoding
1225 $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
1226 my $results = $self->{'textcat'}->classify(\$text);
1227
1228 # if textcat returns 3 or less possibilities we'll use the
1229 # first one in the list - otherwise use the defaults
1230 if (scalar @$results > 3) {
1231 if ($unicode_format) { # in case the first had a BOM
1232 $best_encoding=$unicode_format;
1233 } else {
1234 my %guessed_encodings = ();
1235 foreach my $result (@$results) {
1236 $result =~ /([^\-]+)$/;
1237 my $enc=$1;
1238 if (!defined($guessed_encodings{$enc})) {
1239 $guessed_encodings{$enc}=0;
1240 }
1241 $guessed_encodings{$enc}++;
1242 }
1243
1244 $guessed_encodings{""}=-1; # for default best_encoding of ""
1245 foreach my $enc (keys %guessed_encodings) {
1246 if ($guessed_encodings{$enc} >
1247 $guessed_encodings{$best_encoding}){
1248 $best_encoding=$enc;
1249 }
1250 }
1251 }
1252
1253 if ($self->{'input_encoding'} ne 'auto') {
1254 if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
1255 gsprintf($outhandle,
1256 "BasPlug: {BasPlug.could_not_extract_language}\n",
1257 $filename, $self->{'default_language'});
1258 }
1259 $best_language = $self->{'default_language'};
1260 $best_encoding = $self->{'input_encoding'};
1261
1262 } else {
1263 if ($self->{'verbosity'}>2) {
1264 gsprintf($outhandle,
1265 "BasPlug: {BasPlug.could_not_extract_language}\n",
1266 $filename, $self->{'default_language'});
1267 }
1268 $best_language = $self->{'default_language'};
1269 }
1270 } else { # <= 3 suggestions
1271 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
1272 if (!defined $language) {
1273 if ($self->{'verbosity'}>2) {
1274 gsprintf($outhandle,
1275 "BasPlug: {BasPlug.could_not_extract_language}\n",
1276 $filename, $self->{'default_language'});
1277 }
1278 $language = $self->{'default_language'};
1279 }
1280 if (!defined $encoding) {
1281 if ($self->{'verbosity'}>2) {
1282 gsprintf($outhandle,
1283 "BasPlug: {BasPlug.could_not_extract_encoding}\n",
1284 $filename, $self->{'default_encoding'});
1285 }
1286 $encoding = $self->{'default_encoding'};
1287 }
1288 $best_language = $language;
1289 if (! $best_encoding ) { # may already be set... eg from html meta tag
1290 $best_encoding = $encoding;
1291 }
1292 }
1293
1294 my $text_copy = $text;
1295 if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) {
1296 # the text is valid utf8, so assume that's the real encoding
1297 # (since textcat is based on probabilities)
1298 $best_encoding = 'utf8';
1299 }
1300
1301 # check for equivalents where textcat doesn't have some encodings...
1302 # eg MS versions of standard encodings
1303 if ($best_encoding =~ /^iso_8859_(\d+)/) {
1304 my $iso = $1; # which variant of the iso standard?
1305 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
1306 if ($text =~ /[\x80-\x9f]/) {
1307 # Western Europe
1308 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
1309 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
1310 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
1311 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
1312 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
1313 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
1314 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
1315 }
1316 }
1317
1318 if ($best_encoding !~ /^(ascii|utf8|unicode)$/ &&
1319 !defined $encodings::encodings->{$best_encoding}) {
1320 if ($self->{'verbosity'}) {
1321 gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
1322 $filename, $best_encoding, $self->{'default_encoding'});
1323 }
1324 $best_encoding = $self->{'default_encoding'};
1325 }
1326
1327 return ($best_language, $best_encoding);
1328}
1329
1330# add any extra metadata that's been passed around from one
1331# plugin to another.
1332# extra_metadata uses add_utf8_metadata so it expects metadata values
1333# to already be in utf8
1334sub extra_metadata {
1335 my $self = shift (@_);
1336 my ($doc_obj, $cursection, $metadata) = @_;
1337
1338 my $associate_tail_re = $self->{'associate_tail_re'};
1339
1340 foreach my $field (keys(%$metadata)) {
1341 # $metadata->{$field} may be an array reference
1342 if ($field eq "gsdlassocfile_tobe") {
1343 # 'gsdlassocfile_tobe' is artificially introduced metadata
1344 # that is used to signal that certain additional files should
1345 # be tied to this document. Useful in situations where a
1346 # metadata pass in the plugin pipeline works out some files
1347 # need to be associated with a document, but the document hasn't
1348 # been formed yet.
1349
1350 my $equiv_form = "";
1351 foreach my $gaf (@{$metadata->{$field}}) {
1352 my ($full_filename,$mimetype) = ($gaf =~ m/^(.*):(.*):$/);
1353 my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1354 my $filename = $full_filename;
1355
1356 $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1357
1358 # work out extended tail extension (i.e. matching tail re)
1359
1360 my ($file_prefix,$file_extended_ext)
1361 = $self->root_ext_split($tail_filename,$associate_tail_re);
1362 my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.*)\..*$/);
1363
1364 my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
1365 my $start_doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
1366 my $srcicon = "_icon".$doc_ext."_";
1367 my $end_doclink = "</a>";
1368
1369 my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1370
1371 if (defined $pre_doc_ext) {
1372 # for metadata such as [mp3._edited] [mp3._full] ...
1373 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1374 }
1375
1376 # for multiple metadata such as [mp3.assoclink]
1377 $doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1378
1379 $equiv_form .= " $assoc_form";
1380 }
1381 $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1382 }
1383 elsif (ref ($metadata->{$field}) eq "ARRAY") {
1384 map {
1385 $doc_obj->add_utf8_metadata ($cursection, $field, $_);
1386 } @{$metadata->{$field}};
1387 } else {
1388 $doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1389 }
1390 }
1391}
1392
1393# initialise metadata extractors
1394sub initialise_extractors {
1395 my $self = shift (@_);
1396
1397 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1398 &acronym::initialise_acronyms();
1399 }
1400}
1401
1402# finalise metadata extractors
1403sub finalise_extractors {
1404 my $self = shift (@_);
1405
1406 if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
1407 &acronym::finalise_acronyms();
1408 }
1409}
1410
1411# FIRSTNNN: extract the first NNN characters as metadata
1412sub extract_first_NNNN_characters {
1413 my $self = shift (@_);
1414 my ($textref, $doc_obj, $thissection) = @_;
1415
1416 foreach my $size (split /,/, $self->{'first'}) {
1417 my $tmptext = $$textref;
1418 $tmptext =~ s/^\s+//;
1419 $tmptext =~ s/\s+$//;
1420 $tmptext =~ s/\s+/ /gs;
1421 $tmptext = substr ($tmptext, 0, $size);
1422 $tmptext =~ s/\s\S*$/&#8230;/;
1423 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1424 }
1425}
1426
1427sub extract_email {
1428 my $self = shift (@_);
1429 my ($textref, $doc_obj, $thissection) = @_;
1430 my $outhandle = $self->{'outhandle'};
1431
1432 gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
1433 if ($self->{'verbosity'} > 2);
1434
1435 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
1436 @email = sort @email;
1437
1438# if($self->{"new_extract_email"} == 0)
1439# {
1440# my @email2 = ();
1441# foreach my $address (@email)
1442# {
1443# if (!(join(" ",@email2) =~ m/(^| )$address( |$)/ ))
1444# {
1445# push @email2, $address;
1446# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1447# # print $outhandle " extracting $address\n"
1448# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1449# if ($self->{'verbosity'} > 3);
1450# }
1451# }
1452# }
1453# else
1454# {
1455 my $hashExistMail = {};
1456 foreach my $address (@email) {
1457 if (!(defined $hashExistMail->{$address}))
1458 {
1459 $hashExistMail->{$address} = 1;
1460 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1461 gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1462 if ($self->{'verbosity'} > 3);
1463 }
1464 }
1465 gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
1466 if ($self->{'verbosity'} > 2);
1467}
1468
1469# extract metadata
1470sub auto_extract_metadata {
1471
1472 my $self = shift (@_);
1473 my ($doc_obj) = @_;
1474
1475 if ($self->{'extract_email'}) {
1476 my $thissection = $doc_obj->get_top_section();
1477 while (defined $thissection) {
1478 my $text = $doc_obj->get_text($thissection);
1479 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1480 $thissection = $doc_obj->get_next_section ($thissection);
1481 }
1482 }
1483 if ($self->{'extract_placenames'}) {
1484 my $thissection = $doc_obj->get_top_section();
1485 while (defined $thissection) {
1486 my $text = $doc_obj->get_text($thissection);
1487 $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1488 $thissection = $doc_obj->get_next_section ($thissection);
1489 }
1490 }
1491
1492 if ($self->{'extract_keyphrases'} || $self->{'extract_keyphrases_kea4'}) {
1493 $self->extract_keyphrases($doc_obj);
1494 }
1495
1496 if ($self->{'first'}) {
1497 my $thissection = $doc_obj->get_top_section();
1498 while (defined $thissection) {
1499 my $text = $doc_obj->get_text($thissection);
1500 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1501 $thissection = $doc_obj->get_next_section ($thissection);
1502 }
1503 }
1504
1505 if ($self->{'extract_acronyms'}) {
1506 my $thissection = $doc_obj->get_top_section();
1507 while (defined $thissection) {
1508 my $text = $doc_obj->get_text($thissection);
1509 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1510 $thissection = $doc_obj->get_next_section ($thissection);
1511 }
1512 }
1513
1514 if ($self->{'markup_acronyms'}) {
1515 my $thissection = $doc_obj->get_top_section();
1516 while (defined $thissection) {
1517 my $text = $doc_obj->get_text($thissection);
1518 $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1519 $doc_obj->delete_text($thissection);
1520 $doc_obj->add_text($thissection, $text);
1521 $thissection = $doc_obj->get_next_section ($thissection);
1522 }
1523 }
1524
1525 if($self->{'extract_historical_years'}) {
1526 my $thissection = $doc_obj->get_top_section();
1527 while (defined $thissection) {
1528
1529 my $text = $doc_obj->get_text($thissection);
1530 &DateExtract::get_date_metadata($text, $doc_obj,
1531 $thissection,
1532 $self->{'no_bibliography'},
1533 $self->{'maximum_year'},
1534 $self->{'maximum_century'});
1535 $thissection = $doc_obj->get_next_section ($thissection);
1536 }
1537 }
1538}
1539
1540
1541#adding kea keyphrases
1542sub extract_keyphrases
1543{
1544 my $self = shift(@_);
1545 my $doc_obj = shift(@_);
1546
1547 # Use Kea 3.0 unless 4.0 has been specified
1548 my $kea_version = "3.0";
1549 if ($self->{'extract_keyphrases_kea4'}) {
1550 $kea_version = "4.0";
1551 }
1552
1553 # Check that Kea exists, and tell the user where to get it if not
1554 my $keahome = &Kea::get_Kea_directory($kea_version);
1555 if (!-e $keahome) {
1556 gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version);
1557 return;
1558 }
1559
1560 my $thissection = $doc_obj->get_top_section();
1561 my $text = "";
1562 my $list;
1563
1564 #loop through sections to gather whole doc
1565 while (defined $thissection) {
1566 my $sectiontext = $doc_obj->get_text($thissection);
1567 $text = $text.$sectiontext;
1568 $thissection = $doc_obj->get_next_section ($thissection);
1569 }
1570
1571 if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
1572 $list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});
1573 } else { #otherwise call Kea with no options
1574 $list = &Kea::extract_KeyPhrases ($kea_version, $text);
1575 }
1576
1577 if ($list){
1578 # if a list of kea keyphrases was returned (ie not empty)
1579 if ($self->{'verbosity'}) {
1580 gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1581 }
1582
1583 #add metadata to top section
1584 $thissection = $doc_obj->get_top_section();
1585
1586 # add all key phrases as one metadata
1587 $doc_obj->add_metadata($thissection, "Keyphrases", $list);
1588
1589 # add individual key phrases as multiple metadata
1590 foreach my $keyphrase (split(',', $list)) {
1591 $keyphrase =~ s/^\s+|\s+$//g;
1592 $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1593 }
1594 }
1595}
1596
1597
1598# extract acronyms from a section in a document. progress is
1599# reported to outhandle based on the verbosity. both the Acronym
1600# and the AcronymKWIC metadata items are created.
1601
1602sub extract_acronyms {
1603 my $self = shift (@_);
1604 my ($textref, $doc_obj, $thissection) = @_;
1605 my $outhandle = $self->{'outhandle'};
1606
1607 # print $outhandle " extracting acronyms ...\n"
1608 gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
1609 if ($self->{'verbosity'} > 2);
1610
1611 my $acro_array = &acronym::acronyms($textref);
1612
1613 foreach my $acro (@$acro_array) {
1614
1615 #check that this is the first time ...
1616 my $seen_before = "false";
1617 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1618 foreach my $thisAcro (@$previous_data) {
1619 if ($thisAcro eq $acro->to_string()) {
1620 $seen_before = "true";
1621 if ($self->{'verbosity'} >= 4) {
1622 gsprintf($outhandle, " {BasPlug.already_seen} " .
1623 $acro->to_string() . "\n");
1624 }
1625 }
1626 }
1627
1628 if ($seen_before eq "false") {
1629 #write it to the file ...
1630 $acro->write_to_file();
1631
1632 #do the normal acronym
1633 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
1634 gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
1635 if ($self->{'verbosity'} > 3);
1636 }
1637 }
1638
1639 gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
1640 if ($self->{'verbosity'} > 2);
1641}
1642
1643sub markup_acronyms {
1644 my $self = shift (@_);
1645 my ($text, $doc_obj, $thissection) = @_;
1646 my $outhandle = $self->{'outhandle'};
1647
1648 gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
1649 if ($self->{'verbosity'} > 2);
1650
1651 #self is passed in to check for verbosity ...
1652 $text = &acronym::markup_acronyms($text, $self);
1653
1654 gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
1655 if ($self->{'verbosity'} > 2);
1656
1657 return $text;
1658}
1659
1660sub compile_stats {
1661 my $self = shift(@_);
1662 my ($stats) = @_;
1663
1664 $stats->{'num_processed'} += $self->{'num_processed'};
1665 $stats->{'num_not_processed'} += $self->{'num_not_processed'};
1666 $stats->{'num_archives'} += $self->{'num_archives'};
1667
1668}
1669
1670sub associate_cover_image {
1671 my $self = shift;
1672 my ($doc_obj, $filename) = @_;
1673
1674 $filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1675 if (exists $self->{'covers_missing_cache'}->{$filename}) {
1676 # don't stat() for existence eg for multiple document input files
1677 # (eg SplitPlug)
1678 return;
1679 }
1680
1681 my $top_section=$doc_obj->get_top_section();
1682
1683 if (-e $filename) {
1684 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1685 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1686 } else {
1687 my $upper_filename = $filename;
1688 $upper_filename =~ s/jpg$/JPG/;
1689 if (-e $upper_filename) {
1690 $doc_obj->associate_file($upper_filename, "cover.jpg",
1691 "image/jpeg");
1692 $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1693 } else {
1694 # file doesn't exist, so record the fact that it's missing so
1695 # we don't stat() again (stat is slow)
1696 $self->{'covers_missing_cache'}->{$filename} = 1;
1697 }
1698 }
1699
1700}
1701
1702
1703# Overridden by exploding plugins (eg. ISISPlug)
1704sub clean_up_after_exploding
1705{
1706 my $self = shift(@_);
1707}
1708
1709
17101;
Note: See TracBrowser for help on using the repository browser.