Context Navigation

BasPlug.pm@ 11467

Last change on this file since 11467 was 11389, checked in by jrm21, 18 years ago

try to get the encoding from a '<meta http-equiv' tag if HTML.

make sure we add the filename/Source metadata as utf-8, so that we
won't create invalid xml files if the filename is in some other encoding.

If the file contents aren't in utf8, assume the filename is in the same
encoding as the contents.

print out the encoding we used to read this file if verbosity >= 3.

Property svn:keywords set to Author Date Id Revision

File size: 46.1 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	BEGIN {
29	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30	}
31
32	eval {require bytes};
33
34	# suppress the annoying "subroutine redefined" warning that various
35	# plugins cause under perl 5.6
36	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38	use strict;
39	no strict 'subs';
40	no strict 'refs'; # allow filehandles to be variables and viceversa
41
42	use File::Basename;
43
44	use Kea;
45	use multiread;
46	use encodings;
47	use unicode;
48	use cnseg;
49	use acronym;
50	use textcat;
51	use doc;
52	eval "require diagnostics"; # some perl distros (eg mac) don't have this
53	use DateExtract;
54	use ghtml;
55	use gsprintf 'gsprintf';
56	use printusage;
57	use parse2;
58
59
60	use GISBasPlug;
61
62	@BasPlug::ISA = ( GISBasPlug );
63
64	my $unicode_list =
65	[ { 'name' => "ascii",
66	'desc' => "{BasPlug.input_encoding.ascii}" },
67	{ 'name' => "utf8",
68	'desc' => "{BasPlug.input_encoding.utf8}" },
69	{ 'name' => "unicode",
70	'desc' => "{BasPlug.input_encoding.unicode}" } ];
71
72	my $auto_unicode_list =
73	[ { 'name' => "auto",
74	'desc' => "{BasPlug.input_encoding.auto}" } ];
75
76	my $e = $encodings::encodings;
77	foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e))
78	{
79	my $hashEncode =
80	{'name' => $enc,
81	'desc' => $e->{$enc}->{'name'}};
82
83	push(@{$unicode_list},$hashEncode);
84	}
85
86	push(@{$auto_unicode_list},@{$unicode_list});
87
88	my $arguments =
89	[ { 'name' => "process_exp",
90	'desc' => "{BasPlug.process_exp}",
91	'type' => "regexp",
92	'deft' => "",
93	'reqd' => "no" },
94	{ 'name' => "block_exp",
95	'desc' => "{BasPlug.block_exp}",
96	'type' => "regexp",
97	'deft' => "",
98	'reqd' => "no" },
99	{ 'name' => "smart_block",
100	'desc' => "{BasPlug.smart_block}",
101	'type' => "flag",
102	'reqd' => "no" },
103	{ 'name' => "associate_ext",
104	'desc' => "{BasPlug.associate_ext}",
105	'type' => "string",
106	'reqd' => "no" },
107	{ 'name' => "associate_tail_re",
108	'desc' => "{BasPlug.associate_tail_re}",
109	'type' => "string",
110	'reqd' => "no" },
111	{ 'name' => "input_encoding",
112	'desc' => "{BasPlug.input_encoding}",
113	'type' => "enum",
114	'list' => $auto_unicode_list,
115	'reqd' => "no" ,
116	'deft' => "auto" } ,
117	{ 'name' => "default_encoding",
118	'desc' => "{BasPlug.default_encoding}",
119	'type' => "enum",
120	'list' => $unicode_list,
121	'reqd' => "no",
122	'deft' => "utf8" },
123	{ 'name' => "extract_language",
124	'desc' => "{BasPlug.extract_language}",
125	'type' => "flag",
126	'reqd' => "no" },
127	{ 'name' => "default_language",
128	'desc' => "{BasPlug.default_language}",
129	'type' => "string",
130	'deft' => "en",
131	'reqd' => "no" },
132	{ 'name' => "extract_acronyms",
133	'desc' => "{BasPlug.extract_acronyms}",
134	'type' => "flag",
135	'reqd' => "no" },
136	{ 'name' => "markup_acronyms",
137	'desc' => "{BasPlug.markup_acronyms}",
138	'type' => "flag",
139	'reqd' => "no" },
140	{ 'name' => "extract_keyphrases",
141	'desc' => "{BasPlug.extract_keyphrases}",
142	'type' => "flag",
143	'reqd' => "no" },
144	{ 'name' => "extract_keyphrases_kea4",
145	'desc' => "{BasPlug.extract_keyphrases_kea4}",
146	'type' => "flag",
147	'reqd' => "no" },
148	{ 'name' => "extract_keyphrase_options",
149	'desc' => "{BasPlug.extract_keyphrase_options}",
150	'type' => "string",
151	'deft' => "",
152	'reqd' => "no" },
153	{ 'name' => "first",
154	'desc' => "{BasPlug.first}",
155	'type' => "string",
156	'reqd' => "no" },
157	{ 'name' => "extract_email",
158	'desc' => "{BasPlug.extract_email}",
159	'type' => "flag",
160	'reqd' => "no" },
161	{ 'name' => "extract_historical_years",
162	'desc' => "{BasPlug.extract_historical_years}",
163	'type' => "flag",
164	'reqd' => "no" },
165	{ 'name' => "maximum_year",
166	'desc' => "{BasPlug.maximum_year}",
167	'type' => "int",
168	'deft' => (localtime)[5]+1900,
169	'char_length' => "4",
170	#'range' => "2,100",
171	'reqd' => "no"},
172	{ 'name' => "maximum_century",
173	'desc' => "{BasPlug.maximum_century}",
174	'type' => "string",
175	'deft' => "-1",
176	'reqd' => "no" },
177	{ 'name' => "no_bibliography",
178	'desc' => "{BasPlug.no_bibliography}",
179	'type' => "flag",
180	'reqd' => "no"},
181	{ 'name' => "no_cover_image",
182	'desc' => "{BasPlug.no_cover_image}",
183	'type' => "flag",
184	'reqd' => "no" },
185	{ 'name' => "separate_cjk",
186	'desc' => "{BasPlug.separate_cjk}",
187	'type' => "flag",
188	'reqd' => "no",
189	'hiddengli' => "yes" },
190	{ 'name' => "new_extract_email",
191	'desc' => "",
192	'type' => "flag",
193	'reqd' => "no",
194	'hiddengli' => "yes" } ];
195
196	my $gis_arguments =
197	[ { 'name' => "extract_placenames",
198	'desc' => "{GISBasPlug.extract_placenames}",
199	'type' => "flag",
200	'reqd' => "no" },
201	{ 'name' => "gazetteer",
202	'desc' => "{GISBasPlug.gazetteer}",
203	'type' => "string",
204	'reqd' => "no" },
205	{ 'name' => "place_list",
206	'desc' => "{GISBasPlug.place_list}",
207	'type' => "flag",
208	'reqd' => "no" } ];
209
210
211	my $options = { 'name' => "BasPlug",
212	'desc' => "{BasPlug.desc}",
213	'abstract' => "yes",
214	'inherits' => "no",
215	'args' => $arguments };
216
217
218	sub set_keepold {
219	my $self = shift(@_);
220	my ($keepold) = @_;
221
222	$self->{'keepold'} = $keepold;
223	}
224
225	sub get_arguments
226	{
227	my $self = shift(@_);
228	my $optionlistref = $self->{'option_list'};
229	my @optionlist = @$optionlistref;
230	my $pluginoptions = pop(@$optionlistref);
231	my $pluginarguments = $pluginoptions->{'args'};
232	return $pluginarguments;
233	}
234
235
236	sub print_xml_usage
237	{
238	my $self = shift(@_);
239
240	# XML output is always in UTF-8
241	gsprintf::output_strings_in_UTF8;
242
243	PrintUsage::print_xml_header();
244	$self->print_xml();
245	}
246
247
248	sub print_xml
249	{
250	my $self = shift(@_);
251
252	my $optionlistref = $self->{'option_list'};
253	my @optionlist = @$optionlistref;
254	my $pluginoptions = shift(@$optionlistref);
255	return if (!defined($pluginoptions));
256
257	gsprintf(STDERR, "<PlugInfo>\n");
258	gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
259	my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
260	$desc =~ s/</&lt;/g; # doubly escaped
261	$desc =~ s/>/&gt;/g;
262
263	gsprintf(STDERR, " <Desc>$desc</Desc>\n");
264	gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
265	gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
266	gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} \|\| "no") . "</Explodes>\n");
267	gsprintf(STDERR, " <Arguments>\n");
268	if (defined($pluginoptions->{'args'})) {
269	&PrintUsage::print_options_xml($pluginoptions->{'args'});
270	}
271
272	# Recurse up the plugin hierarchy
273	$self->print_xml();
274
275	gsprintf(STDERR, " </Arguments>\n");
276	gsprintf(STDERR, "</PlugInfo>\n");
277	}
278
279
280	sub print_txt_usage
281	{
282	my $self = shift(@_);
283	# Print the usage message for a plugin (recursively)
284	my $descoffset = $self->determine_description_offset(0);
285	$self->print_plugin_usage($descoffset, 1);
286	}
287
288
289	sub determine_description_offset
290	{
291	my $self = shift(@_);
292	my $maxoffset = shift(@_);
293
294	my $optionlistref = $self->{'option_list'};
295	my @optionlist = @$optionlistref;
296	my $pluginoptions = shift(@$optionlistref);
297	return $maxoffset if (!defined($pluginoptions));
298
299	# Find the length of the longest option string of this plugin
300	my $pluginargs = $pluginoptions->{'args'};
301	if (defined($pluginargs)) {
302	my $longest = &PrintUsage::find_longest_option_string($pluginargs);
303	if ($longest > $maxoffset) {
304	$maxoffset = $longest;
305	}
306	}
307
308	# Recurse up the plugin hierarchy
309	$maxoffset = $self->determine_description_offset($maxoffset);
310	$self->{'option_list'} = \@optionlist;
311	return $maxoffset;
312	}
313
314
315	sub print_plugin_usage
316	{
317	my $self = shift(@_);
318	my $descoffset = shift(@_);
319	my $isleafclass = shift(@_);
320
321	my $optionlistref = $self->{'option_list'};
322	my @optionlist = @$optionlistref;
323	my $pluginoptions = shift(@$optionlistref);
324	return if (!defined($pluginoptions));
325
326	my $pluginname = $pluginoptions->{'name'};
327	my $pluginargs = $pluginoptions->{'args'};
328	my $plugindesc = $pluginoptions->{'desc'};
329
330	# Produce the usage information using the data structure above
331	if ($isleafclass) {
332	if (defined($plugindesc)) {
333	gsprintf(STDERR, "$plugindesc\n\n");
334	}
335	gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
336	}
337
338	# Display the plugin options, if there are some
339	if (defined($pluginargs)) {
340	# Calculate the column offset of the option descriptions
341	my $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
342
343	if ($isleafclass) {
344	gsprintf(STDERR, " {common.specific_options}:\n");
345	}
346	else {
347	gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
348	}
349
350	# Display the plugin options
351	&PrintUsage::print_options_txt($pluginargs, $optiondescoffset);
352	}
353
354	# Recurse up the plugin hierarchy
355	$self->print_plugin_usage($descoffset, 0);
356	$self->{'option_list'} = \@optionlist;
357	}
358
359
360	sub new {
361	# Set Encodings to the list!!
362
363
364	# Start the BasPlug Constructor
365	my $class = shift (@_);
366	my ($pluginlist,$args,$hashArgOptLists) = @_;
367	push(@$pluginlist, $class);
368	my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;
369
370	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
371	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
372
373	if (GISBasPlug::has_mapdata()) {
374	push(@$arguments,@$gis_arguments);
375	}
376
377	my $self = {};
378	$self->{'outhandle'} = STDERR;
379	$self->{'option_list'} = $hashArgOptLists->{"OptList"};
380	$self->{"info_only"} = 0;
381
382	# Check if gsdlinfo is in the argument list or not - if it is, don't parse
383	# the args, just return the object.
384	foreach my $strArg (@{$args})
385	{
386	if($strArg eq "-gsdlinfo")
387	{
388	$self->{"info_only"} = 1;
389	return bless $self, $class;
390	}
391	}
392
393	if(!parse2::parse($args,$hashArgOptLists->{"ArgList"},$self))
394	{
395	my $classTempClass = bless $self, $class;
396	print STDERR "<BadPlugin p=$plugin_name>\n";
397	&gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
398	$classTempClass->print_txt_usage(""); # Use default resource bundle
399	die "\n";
400	}
401
402
403	delete $self->{"info_only"};
404	# else parsing was successful.
405
406	$self->{'plugin_type'} = $plugin_name;
407	#$self->{'outhandle'} = STDERR;
408	$self->{'textcat'} = new textcat();
409	$self->{'num_processed'} = 0;
410	$self->{'num_not_processed'} = 0;
411	$self->{'num_blocked'} = 0;
412	$self->{'num_archives'} = 0;
413	$self->{'cover_image'} = 1; # cover image is on by default
414	$self->{'cover_image'} = 0 if ($self->{'no_cover_image'});
415	#$self->{'option_list'} = $hashArgOptLists->{"OptList"};
416
417	my $associate_ext = $self->{'associate_ext'};
418	if ((defined $associate_ext) && ($associate_ext ne "")) {
419
420	my $associate_tail_re = $self->{'associate_tail_re'};
421	if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
422	my $outhandle = $self->{'outhandle'};
423	print $outhandle "Warning: can only specify 'associate_ext' or 'associate_tail_re'\n";
424	print $outhandle " defaulting to 'associate_tail_re'\n";
425	}
426	else {
427	my @exts = split(/,/,$associate_ext);
428
429	my @exts_bracketed = map { $_ = "(?:\\.$_)" } @exts;
430	my $associate_tail_re = join("\|",@exts_bracketed);
431	$self->{'associate_tail_re'} = $associate_tail_re;
432	}
433
434	delete $self->{'associate_ext'};
435	}
436
437	$self->{'shared_fileroot'} = {};
438	$self->{'file_blocks'} = {};
439
440	if ($self->{'extract_placenames'}) {
441
442	my $outhandle = $self->{'outhandle'};
443
444	my $places_ref
445	= GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});
446
447	if (!defined $places_ref) {
448	print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
449	print $outhandle " No placename extraction will take place.\n";
450	$self->{'extract_placenames'} = undef;
451	}
452	else {
453	$self->{'places'} = $places_ref;
454	}
455	}
456
457	return bless $self, $class;
458
459	}
460
461	# initialize BasPlug options
462	# if init() is overridden in a sub-class, remember to call BasPlug::init()
463	sub init {
464	my $self = shift (@_);
465	my ($verbosity, $outhandle, $failhandle) = @_;
466
467	# verbosity is passed through from the processor
468	$self->{'verbosity'} = $verbosity;
469
470	# as are the outhandle and failhandle
471	$self->{'outhandle'} = $outhandle if defined $outhandle;
472	$self->{'failhandle'} = $failhandle;
473
474	# set process_exp and block_exp to defaults unless they were
475	# explicitly set
476
477	if ((!$self->is_recursive()) and
478	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
479
480	$self->{'process_exp'} = $self->get_default_process_exp ();
481	if ($self->{'process_exp'} eq "") {
482	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
483	}
484	}
485
486	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
487	$self->{'block_exp'} = $self->get_default_block_exp ();
488	}
489
490	}
491
492	sub begin {
493	my $self = shift (@_);
494	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
495	$self->initialise_extractors();
496	}
497
498	sub end {
499	# potentially called at the end of each plugin pass
500	# import.pl only has one plugin pass, but buildcol.pl has multiple ones
501
502	my ($self) = @_;
503	$self->finalise_extractors();
504	}
505
506	sub deinit {
507	# called only once, after all plugin passes have been done
508
509	my ($self) = @_;
510	}
511
512	# this function should be overridden to return 1
513	# in recursive plugins
514	sub is_recursive {
515	my $self = shift (@_);
516
517	return 0;
518	}
519
520	sub get_default_block_exp {
521	my $self = shift (@_);
522
523	return "";
524	}
525
526	sub get_default_process_exp {
527	my $self = shift (@_);
528
529	return "";
530	}
531
532	# default implementation is to do nothing.
533	sub store_block_files
534	{
535	my $self =shift (@_);
536	my ($filename) = @_;
537	return;
538	}
539
540	#default implementation is to block a file with same name as this, but extension jpg or JPG, if cover_images is on.
541	sub block_cover_image
542	{
543	my $self =shift;
544	my $filename = shift;
545
546	if ($self->{'cover_image'}) {
547	my $coverfile = $filename;
548	$coverfile =~ s/\.[^\\\/\.]+$/\.jpg/;
549	if (!-e $coverfile) {
550	$coverfile =~ s/jpg$/JPG/;
551	}
552	if (-e $coverfile) {
553	$self->{'file_blocks'}->{$coverfile} = 1;
554	}
555	}
556
557	return;
558	}
559
560	sub root_ext_split
561	{
562	my $self = shift (@_);
563	my ($filename,$tail_re) = @_;
564
565	my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
566
567	if ((!defined $file_prefix) \|\| (!defined $file_ext)) {
568	($file_prefix,$file_ext) = ($filename =~ m/^(.)(\..?)$/);
569	}
570
571	return ($file_prefix,$file_ext);
572	}
573
574	sub metadata_read {
575	my $self = shift (@_);
576	my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
577	# Keep track of filenames with same root but different extensions
578	# Used to support -associate_ext and the more generalised
579	# -associate_tail_re
580
581	my $associate_tail_re = $self->{'associate_tail_re'};
582	if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
583
584	my ($file_prefix,$file_ext)
585	= $self->root_ext_split($file,$associate_tail_re);
586
587	if ((defined $file_prefix) && (defined $file_ext)) {
588
589	my $shared_fileroot = $self->{'shared_fileroot'};
590	if (!defined $shared_fileroot->{$file_prefix}) {
591	my $file_prefix_rec = { 'tie_to' => undef,
592	'exts' => {} };
593	$shared_fileroot->{$file_prefix} = $file_prefix_rec;
594	}
595
596	my $file_prefix_rec = $shared_fileroot->{$file_prefix};
597
598	my $process_exp = $self->{'process_exp'};
599
600	if ($file =~ m/$process_exp/) {
601	# This is the document the others should be tied to
602	$file_prefix_rec->{'tie_to'} = $file_ext;
603	}
604	else {
605	if ($file_ext =~ m/$associate_tail_re$/) {
606	$file_prefix_rec->{'exts'}->{$file_ext} = 1;
607	}
608	}
609
610	}
611	}
612
613	# now check whether we are actually processing this
614	my $filename = $file;
615	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
616	if ($self->{'process_exp'} eq "" \|\| $filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
617	return undef; # can't recognise
618	}
619
620	# do smart blocking if appropriate
621	if ($self->{'smart_block'}) {
622	$self->store_block_files($filename);
623	}
624	# block the cover image if there is one
625	if ($self->{'cover_image'}) {
626	$self->block_cover_image($filename);
627	}
628
629	return 1;
630	}
631
632	sub tie_to_filename
633	{
634	my $self = shift (@_);
635
636	my ($file_ext,$file_prefix_rec) = @_;
637
638	if (defined $file_prefix_rec) {
639	my $tie_to = $file_prefix_rec->{'tie_to'};
640
641	if (defined $tie_to) {
642	if ($tie_to eq $file_ext) {
643	return 1;
644	}
645	}
646	}
647
648	return 0;
649	}
650
651	sub tie_to_assoc_file
652	{
653	my $self = shift (@_);
654	my ($file_ext,$file_prefix_rec) = @_;
655
656	if (defined $file_prefix_rec) {
657	my $tie_to = $file_prefix_rec->{'tie_to'};
658	if (defined $tie_to) {
659
660	my $exts = $file_prefix_rec->{'exts'};
661
662	my $has_file_ext = $exts->{$file_ext};
663
664	if ($has_file_ext) {
665	return 1;
666	}
667	}
668	}
669
670	return 0;
671	}
672
673
674	sub associate_with
675	{
676	my $self = shift (@_);
677	my ($file, $filename, $metadata) = @_;
678
679	my $associate_tail_re = $self->{'associate_tail_re'};
680	return 0 if (!$associate_tail_re);
681
682	# If file, see if matches with "tie_to" doc or is one of the
683	# associated filename extensions.
684
685	my ($file_prefix,$file_ext) = $self->root_ext_split($file,$associate_tail_re);
686
687	if ((defined $file_prefix) && (defined $file_ext)) {
688
689	my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};
690
691	if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {
692
693	# Set up gsdlassocfile_tobe
694
695	my $exts = $file_prefix_rec->{'exts'};
696
697	if (!defined $metadata->{'gsdlassocfile_tobe'}) {
698	$metadata->{'gsdlassocfile_tobe'} = [];
699	}
700
701	my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};
702
703	my ($full_prefix) = ($filename =~ m/^(.)\..?$/);
704	foreach my $e (keys %$exts) {
705	my $assoc_file = "$full_prefix$e";
706	print STDERR " $self->{'plugin_type'}: Associating $file_prefix$e with $file_prefix_rec->{'tie_to'} version\n";
707	my $mime_type = ""; # let system auto detect this
708	push(@$assoc_tobe,"$assoc_file:$mime_type:");
709	}
710
711	}
712	elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
713
714
715	# a form of smart block
716	return 1;
717	}
718	}
719
720	return 0;
721	}
722
723
724	sub read_block {
725	my $self = shift (@_);
726
727	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
728
729
730	my $filename = $file;
731	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
732
733	if ($self->associate_with($file,$filename,$metadata)) {
734	# a form of smart block
735	$self->{'num_blocked'} ++;
736	return (0,undef); # blocked
737	}
738
739	my $smart_block = $self->{'smart_block'};
740	my $smart_block_BN = $self->{'smart_block_BN'};
741
742	if ($smart_block \|\| $smart_block_BN) {
743	if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
744	$self->{'num_blocked'} ++;
745	return (0,undef); # blocked
746	}
747	} else {
748	if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
749	$self->{'num_blocked'} ++;
750	return (0,undef); # blocked
751	}
752	if ($self->{'cover_image'}) {
753	if (defined $self->{'file_blocks'}->{$filename} && $self->{'file_blocks'}->{$filename} == 1){
754	$self->{'num_blocked'} ++;
755	return (0,undef); # blocked
756	}
757	}
758	}
759
760	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
761	return (undef,undef); # can't recognise
762	}
763
764	return (1,$filename);
765	}
766
767	sub read_tidy_file {
768
769	my $self = shift (@_);
770
771	my ($file) = @_;
772
773	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
774
775	return $file;
776	}
777
778
779
780	# The BasPlug read_into_doc_obj() function. This function does all the
781	# right things to make general options work for a given plugin. It reads in
782	# a file and sets up a slew of metadata all saved in doc_obj, which
783	# it then returns as part of a tuple (process_status,doc_obj)
784	#
785	# Much of this functionality used to reside in read, but it was broken
786	# down into a supporting routine to make the code more flexible.
787	#
788	# recursive plugins (e.g. RecPlug) and specialized plugins like those
789	# capable of processing many documents within a single file (e.g.
790	# GMLPlug) will normally want to implement their own version of
791	# read_into_doc_obj()
792	#
793	# Note that $base_dir might be "" and that $file might
794	# include directories
795	sub read_into_doc_obj {
796	my $self = shift (@_);
797	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
798
799	if ($self->is_recursive()) {
800	gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
801	}
802
803	my $outhandle = $self->{'outhandle'};
804
805	my ($block_status,$filename) = $self->read_block(@_);
806	return $block_status if ((!defined $block_status) \|\| ($block_status==0));
807	$file = $self->read_tidy_file($file);
808
809	# Do encoding stuff
810	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
811	if ($self->{'verbosity'} > 2) {
812	print $outhandle "BasPlug: reading $file as ($encoding,$language)\n";
813	}
814
815	# create a new document
816	my $doc_obj = new doc ($filename, "indexed_doc");
817	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
818	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
819	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
820	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
821	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
822
823	my ($filemeta) = $file =~ /([^\\\/]+)$/;
824	# how do we know what encoding the filename is in?
825	# assume it is in the same encoding as its contents
826	if ($encoding !~ /(?:ascii\|utf8\|unicode)/) {
827	$filemeta = unicode::unicode2utf8(
828	unicode::convert2unicode($encoding, \$filemeta)
829	);
830	}
831	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
832	if ($self->{'cover_image'}) {
833	$self->associate_cover_image($doc_obj, $filename);
834	}
835
836	# read in file ($text will be in utf8)
837	my $text = "";
838	$self->read_file ($filename, $encoding, $language, \$text);
839
840	if (!length ($text)) {
841	my $plugin_name = ref ($self);
842	if ($gli) {
843	print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
844	}
845	gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
846
847	my $failhandle = $self->{'failhandle'};
848	gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
849	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
850	$self->{'num_not_processed'} ++;
851
852	return (0,undef); # what should we return here?? error but don't want to pass it on
853	}
854
855	# include any metadata passed in from previous plugins
856	# note that this metadata is associated with the top level section
857
858	my $associate_tail_re = $self->{'associate_tail_re'};
859
860	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
861
862	# do plugin specific processing of doc_obj
863	unless (defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
864	$text = '';
865	undef $text;
866	print STDERR "<ProcessingError n='$file'>\n" if ($gli);
867	return (-1,undef);
868	}
869	$text='';
870	undef $text;
871
872	# do any automatic metadata extraction
873	$self->auto_extract_metadata ($doc_obj);
874
875	# add an OID
876	# see if there is a plugin-specific set_OID function...
877	if (defined ($self->can('set_OID'))) {
878	# it will need $doc_obj to set the Identifier metadata...
879	$self->set_OID($doc_obj);
880	} else {
881	# use the default set_OID() in doc.pm
882	$doc_obj->set_OID();
883	}
884
885	return (1,$doc_obj);
886	}
887
888
889	# The BasPlug read() function. This function calls read_into_doc_obj()
890	# to ensure all the right things to make general options work for a
891	# given plugin are done. It then calls the process() function which
892	# does all the work specific to a plugin (like the old read functions
893	# used to do). Most plugins should define their own process() function
894	# and let this read() function keep control.
895	#
896	# recursive plugins (e.g. RecPlug) and specialized plugins like those
897	# capable of processing many documents within a single file (e.g.
898	# GMLPlug) might want to implement their own version of read(), but
899	# more likely need to implement their own version of read_into_doc_obj()
900	#
901	# Return number of files processed, undef if can't recognise, -1 if can't
902	# process
903
904	sub read {
905	my $self = shift (@_);
906	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
907
908	my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);
909
910	if ((defined $process_status) && ($process_status == 1)) {
911	# process the document
912	$processor->process($doc_obj);
913
914	if(defined($self->{'places_filename'})){
915	&util::rm($self->{'places_filename'});
916	$self->{'places_filename'} = undef;
917	}
918
919	$self->{'num_processed'} ++;
920	undef $doc_obj;
921	}
922
923	# if process_status == 1, then the file has been processed.
924	return $process_status;
925
926	}
927
928	# returns undef if file is rejected by the plugin
929	sub process {
930	my $self = shift (@_);
931	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
932
933	gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
934	# die "Basplug::process function must be implemented in sub-class\n";
935
936	return undef; # never gets here
937	}
938
939	# uses the multiread package to read in the entire file pointed to
940	# by filename and loads the resulting text into $$textref. Input text
941	# may be in any of the encodings handled by multiread, output text
942	# will be in utf8
943	sub read_file {
944	my $self = shift (@_);
945	my ($filename, $encoding, $language, $textref) = @_;
946
947	if (!-r $filename)
948	{
949	my $outhandle = $self->{'outhandle'};
950	gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
951	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
952	return;
953	}
954	$$textref = "";
955	if (!open (FILE, $filename)) {
956	gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
957	die "\n";
958	}
959
960	if ($encoding eq "ascii") {
961	undef $/;
962	$$textref = <FILE>;
963	$/ = "\n";
964	} else {
965	my $reader = new multiread();
966	$reader->set_handle ('BasPlug::FILE');
967	$reader->set_encoding ($encoding);
968	$reader->read_file ($textref);
969	#Now segments chinese if the separate_cjk option is set
970	if ($self->{'separate_cjk'}) {
971	# segment the Chinese words
972	$$textref = &cnseg::segment($$textref);
973	}
974	}
975	close FILE;
976	}
977
978	# write_file -- used by ConvertToPlug, for example in post processing
979	#
980	sub utf8_write_file {
981	my $self = shift (@_);
982	my ($textref, $filename) = @_;
983
984	if (!open (FILE, ">$filename")) {
985	gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
986	die "\n";
987	}
988	print FILE $$textref;
989
990	close FILE;
991	}
992
993
994	sub filename_based_title
995	{
996	my $self = shift (@_);
997	my ($file) = @_;
998
999	my $file_derived_title = $file;
1000	$file_derived_title =~ s/_/ /g;
1001	$file_derived_title =~ s/\..*?$//;
1002
1003	return $file_derived_title;
1004	}
1005
1006
1007	sub title_fallback
1008	{
1009	my $self = shift (@_);
1010	my ($doc_obj,$section,$file) = @_;
1011
1012	if (!defined $doc_obj->get_metadata_element ($section, "Title")) {
1013
1014	my $file_derived_title = $self->filename_based_title($file);
1015	$doc_obj->add_metadata ($section, "Title", $file_derived_title);
1016	}
1017	}
1018
1019	sub textcat_get_language_encoding {
1020	my $self = shift (@_);
1021	my ($filename) = @_;
1022
1023
1024	my ($language, $encoding, $extracted_encoding);
1025	if ($self->{'input_encoding'} eq "auto") {
1026	# use textcat to automatically work out the input encoding and language
1027	($language, $encoding) = $self->get_language_encoding ($filename);
1028	} elsif ($self->{'extract_language'}) {
1029	# use textcat to get language metadata
1030	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
1031	$encoding = $self->{'input_encoding'};
1032	# don't print this message for english... english in utf8 is identical
1033	# to english in iso-8859-1 (except for some punctuation). We don't have
1034	# a language model for en_utf8, so textcat always says iso-8859-1!
1035	if ($extracted_encoding ne $encoding && $language ne "en"
1036	&& $self->{'verbosity'}) {
1037	my $plugin_name = ref ($self);
1038	my $outhandle = $self->{'outhandle'};
1039	gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
1040	}
1041	} else {
1042	$language = $self->{'default_language'};
1043	$encoding = $self->{'input_encoding'};
1044	}
1045
1046	return ($language, $encoding);
1047	}
1048
1049	# Uses textcat to work out the encoding and language of the text in
1050	# $filename. All html tags are removed before processing.
1051	# returns an array containing "language" and "encoding"
1052	sub get_language_encoding {
1053	my $self = shift (@_);
1054	my ($filename) = @_;
1055	my $outhandle = $self->{'outhandle'};
1056	my $unicode_format = "";
1057	my $best_language = "";
1058	my $best_encoding = "";
1059
1060	# read in file
1061	if (!open (FILE, $filename)) {
1062	gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
1063	# this is a pretty bad error, but try to continue anyway
1064	return ($self->{'default_language'}, $self->{'input_encoding'});
1065	}
1066	undef $/;
1067	my $text = <FILE>;
1068	$/ = "\n";
1069	close FILE;
1070
1071	# check if first few bytes have a Byte Order Marker
1072	my $bom=substr($text,0,2); # check 16bit unicode
1073	if ($bom eq "\xff\xfe") { # little endian 16bit unicode
1074	$unicode_format="unicode";
1075	} elsif ($bom eq "\xfe\xff") { # big endian 16bit unicode
1076	$unicode_format="unicode";
1077	} else {
1078	$bom=substr($text,0,3); # check utf-8
1079	if ($bom eq "\xef\xbb\xbf") { # utf-8 coded FEFF bom
1080	$unicode_format="utf8";
1081	# } elsif ($bom eq "\xef\xbf\xbe") { # utf-8 coded FFFE bom. Error!?
1082	# $unicode_format="utf8";
1083	}
1084	}
1085
1086
1087	# handle html files specially
1088	# XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
1089	if (ref($self) eq 'HTMLPlug' \|\|
1090	(exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
1091
1092	# remove <title>stuff</title> -- as titles tend often to be in English
1093	# for foreign language documents
1094	$text =~ s!<title>.*?</title>!!si;
1095
1096	# see if this html file specifies its encoding
1097	if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
1098	$best_encoding = $1;
1099	} elsif ($text =~ /<meta http-equiv.content-type.charset=(.+?)"/i) {
1100	$best_encoding = $1;
1101	}
1102	if ($best_encoding) { # we extracted an encoding
1103	$best_encoding =~ s/-+/_/g;
1104	$best_encoding = lc($best_encoding); # lowercase
1105	if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
1106	$self->{'input_encoding'} = $best_encoding;
1107	}
1108
1109	# remove all HTML tags
1110	$text =~ s/<[^>]*>//sg;
1111	}
1112
1113	# get the language/encoding
1114	my $results = $self->{'textcat'}->classify(\$text);
1115
1116	# if textcat returns 3 or less possibilities we'll use the
1117	# first one in the list - otherwise use the defaults
1118	if (scalar @$results > 3) {
1119	if ($unicode_format) { # in case the first had a BOM
1120	$best_encoding=$unicode_format;
1121	} else {
1122	my %guessed_encodings = ();
1123	foreach my $result (@$results) {
1124	$result =~ /([^\-]+)$/;
1125	my $enc=$1;
1126	if (!defined($guessed_encodings{$enc})) {
1127	$guessed_encodings{$enc}=0;
1128	}
1129	$guessed_encodings{$enc}++;
1130	}
1131
1132	$guessed_encodings{""}=-1; # for default best_encoding of ""
1133	foreach my $enc (keys %guessed_encodings) {
1134	if ($guessed_encodings{$enc} >
1135	$guessed_encodings{$best_encoding}){
1136	$best_encoding=$enc;
1137	}
1138	}
1139	}
1140
1141	if ($self->{'input_encoding'} ne 'auto') {
1142	if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
1143	gsprintf($outhandle,
1144	"BasPlug: {BasPlug.could_not_extract_language}\n",
1145	$filename, $self->{'default_language'});
1146	}
1147	$best_language = $self->{'default_language'};
1148	$best_encoding = $self->{'input_encoding'};
1149
1150	} else {
1151	if ($self->{'verbosity'}>2) {
1152	gsprintf($outhandle,
1153	"BasPlug: {BasPlug.could_not_extract_language}\n",
1154	$filename, $self->{'default_language'});
1155	}
1156	$best_language = $self->{'default_language'};
1157	}
1158	} else { # <= 3 suggestions
1159	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
1160	if (!defined $language) {
1161	if ($self->{'verbosity'}>2) {
1162	gsprintf($outhandle,
1163	"BasPlug: {BasPlug.could_not_extract_language}\n",
1164	$filename, $self->{'default_language'});
1165	}
1166	$language = $self->{'default_language'};
1167	}
1168	if (!defined $encoding) {
1169	if ($self->{'verbosity'}>2) {
1170	gsprintf($outhandle,
1171	"BasPlug: {BasPlug.could_not_extract_encoding}\n",
1172	$filename, $self->{'default_encoding'});
1173	}
1174	$encoding = $self->{'default_encoding'};
1175	}
1176	$best_language = $language;
1177	if (! $best_encoding ) { # may already be set... eg from html meta tag
1178	$best_encoding = $encoding;
1179	}
1180	}
1181
1182	my $text_copy = $text;
1183	if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) {
1184	# the text is valid utf8, so assume that's the real encoding
1185	# (since textcat is based on probabilities)
1186	$best_encoding = 'utf8';
1187	}
1188
1189	# check for equivalents where textcat doesn't have some encodings...
1190	# eg MS versions of standard encodings
1191	if ($best_encoding =~ /^iso_8859_(\d+)/) {
1192	my $iso = $1; # which variant of the iso standard?
1193	# iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
1194	if ($text =~ /[\x80-\x9f]/) {
1195	# Western Europe
1196	if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
1197	elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
1198	elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
1199	elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
1200	elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
1201	elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
1202	elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
1203	}
1204	}
1205
1206	if ($best_encoding !~ /^(ascii\|utf8\|unicode)$/ &&
1207	!defined $encodings::encodings->{$best_encoding}) {
1208	if ($self->{'verbosity'}) {
1209	gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
1210	$filename, $best_encoding, $self->{'default_encoding'});
1211	}
1212	$best_encoding = $self->{'default_encoding'};
1213	}
1214
1215	return ($best_language, $best_encoding);
1216	}
1217
1218	# add any extra metadata that's been passed around from one
1219	# plugin to another.
1220	# extra_metadata uses add_utf8_metadata so it expects metadata values
1221	# to already be in utf8
1222	sub extra_metadata {
1223	my $self = shift (@_);
1224	my ($doc_obj, $cursection, $metadata) = @_;
1225
1226	my $associate_tail_re = $self->{'associate_tail_re'};
1227
1228	foreach my $field (keys(%$metadata)) {
1229	# $metadata->{$field} may be an array reference
1230	if ($field eq "gsdlassocfile_tobe") {
1231	# 'gsdlassocfile_tobe' is artificially introduced metadata
1232	# that is used to signal that certain additional files should
1233	# be tied to this document. Useful in situations where a
1234	# metadata pass in the plugin pipeline works out some files
1235	# need to be associated with a document, but the document hasn't
1236	# been formed yet.
1237
1238	my $equiv_form = "";
1239	foreach my $gaf (@{$metadata->{$field}}) {
1240	my ($full_filename,$mimetype) = ($gaf =~ m/^(.):(.):$/);
1241	my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
1242	my $filename = $full_filename;
1243
1244	$doc_obj->associate_file($full_filename,$tail_filename,$mimetype);
1245
1246	# work out extended tail extension (i.e. matching tail re)
1247
1248	my ($file_prefix,$file_extended_ext)
1249	= $self->root_ext_split($tail_filename,$associate_tail_re);
1250	my ($pre_doc_ext) = ($file_extended_ext =~ m/^(.)\..$/);
1251
1252	my ($doc_ext) = ($tail_filename =~ m/^.\.(.)$/);
1253	my $start_doclink = "<a href=\"_httpcollection_/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
1254	my $srcicon = "_icon".$doc_ext."_";
1255	my $end_doclink = "</a>";
1256
1257	my $assoc_form = "$start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
1258
1259	if (defined $pre_doc_ext) {
1260	# for metadata such as [mp3._edited] [mp3._full] ...
1261	$doc_obj->add_utf8_metadata ($cursection, "$doc_ext.$pre_doc_ext", $assoc_form);
1262	}
1263
1264	# for multiple metadata such as [mp3.assoclink]
1265	$doc_obj->add_utf8_metadata ($cursection, "$doc_ext.assoclink", $assoc_form);
1266
1267	$equiv_form .= " $assoc_form";
1268	}
1269	$doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
1270	}
1271	elsif (ref ($metadata->{$field}) eq "ARRAY") {
1272	map {
1273	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
1274	} @{$metadata->{$field}};
1275	} else {
1276	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
1277	}
1278	}
1279	}
1280
1281	# initialise metadata extractors
1282	sub initialise_extractors {
1283	my $self = shift (@_);
1284
1285	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
1286	&acronym::initialise_acronyms();
1287	}
1288	}
1289
1290	# finalise metadata extractors
1291	sub finalise_extractors {
1292	my $self = shift (@_);
1293
1294	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
1295	&acronym::finalise_acronyms();
1296	}
1297	}
1298
1299	# FIRSTNNN: extract the first NNN characters as metadata
1300	sub extract_first_NNNN_characters {
1301	my $self = shift (@_);
1302	my ($textref, $doc_obj, $thissection) = @_;
1303
1304	foreach my $size (split /,/, $self->{'first'}) {
1305	my $tmptext = $$textref;
1306	$tmptext =~ s/^\s+//;
1307	$tmptext =~ s/\s+$//;
1308	$tmptext =~ s/\s+/ /gs;
1309	$tmptext = substr ($tmptext, 0, $size);
1310	$tmptext =~ s/\s\S*$/…/;
1311	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
1312	}
1313	}
1314
1315	sub extract_email {
1316	my $self = shift (@_);
1317	my ($textref, $doc_obj, $thissection) = @_;
1318	my $outhandle = $self->{'outhandle'};
1319
1320	gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
1321	if ($self->{'verbosity'} > 2);
1322
1323	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
1324	@email = sort @email;
1325
1326	# if($self->{"new_extract_email"} == 0)
1327	# {
1328	# my @email2 = ();
1329	# foreach my $address (@email)
1330	# {
1331	# if (!(join(" ",@email2) =~ m/(^\| )$address( \|$)/ ))
1332	# {
1333	# push @email2, $address;
1334	# $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1335	# # print $outhandle " extracting $address\n"
1336	# &gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1337	# if ($self->{'verbosity'} > 3);
1338	# }
1339	# }
1340	# }
1341	# else
1342	# {
1343	my $hashExistMail = {};
1344	foreach my $address (@email) {
1345	if (!(defined $hashExistMail->{$address}))
1346	{
1347	$hashExistMail->{$address} = 1;
1348	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
1349	gsprintf($outhandle, " {BasPlug.extracting} $address\n")
1350	if ($self->{'verbosity'} > 3);
1351	}
1352	}
1353	gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
1354	if ($self->{'verbosity'} > 2);
1355	}
1356
1357	# extract metadata
1358	sub auto_extract_metadata {
1359
1360	my $self = shift (@_);
1361	my ($doc_obj) = @_;
1362
1363	if ($self->{'extract_email'}) {
1364	my $thissection = $doc_obj->get_top_section();
1365	while (defined $thissection) {
1366	my $text = $doc_obj->get_text($thissection);
1367	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
1368	$thissection = $doc_obj->get_next_section ($thissection);
1369	}
1370	}
1371	if ($self->{'extract_placenames'}) {
1372	my $thissection = $doc_obj->get_top_section();
1373	while (defined $thissection) {
1374	my $text = $doc_obj->get_text($thissection);
1375	$self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
1376	$thissection = $doc_obj->get_next_section ($thissection);
1377	}
1378	}
1379
1380	if ($self->{'extract_keyphrases'} \|\| $self->{'extract_keyphrases_kea4'}) {
1381	$self->extract_keyphrases($doc_obj);
1382	}
1383
1384	if ($self->{'first'}) {
1385	my $thissection = $doc_obj->get_top_section();
1386	while (defined $thissection) {
1387	my $text = $doc_obj->get_text($thissection);
1388	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
1389	$thissection = $doc_obj->get_next_section ($thissection);
1390	}
1391	}
1392
1393	if ($self->{'extract_acronyms'}) {
1394	my $thissection = $doc_obj->get_top_section();
1395	while (defined $thissection) {
1396	my $text = $doc_obj->get_text($thissection);
1397	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
1398	$thissection = $doc_obj->get_next_section ($thissection);
1399	}
1400	}
1401
1402	if ($self->{'markup_acronyms'}) {
1403	my $thissection = $doc_obj->get_top_section();
1404	while (defined $thissection) {
1405	my $text = $doc_obj->get_text($thissection);
1406	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
1407	$doc_obj->delete_text($thissection);
1408	$doc_obj->add_text($thissection, $text);
1409	$thissection = $doc_obj->get_next_section ($thissection);
1410	}
1411	}
1412
1413	if($self->{'extract_historical_years'}) {
1414	my $thissection = $doc_obj->get_top_section();
1415	while (defined $thissection) {
1416
1417	my $text = $doc_obj->get_text($thissection);
1418	&DateExtract::get_date_metadata($text, $doc_obj,
1419	$thissection,
1420	$self->{'no_bibliography'},
1421	$self->{'maximum_year'},
1422	$self->{'maximum_century'});
1423	$thissection = $doc_obj->get_next_section ($thissection);
1424	}
1425	}
1426	}
1427
1428
1429	#adding kea keyphrases
1430	sub extract_keyphrases
1431	{
1432	my $self = shift(@_);
1433	my $doc_obj = shift(@_);
1434
1435	# Use Kea 3.0 unless 4.0 has been specified
1436	my $kea_version = "3.0";
1437	if ($self->{'extract_keyphrases_kea4'}) {
1438	$kea_version = "4.0";
1439	}
1440
1441	# Check that Kea exists, and tell the user where to get it if not
1442	my $keahome = &Kea::get_Kea_directory($kea_version);
1443	if (!-e $keahome) {
1444	gsprintf(STDERR, "{BasPlug.missing_kea}\n", $keahome, $kea_version);
1445	return;
1446	}
1447
1448	my $thissection = $doc_obj->get_top_section();
1449	my $text = "";
1450	my $list;
1451
1452	#loop through sections to gather whole doc
1453	while (defined $thissection) {
1454	my $sectiontext = $doc_obj->get_text($thissection);
1455	$text = $text.$sectiontext;
1456	$thissection = $doc_obj->get_next_section ($thissection);
1457	}
1458
1459	if($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
1460	$list = &Kea::extract_KeyPhrases ($kea_version, $text, $self->{'extract_keyphrase_options'});
1461	} else { #otherwise call Kea with no options
1462	$list = &Kea::extract_KeyPhrases ($kea_version, $text);
1463	}
1464
1465	if ($list){
1466	# if a list of kea keyphrases was returned (ie not empty)
1467	if ($self->{'verbosity'}) {
1468	gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
1469	}
1470
1471	#add metadata to top section
1472	$thissection = $doc_obj->get_top_section();
1473
1474	# add all key phrases as one metadata
1475	$doc_obj->add_metadata($thissection, "Keyphrases", $list);
1476
1477	# add individual key phrases as multiple metadata
1478	foreach my $keyphrase (split(',', $list)) {
1479	$keyphrase =~ s/^\s+\|\s+$//g;
1480	$doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
1481	}
1482	}
1483	}
1484
1485
1486	# extract acronyms from a section in a document. progress is
1487	# reported to outhandle based on the verbosity. both the Acronym
1488	# and the AcronymKWIC metadata items are created.
1489
1490	sub extract_acronyms {
1491	my $self = shift (@_);
1492	my ($textref, $doc_obj, $thissection) = @_;
1493	my $outhandle = $self->{'outhandle'};
1494
1495	# print $outhandle " extracting acronyms ...\n"
1496	gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
1497	if ($self->{'verbosity'} > 2);
1498
1499	my $acro_array = &acronym::acronyms($textref);
1500
1501	foreach my $acro (@$acro_array) {
1502
1503	#check that this is the first time ...
1504	my $seen_before = "false";
1505	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
1506	foreach my $thisAcro (@$previous_data) {
1507	if ($thisAcro eq $acro->to_string()) {
1508	$seen_before = "true";
1509	if ($self->{'verbosity'} >= 4) {
1510	gsprintf($outhandle, " {BasPlug.already_seen} " .
1511	$acro->to_string() . "\n");
1512	}
1513	}
1514	}
1515
1516	if ($seen_before eq "false") {
1517	#write it to the file ...
1518	$acro->write_to_file();
1519
1520	#do the normal acronym
1521	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
1522	gsprintf($outhandle, " {BasPlug.adding} ".$acro->to_string()."\n")
1523	if ($self->{'verbosity'} > 3);
1524	}
1525	}
1526
1527	gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
1528	if ($self->{'verbosity'} > 2);
1529	}
1530
1531	sub markup_acronyms {
1532	my $self = shift (@_);
1533	my ($text, $doc_obj, $thissection) = @_;
1534	my $outhandle = $self->{'outhandle'};
1535
1536	gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
1537	if ($self->{'verbosity'} > 2);
1538
1539	#self is passed in to check for verbosity ...
1540	$text = &acronym::markup_acronyms($text, $self);
1541
1542	gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
1543	if ($self->{'verbosity'} > 2);
1544
1545	return $text;
1546	}
1547
1548	sub compile_stats {
1549	my $self = shift(@_);
1550	my ($stats) = @_;
1551
1552	$stats->{'num_processed'} += $self->{'num_processed'};
1553	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
1554	$stats->{'num_archives'} += $self->{'num_archives'};
1555
1556	}
1557
1558	sub associate_cover_image {
1559	my $self = shift;
1560	my ($doc_obj, $filename) = @_;
1561
1562	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1563	if (exists $self->{'covers_missing_cache'}->{$filename}) {
1564	# don't stat() for existence eg for multiple document input files
1565	# (eg SplitPlug)
1566	return;
1567	}
1568
1569	my $top_section=$doc_obj->get_top_section();
1570
1571	if (-e $filename) {
1572	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1573	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1574	} else {
1575	my $upper_filename = $filename;
1576	$upper_filename =~ s/jpg$/JPG/;
1577	if (-e $upper_filename) {
1578	$doc_obj->associate_file($upper_filename, "cover.jpg",
1579	"image/jpeg");
1580	$doc_obj->add_utf8_metadata($top_section, "hascover", 1);
1581	} else {
1582	# file doesn't exist, so record the fact that it's missing so
1583	# we don't stat() again (stat is slow)
1584	$self->{'covers_missing_cache'}->{$filename} = 1;
1585	}
1586	}
1587
1588	}
1589
1590
1591	# Overridden by exploding plugins (eg. ISISPlug)
1592	sub clean_up_after_exploding
1593	{
1594	my $self = shift(@_);
1595	}
1596
1597
1598	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 11467

Download in other formats: