Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 10606

Last change on this file since 10606 was 10579, checked in by kjdon, 19 years ago
copied classify.pm and BasClas.pm, added -gsdlinfo flag - if this is present (it gets set by pluginfo.pl) then don't bother parsing the args - just set up the arg data structures
Property svn:keywords set to `Author Date Id Revision`
File size: 42.5 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2005 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	BEGIN {
29	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
30	}
31
32	eval {require bytes};
33
34	# suppress the annoying "subroutine redefined" warning that various
35	# plugins cause under perl 5.6
36	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
37
38	use strict;
39	no strict 'subs';
40	no strict 'refs'; # allow filehandles to be variables and viceversa
41
42	use File::Basename;
43
44	use Kea;
45	use multiread;
46	use encodings;
47	use cnseg;
48	use acronym;
49	use textcat;
50	use doc;
51	eval "require diagnostics"; # some perl distros (eg mac) don't have this
52	use DateExtract;
53	use ghtml;
54	use gsprintf 'gsprintf';
55	use printusage;
56	use parse2;
57
58
59	use GISBasPlug;
60
61	@BasPlug::ISA = ( GISBasPlug );
62
# Encodings offered for the -default_encoding option.  BasPlug::new()
# appends one entry per key of $encodings::encodings to this list.
my $unicode_list = [
    { 'name' => "ascii",   'desc' => "{BasPlug.input_encoding.ascii}" },
    { 'name' => "utf8",    'desc' => "{BasPlug.input_encoding.utf8}" },
    { 'name' => "unicode", 'desc' => "{BasPlug.input_encoding.unicode}" },
];

# Encodings offered for the -input_encoding option: "auto" comes first and
# BasPlug::new() appends the contents of $unicode_list after it.
my $auto_unicode_list = [
    { 'name' => "auto", 'desc' => "{BasPlug.input_encoding.auto}" },
];
74
# Description of the general options understood by every plugin.  Each
# entry is a hash with 'name', 'desc' (a resource-bundle key), 'type' and
# 'reqd', plus optional 'deft', 'list', 'char_length' and 'hiddengli'.
# NOTE: extract_keyphrases, extract_keyphrase_options and smart_block each
# appear twice below (the second time flagged 'hiddengli'); the table is
# reproduced exactly as it was.
my $arguments = [
    # which files to process / block
    { 'name' => "process_exp", 'desc' => "{BasPlug.process_exp}", 'type' => "regexp", 'deft' => "", 'reqd' => "no" },
    { 'name' => "block_exp", 'desc' => "{BasPlug.block_exp}", 'type' => "regexp", 'deft' => "", 'reqd' => "no" },
    { 'name' => "smart_block", 'desc' => "{BasPlug.smart_block}", 'type' => "flag", 'reqd' => "no" },
    { 'name' => "associate_ext", 'desc' => "{BasPlug.associate_ext}", 'type' => "string", 'reqd' => "no" },

    # encoding and language
    { 'name' => "input_encoding", 'desc' => "{BasPlug.input_encoding}", 'type' => "enum",
      'list' => $auto_unicode_list, 'reqd' => "no", 'deft' => "auto" },
    { 'name' => "default_encoding", 'desc' => "{BasPlug.default_encoding}", 'type' => "enum",
      'list' => $unicode_list, 'reqd' => "no", 'deft' => "utf8" },
    { 'name' => "extract_language", 'desc' => "{BasPlug.extract_language}", 'type' => "flag", 'reqd' => "no" },
    { 'name' => "default_language", 'desc' => "{BasPlug.default_language}", 'type' => "string", 'deft' => "en", 'reqd' => "no" },

    # automatic metadata extraction
    { 'name' => "extract_acronyms", 'desc' => "{BasPlug.extract_acronyms}", 'type' => "flag", 'reqd' => "no" },
    { 'name' => "markup_acronyms", 'desc' => "{BasPlug.markup_acronyms}", 'type' => "flag", 'reqd' => "no" },
    { 'name' => "extract_keyphrases", 'desc' => "{BasPlug.extract_keyphrases}", 'type' => "flag", 'reqd' => "no" },
    { 'name' => "extract_keyphrase_options", 'desc' => "{BasPlug.extract_keyphrase_options}", 'type' => "string", 'deft' => "", 'reqd' => "no" },
    { 'name' => "first", 'desc' => "{BasPlug.first}", 'type' => "string", 'reqd' => "no" },
    { 'name' => "extract_email", 'desc' => "{BasPlug.extract_email}", 'type' => "flag", 'reqd' => "no" },
    { 'name' => "extract_historical_years", 'desc' => "{BasPlug.extract_historical_years}", 'type' => "flag", 'reqd' => "no" },
    # the default is the current calendar year at module load time
    { 'name' => "maximum_year", 'desc' => "{BasPlug.maximum_year}", 'type' => "int",
      'deft' => (localtime)[5]+1900, 'char_length' => "4",
      #'range' => "2,100",
      'reqd' => "no" },
    { 'name' => "maximum_century", 'desc' => "{BasPlug.maximum_century}", 'type' => "string", 'deft' => "-1", 'reqd' => "no" },
    { 'name' => "no_bibliography", 'desc' => "{BasPlug.no_bibliography}", 'type' => "flag", 'reqd' => "no" },
    { 'name' => "no_cover_image", 'desc' => "{BasPlug.no_cover_image}", 'type' => "flag", 'reqd' => "no" },

    # entries flagged 'hiddengli'
    { 'name' => "extract_keyphrases", 'desc' => "{BasPlug.extract_keyphrases}", 'type' => "flag", 'reqd' => "no", 'hiddengli' => "yes" },
    { 'name' => "extract_keyphrase_options", 'desc' => "{BasPlug.extract_keyphrase_options}", 'type' => "string", 'reqd' => "no", 'hiddengli' => "yes" },
    { 'name' => "separate_cjk", 'desc' => "{BasPlug.separate_cjk}", 'type' => "flag", 'reqd' => "no", 'hiddengli' => "yes" },
    { 'name' => "smart_block", 'desc' => "{BasPlug.smart_block}", 'type' => "flag", 'reqd' => "no", 'hiddengli' => "yes" },
    { 'name' => "new_extract_email", 'desc' => "", 'type' => "flag", 'reqd' => "no", 'hiddengli' => "yes" },
];
189
# Extra options that BasPlug::new() adds to $arguments when
# GISBasPlug::has_mapdata() reports true.
my $gis_arguments = [
    { 'name' => "extract_placenames", 'desc' => "{GISBasPlug.extract_placenames}", 'type' => "flag", 'reqd' => "no" },
    { 'name' => "gazetteer", 'desc' => "{GISBasPlug.gazetteer}", 'type' => "string", 'reqd' => "no" },
    { 'name' => "place_list", 'desc' => "{GISBasPlug.place_list}", 'type' => "flag", 'reqd' => "no" },
];
203
204
# Plugin-level description of BasPlug itself; new() pushes this onto the
# caller's "OptList".  'args' refers to the $arguments table.
my $options = {
    'name'     => "BasPlug",
    'desc'     => "{BasPlug.desc}",
    'abstract' => "yes",
    'inherits' => "no",
    'args'     => $arguments,
};
210
211
# Store the caller's "keepold" setting on the plugin object.
#   $keepold - value saved verbatim under $self->{'keepold'}
# Returns the stored value.
sub set_keepold {
    my ($plugin, $keepold) = @_;

    return $plugin->{'keepold'} = $keepold;
}
218
# Return the 'args' list of the LAST entry in $self->{'option_list'}.
#
# The option list is only inspected, never modified.  (Previously the last
# entry was popped off the shared list and not put back, so every call
# shortened the list and repeated calls returned different results.)
#
# Returns undef when there is no option list or it is empty.
sub get_arguments
{
    my ($self) = @_;

    my $optionlistref = $self->{'option_list'};
    return undef unless (ref($optionlistref) eq 'ARRAY' && @$optionlistref);

    my $pluginoptions = $optionlistref->[-1];
    return $pluginoptions->{'args'};
}
228
229
# Emit the XML description of this plugin: switch gsprintf to UTF-8
# output, write the XML header, then delegate to print_xml().
sub print_xml_usage
{
    my ($plugin) = @_;

    # XML output is always in UTF-8
    gsprintf::output_strings_in_UTF8;

    PrintUsage::print_xml_header();
    $plugin->print_xml();
}
240
241
# Print a <PlugInfo> element (to STDERR, via gsprintf) for the first entry
# of $self->{'option_list'}, then recurse so that the remaining entries are
# printed nested inside this element's <Arguments>.
#
# The recursion works by shifting entries off the option list.  As in
# determine_description_offset() and print_plugin_usage(), the list seen by
# $self is put back afterwards so the object can still be used once the
# XML has been printed (previously the list was left consumed).
sub print_xml
{
    my ($self) = @_;

    my $optionlistref = $self->{'option_list'};
    my @optionlist = @$optionlistref;          # snapshot for the restore below
    my $pluginoptions = shift(@$optionlistref);
    return if (!defined($pluginoptions));

    gsprintf(STDERR, "<PlugInfo>\n");
    gsprintf(STDERR, " <Name>$pluginoptions->{'name'}</Name>\n");
    my $desc = gsprintf::lookup_string($pluginoptions->{'desc'});
    $desc =~ s/</&lt;/g; # doubly escaped
    $desc =~ s/>/&gt;/g;

    gsprintf(STDERR, " <Desc>$desc</Desc>\n");
    gsprintf(STDERR, " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n");
    gsprintf(STDERR, " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n");
    gsprintf(STDERR, " <Explodes>" . ($pluginoptions->{'explodes'} || "no") . "</Explodes>\n");
    gsprintf(STDERR, " <Arguments>\n");
    if (defined($pluginoptions->{'args'})) {
        &PrintUsage::print_options_xml($pluginoptions->{'args'});
    }

    # Recurse up the plugin hierarchy
    $self->print_xml();

    gsprintf(STDERR, " </Arguments>\n");
    gsprintf(STDERR, "</PlugInfo>\n");

    # Undo the shift(s) performed above and by the recursive calls.
    $self->{'option_list'} = \@optionlist;
}
272
273
# Print the plain-text usage message for a plugin (recursively): first
# work out the column at which descriptions start, then print the usage
# with this plugin treated as the leaf class.
sub print_txt_usage
{
    my ($plugin) = @_;

    my $column = $plugin->determine_description_offset(0);
    $plugin->print_plugin_usage($column, 1);
}
281
282
# Work out the length of the longest option string over every entry of
# $self->{'option_list'}.
#   $maxoffset - the largest length found so far (callers start with 0)
# Returns the updated maximum.  Entries are shifted off the option list
# while recursing; $self->{'option_list'} is then replaced by a copy of
# the list as it was on entry.
sub determine_description_offset
{
    my ($self, $maxoffset) = @_;

    my $optionlistref = $self->{'option_list'};
    my @snapshot = @$optionlistref;
    my $pluginoptions = shift(@$optionlistref);
    return $maxoffset unless defined($pluginoptions);

    # Find the length of the longest option string of this plugin
    my $pluginargs = $pluginoptions->{'args'};
    if (defined($pluginargs)) {
        my $longest = &PrintUsage::find_longest_option_string($pluginargs);
        $maxoffset = $longest if ($longest > $maxoffset);
    }

    # Recurse up the plugin hierarchy
    $maxoffset = $self->determine_description_offset($maxoffset);
    $self->{'option_list'} = \@snapshot;
    return $maxoffset;
}
307
308
# Print (to STDERR, via gsprintf) the text usage of the first entry of
# $self->{'option_list'} and then, recursively, of the remaining entries.
#   $descoffset  - width of the longest option string
#   $isleafclass - true for the plugin the usage was requested for; only
#                  then are the description and the usage line printed
# Entries are shifted off the option list while recursing; afterwards
# $self->{'option_list'} is replaced by a copy of the list as it was on
# entry.
sub print_plugin_usage
{
    my ($self, $descoffset, $isleafclass) = @_;

    my $optionlistref = $self->{'option_list'};
    my @snapshot = @$optionlistref;
    my $pluginoptions = shift(@$optionlistref);
    return unless defined($pluginoptions);

    my ($pluginname, $pluginargs, $plugindesc) = @{$pluginoptions}{'name', 'args', 'desc'};

    # Produce the usage information using the data structure above
    if ($isleafclass) {
        gsprintf(STDERR, "$plugindesc\n\n") if defined($plugindesc);
        gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
    }

    # Display the plugin options, if there are some
    if (defined($pluginargs)) {
        if ($isleafclass) {
            gsprintf(STDERR, " {common.specific_options}:\n");
        }
        else {
            gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
        }

        # Descriptions start 2 spaces after the longest option string
        &PrintUsage::print_options_txt($pluginargs, $descoffset + 2);
    }

    # Recurse up the plugin hierarchy
    $self->print_plugin_usage($descoffset, 0);
    $self->{'option_list'} = \@snapshot;
}
352
353
# Constructor shared by all plugins.
#   $pluginlist      - array ref of plugin class names; $class is appended
#   $args            - array ref of plugin arguments to parse
#   $hashArgOptLists - hash ref whose "ArgList" / "OptList" array refs
#                      collect the argument and option descriptions
# Returns the blessed plugin object.  If "-gsdlinfo" is among the
# arguments, parsing is skipped and an object with info_only set is
# returned.  If parsing fails, the usage is printed and the call dies.
#
# The module-level tables ($unicode_list, $auto_unicode_list, $arguments)
# are completed here.  Every addition is guarded by name so constructing
# several plugins does not append the same entries again, and the GIS
# arguments are added BEFORE $arguments is copied into "ArgList" so they
# can already be parsed by the first plugin that is constructed.
sub new {
    # Offer every encoding known to the encodings module, sorted by its
    # display name.
    my $known_encodings = $encodings::encodings;
    my %in_unicode_list = map { ($_->{'name'} => 1) } @{$unicode_list};
    foreach my $enc (sort {$known_encodings->{$a}->{'name'} cmp $known_encodings->{$b}->{'name'}}
                     keys (%$known_encodings))
    {
        next if ($in_unicode_list{$enc}++);
        push(@{$unicode_list}, {'name' => $enc,
                                'desc' => $known_encodings->{$enc}->{'name'}});
    }

    my %in_auto_list = map { ($_->{'name'} => 1) } @{$auto_unicode_list};
    push(@{$auto_unicode_list},
         grep { !$in_auto_list{$_->{'name'}}++ } @{$unicode_list});

    if (GISBasPlug::has_mapdata()) {
        my %known_argument = map { ($_->{'name'} => 1) } @$arguments;
        push(@$arguments, grep { !$known_argument{$_->{'name'}} } @$gis_arguments);
    }

    # Start the BasPlug Constructor
    my $class = shift (@_);
    my ($pluginlist,$args,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);
    my $plugin_name = (defined $pluginlist->[0]) ? $pluginlist->[0] : $class;

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    my $self = {};
    $self->{'outhandle'} = STDERR;
    $self->{'option_list'} = $hashArgOptLists->{"OptList"};
    $self->{"info_only"} = 0;

    # Check if gsdlinfo is in the argument list or not - if it is, don't parse
    # the args, just return the object.
    foreach my $strArg (@{$args})
    {
        if ($strArg eq "-gsdlinfo")
        {
            $self->{"info_only"} = 1;
            return bless $self, $class;
        }
    }

    if (!parse2::parse($args,$hashArgOptLists->{"ArgList"},$self))
    {
        my $classTempClass = bless $self, $class;
        &gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
        $classTempClass->print_txt_usage(""); # Use default resource bundle
        die "\n";
    }

    # Parsing was successful.
    delete $self->{"info_only"};

    $self->{'plugin_type'} = $plugin_name;
    $self->{'textcat'} = textcat->new();
    $self->{'num_processed'} = 0;
    $self->{'num_not_processed'} = 0;
    $self->{'num_blocked'} = 0;
    $self->{'num_archives'} = 0;
    # cover image is on by default
    $self->{'cover_image'} = ($self->{'no_cover_image'}) ? 0 : 1;

    # Turn the comma separated -associate_ext value into a lookup table.
    my $associate_ext = $self->{'associate_ext'};
    if ((defined $associate_ext) && ($associate_ext ne "")) {
        my %associate_ext_lookup = ();
        foreach my $ext (split(/,/,$associate_ext)) {
            $associate_ext_lookup{$ext} = 1;
        }
        $self->{'associate_ext_lookup'} = \%associate_ext_lookup;
    }

    $self->{'shared_fileroot'} = {};
    $self->{'file_blocks'} = {};

    if ($self->{'extract_placenames'}) {
        my $outhandle = $self->{'outhandle'};

        my $places_ref
            = GISBasPlug::loadGISDatabase($outhandle,$self->{'gazetteer'});

        if (!defined $places_ref) {
            print $outhandle "Warning: Error loading mapdata gazetteer \"$self->{'gazetteer'}\"\n";
            print $outhandle " No placename extraction will take place.\n";
            $self->{'extract_placenames'} = undef;
        }
        else {
            $self->{'places'} = $places_ref;
        }
    }

    return bless $self, $class;
}
456
# initialize BasPlug options
# if init() is overridden in a sub-class, remember to call BasPlug::init()
#   $verbosity  - passed through from the processor
#   $outhandle  - replaces the current outhandle only when defined
#   $failhandle - stored as given
# process_exp and block_exp fall back to the plugin's defaults unless they
# were explicitly set (defined and non-empty).
sub init {
    my ($self, $verbosity, $outhandle, $failhandle) = @_;

    $self->{'verbosity'} = $verbosity;
    if (defined $outhandle) {
        $self->{'outhandle'} = $outhandle;
    }
    $self->{'failhandle'} = $failhandle;

    # Only non-recursive plugins get a default process_exp.
    if ((!$self->is_recursive()) &&
        ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq ""))) {

        $self->{'process_exp'} = $self->get_default_process_exp ();
        if ($self->{'process_exp'} eq "") {
            warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
        }
    }

    if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) {
        $self->{'block_exp'} = $self->get_default_block_exp ();
    }
}
486
# Start-of-pass hook: sets up the extractors.  The arguments
# ($pluginfo, $base_dir, $processor, $maxdocs) are accepted but unused.
sub begin {
    my ($plugin, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    return $plugin->initialise_extractors();
}
492
# potentially called at the end of each plugin pass
# import.pl only has one plugin pass, but buildcol.pl has multiple ones
sub end {
    my $plugin = $_[0];

    return $plugin->finalise_extractors();
}
500
# called only once, after all plugin passes have been done
# The base class has nothing to clean up.
sub deinit {
    my ($plugin) = @_;
}
506
# Tells whether this plugin is recursive.  The base class answers 0;
# this function should be overridden to return 1 in recursive plugins.
sub is_recursive {
    my ($plugin) = @_;
    return 0;
}
514
# Default block_exp used by init() when none was set explicitly: the
# empty string.
sub get_default_block_exp {
    my ($plugin) = @_;
    return "";
}
520
# Default process_exp used by init() when none was set explicitly: the
# empty string.
sub get_default_process_exp {
    my ($plugin) = @_;
    return "";
}
526
# Hook called by metadata_read() with the file being processed.
# default implementation is to do nothing.
sub store_block_files
{
    my ($plugin, $filename) = @_;
    return;
}
534
# Default implementation: when cover images are on, block the file that
# has the same name as $filename but extension "jpg" (or "JPG" if the
# lower-case one does not exist), by recording it in
# $self->{'file_blocks'}.
#
# A document is never made to block itself: nothing is recorded when
# $filename has no extension to swap, or when the computed cover path is
# $filename itself (i.e. the document is the .jpg/.JPG file).
sub block_cover_image
{
    my ($self, $filename) = @_;

    return unless ($self->{'cover_image'});

    my $coverfile = $filename;
    # No extension means there is no sibling image name to derive.
    return unless ($coverfile =~ s/\.[^\\\/\.]+$/\.jpg/);

    if (!-e $coverfile) {
        $coverfile =~ s/jpg$/JPG/;
    }
    if (($coverfile ne $filename) && (-e $coverfile)) {
        $self->{'file_blocks'}->{$coverfile} = 1;
    }

    return;
}
553
# Metadata pass over one file.
#   $base_dir, $file - location of the file; $base_dir may be ""
#   (the remaining arguments are accepted but not used here)
# Returns 1 when this plugin will process the file, undef when it does not
# recognise it (no process_exp, no match, or not a plain file).
#
# When -associate_ext is set, the file is first recorded in
# $self->{'shared_fileroot'}, keyed by its name without the final
# extension: the file matching process_exp becomes the 'tie_to' document
# and files whose extension is in associate_ext_lookup are collected under
# 'exts'.
sub metadata_read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    my $process_exp = $self->{'process_exp'};
    # An empty pattern would make Perl reuse the last successful regex, so
    # an unset process_exp is treated as "matches nothing".
    my $have_process_exp = (defined $process_exp) && ($process_exp ne "");

    # Keep track of filenames with same root but different extensions
    # Used to support -associate_ext
    my $associate_ext = $self->{'associate_ext'};
    if ((defined $associate_ext) && ($associate_ext ne "")) {

        # prefix = everything before the last dot, ext = everything after
        my ($file_prefix,$file_ext) = ($file =~ m/^(.*)\.(.*?)$/);
        if ((defined $file_prefix) && (defined $file_ext)) {

            my $shared_fileroot = $self->{'shared_fileroot'};
            if (!defined $shared_fileroot->{$file_prefix}) {
                $shared_fileroot->{$file_prefix} = { 'tie_to' => undef, 'exts' => {} };
            }
            my $file_prefix_rec = $shared_fileroot->{$file_prefix};

            if ($have_process_exp && ($file =~ m/$process_exp/)) {
                # This is the document the others should be tied to
                $file_prefix_rec->{'tie_to'} = $file_ext;
            }
            elsif (defined $self->{'associate_ext_lookup'}->{$file_ext}) {
                $file_prefix_rec->{'exts'}->{$file_ext} = 1;
            }
        }
    }

    # now check whether we are actually processing this
    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
    if ($self->{'process_exp'} eq "" || $filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef; # can't recognise
    }

    # do smart blocking if appropriate
    if (defined $self->{'smart_block'}) {
        $self->block_cover_image($filename);
        $self->store_block_files($filename);
    }

    return 1;
}
603
# Returns 1 when $file_ext is the 'tie_to' extension recorded in
# $file_prefix_rec, and 0 otherwise (including when the record or its
# 'tie_to' entry is undefined).
sub tie_to_filename
{
    my ($self, $file_ext, $file_prefix_rec) = @_;

    return 0 unless (defined $file_prefix_rec);

    my $primary_ext = $file_prefix_rec->{'tie_to'};
    return ((defined $primary_ext) && ($primary_ext eq $file_ext)) ? 1 : 0;
}
622
# Returns 1 when $file_prefix_rec has a 'tie_to' document and $file_ext is
# flagged in its 'exts' table; returns 0 otherwise.
sub tie_to_assoc_file
{
    my ($self, $file_ext, $file_prefix_rec) = @_;

    return 0 unless (defined $file_prefix_rec);
    return 0 unless (defined $file_prefix_rec->{'tie_to'});

    my $collected_exts = $file_prefix_rec->{'exts'};
    return ($collected_exts->{$file_ext}) ? 1 : 0;
}
644
645
# Support for -associate_ext during the read pass.
#   $file     - file name as given to the plugin (used as lookup key)
#   $filename - full path of the same file
#   $metadata - hash ref; may receive 'gsdlassocfile_tobe' entries
# Returns 1 when $file is one of the associated files of a 'tie_to'
# document (the caller then blocks it), and 0 otherwise.  When $file is the
# 'tie_to' document itself, one "<path>:<mime type>:" entry per collected
# extension is pushed onto $metadata->{'gsdlassocfile_tobe'}.
#
# The name is split at its LAST dot, matching the keys that
# metadata_read() stores in $self->{'shared_fileroot'}.
sub associate_with
{
    my $self = shift (@_);
    my ($file, $filename, $metadata) = @_;

    return 0 if (!$self->{'associate_ext'});

    # If file, see if matches with "tie_to" doc or is one of the
    # associated filename extensions.
    my ($file_prefix,$file_ext) = ($file =~ m/^(.*)\.(.*?)$/);
    return 0 unless ((defined $file_prefix) && (defined $file_ext));

    my $file_prefix_rec = $self->{'shared_fileroot'}->{$file_prefix};

    if ($self->tie_to_filename($file_ext,$file_prefix_rec)) {

        # Set up gsdlassocfile_tobe
        my $exts = $file_prefix_rec->{'exts'};

        if (!defined $metadata->{'gsdlassocfile_tobe'}) {
            $metadata->{'gsdlassocfile_tobe'} = [];
        }
        my $assoc_tobe = $metadata->{'gsdlassocfile_tobe'};

        my ($full_prefix) = ($filename =~ m/^(.*)\..*?$/);
        foreach my $ext (keys %$exts) {
            my $assoc_file = "$full_prefix.$ext";
            my $mime_type = ""; # let system auto detect this
            push(@$assoc_tobe,"$assoc_file:$mime_type:");
        }
    }
    elsif ($self->tie_to_assoc_file($file_ext,$file_prefix_rec)) {
        # a form of smart block
        return 1;
    }

    return 0;
}
692
693
# Decide whether this plugin should read $file.
# Returns a two element list:
#   (0, undef)         - the file is blocked (num_blocked is incremented)
#   (undef, undef)     - the file is not recognised by this plugin
#   (1, $filename)     - the file should be processed; $filename is $file
#                        joined onto $base_dir when $base_dir is non-blank
sub read_block {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $filename = $file;
    if ($base_dir =~ /\w/) {
        $filename = &util::filename_cat ($base_dir, $file);
    }

    my $blocked = 0;
    if ($self->associate_with($file,$filename,$metadata)) {
        # a form of smart block
        $blocked = 1;
    }
    elsif ($self->{'smart_block'} || $self->{'smart_block_BN'}) {
        # with smart blocking only the recorded file_blocks are consulted
        my $flag = $self->{'file_blocks'}->{$filename};
        $blocked = 1 if (defined $flag && $flag == 1);
    }
    elsif ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $blocked = 1;
    }

    if ($blocked) {
        $self->{'num_blocked'} ++;
        return (0,undef); # blocked
    }

    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return (undef,undef); # can't recognise
    }

    return (1,$filename);
}
728
# Returns $file with any leading "/" or "\" characters removed
# ($file often begins with / so we'll tidy it up).
sub read_tidy_file {
    my ($plugin, $tidied) = @_;

    $tidied =~ s/^[\/\\]+//;

    return $tidied;
}
739
740
741
742	# The BasPlug read_into_doc_obj() function. This function does all the
743	# right things to make general options work for a given plugin. It reads in
744	# a file and sets up a slew of metadata all saved in doc_obj, which
745	# it then returns as part of a tuple (process_status,doc_obj)
746	#
747	# Much of this functionality used to reside in read, but it was broken
748	# down into a supporting routine to make the code more flexible.
749	#
750	# recursive plugins (e.g. RecPlug) and specialized plugins like those
751	# capable of processing many documents within a single file (e.g.
752	# GMLPlug) will normally want to implement their own version of
753	# read_into_doc_obj()
754	#
755	# Note that $base_dir might be "" and that $file might
756	# include directories
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $processor,
#             $maxdocs, $total_count, $gli)
# Returns:
#   the status from read_block() alone when the file is blocked (0) or not
#       recognised (undef)
#   (0, undef)       - the file contained no text
#   (-1, undef)      - process() rejected the file
#   (1, $doc_obj)    - success
# Dies when called on a recursive plugin: those must provide their own
# implementation.  The die no longer depends on gsprintf's return value.
sub read_into_doc_obj {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    if ($self->is_recursive()) {
        gsprintf(STDERR, "{BasPlug.read_must_be_implemented}");
        die "\n";
    }

    my $outhandle = $self->{'outhandle'};

    my ($block_status,$filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status==0));
    $file = $self->read_tidy_file($file);

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    # create a new document
    my $doc_obj = doc->new($filename, "indexed_doc");
    $doc_obj->set_OIDtype ($processor->{'OIDtype'});
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));

    # "Source" is the file name without any directory part
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    # how do we know what encoding the filename is in?
    $doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
    if ($self->{'cover_image'}) {
        $self->associate_cover_image($doc_obj, $filename);
    }

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, $encoding, $language, \$text);

    if (!length ($text)) {
        my $plugin_name = ref ($self);
        if ($gli) {
            print STDERR "<ProcessingError n='$file' r='File contains no text'>\n";
        }
        gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};

        my $failhandle = $self->{'failhandle'};
        gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
        $self->{'num_not_processed'} ++;

        return (0,undef); # what should we return here?? error but don't want to pass it on
    }

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $processed = $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);

    # the text is no longer needed, whether or not processing worked
    $text = '';
    undef $text;

    unless (defined ($processed)) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return (-1,undef);
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    # see if there is a plugin-specific set_OID function...
    if (defined ($self->can('set_OID'))) {
        # it will need $doc_obj to set the Identifier metadata...
        $self->set_OID($doc_obj);
    } else {
        # use the default set_OID() in doc.pm
        $doc_obj->set_OID();
    }

    return (1,$doc_obj);
}
838
839
840	# The BasPlug read() function. This function calls read_into_doc_obj()
841	# to ensure all the right things to make general options work for a
842	# given plugin are done. It then calls the process() function which
843	# does all the work specific to a plugin (like the old read functions
844	# used to do). Most plugins should define their own process() function
845	# and let this read() function keep control.
846	#
847	# recursive plugins (e.g. RecPlug) and specialized plugins like those
848	# capable of processing many documents within a single file (e.g.
849	# GMLPlug) might want to implement their own version of read(), but
850	# more likely need to implement their own version of read_into_doc_obj()
851	#
852	# Return number of files processed, undef if can't recognise, -1 if can't
853	# process
854
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $processor,
#             $maxdocs, $total_count, $gli)
# Builds the document with read_into_doc_obj() and, on success (status 1),
# hands it to $processor->process() and counts it in num_processed.
# Returns the status from read_into_doc_obj() unchanged.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my ($process_status,$doc_obj) = $self->read_into_doc_obj(@_);

    # anything other than 1 means there is no document to pass on
    return $process_status
        unless ((defined $process_status) && ($process_status == 1));

    # process the document
    $processor->process($doc_obj);

    # remove the places file, if one was recorded
    my $places_filename = $self->{'places_filename'};
    if (defined($places_filename)) {
        &util::rm($places_filename);
        $self->{'places_filename'} = undef;
    }

    $self->{'num_processed'} ++;
    undef $doc_obj;

    # process_status == 1, so the file has been processed.
    return $process_status;
}
878
# Plugin specific processing of one document; must be implemented by the
# sub-class.  An implementation returns undef if the file is rejected by
# the plugin.
#
# This base version reports the missing implementation and always dies
# (the die no longer depends on gsprintf returning a true value).
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n");
    die "\n";
}
889
# uses the multiread package to read in the entire file pointed to
# by filename and loads the resulting text into $$textref. Input text
# may be in any of the encodings handled by multiread, output text
# will be in utf8
#   $filename - file to read
#   $encoding - "ascii" reads the file as-is; anything else goes through
#               multiread with that encoding
#   $language - accepted but not used here
#   $textref  - scalar ref receiving the text
# When the file is not readable a message is printed (if verbosity is set)
# and $$textref is left untouched.  Dies when the file cannot be opened.
#
# The file is opened with an explicit read mode so that unusual characters
# in $filename are never interpreted as an open mode, and the input record
# separator is only changed locally, so the caller's $/ is preserved.
sub read_file {
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    if (!-r $filename)
    {
        my $outhandle = $self->{'outhandle'};
        gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
        return;
    }
    $$textref = "";
    if (!open (FILE, '<', $filename)) {
        gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
        die "\n";
    }

    if ($encoding eq "ascii") {
        local $/ = undef;   # slurp the whole file
        $$textref = <FILE>;
    } else {
        # multiread is given the handle by name
        my $reader = multiread->new();
        $reader->set_handle ('BasPlug::FILE');
        $reader->set_encoding ($encoding);
        $reader->read_file ($textref);
        #Now segments chinese if the separate_cjk option is set
        if ($self->{'separate_cjk'}) {
            # segment the Chinese words
            $$textref = &cnseg::segment($$textref);
        }
    }
    close FILE;
}
928
# write_file -- used by ConvertToPlug, for example in post processing
#
#   $textref  - scalar ref holding the text to write
#   $filename - file to (over)write
# Dies when the file cannot be opened for writing.
#
# The file is opened with an explicit write mode and a lexical handle, so
# leading/trailing whitespace or mode characters in $filename are taken
# literally and no package-wide FILE handle is disturbed.
sub utf8_write_file {
    my $self = shift (@_);
    my ($textref, $filename) = @_;

    my $out;
    if (!open ($out, '>', $filename)) {
        gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename);
        die "\n";
    }
    print $out $$textref;

    close $out;
}
943
944
# Derive a title from a file name: underscores become spaces and
# everything from the first "." onwards is dropped.
sub filename_based_title
{
    my ($plugin, $title) = @_;

    $title =~ s/_/ /g;
    $title =~ s/\..*?$//;

    return $title;
}
956
957
# Give $section of $doc_obj a "Title" derived from $file (see
# filename_based_title) when it has no Title metadata yet.
sub title_fallback
{
    my ($plugin, $doc_obj, $section, $file) = @_;

    my $existing = $doc_obj->get_metadata_element ($section, "Title");
    if (!defined $existing) {
        $doc_obj->add_metadata ($section, "Title", $plugin->filename_based_title($file));
    }
}
969
# Work out the (language, encoding) pair to use for $filename.
#   -input_encoding auto : both come from get_language_encoding()
#   -extract_language    : the language comes from
#                          get_language_encoding(), the encoding is the
#                          configured input_encoding; a warning is printed
#                          (when verbosity is set) if the detected
#                          encoding differs and the language is not "en"
#   otherwise            : default_language and input_encoding
sub textcat_get_language_encoding {
    my ($self, $filename) = @_;

    my ($language, $encoding);

    if ($self->{'input_encoding'} eq "auto") {
        # use textcat to automatically work out the input encoding and language
        ($language, $encoding) = $self->get_language_encoding ($filename);
    }
    elsif (!$self->{'extract_language'}) {
        $language = $self->{'default_language'};
        $encoding = $self->{'input_encoding'};
    }
    else {
        # use textcat to get language metadata
        my $extracted_encoding;
        ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
        # input_encoding is read only after the call above
        $encoding = $self->{'input_encoding'};

        # don't print this message for english... english in utf8 is identical
        # to english in iso-8859-1 (except for some punctuation). We don't have
        # a language model for en_utf8, so textcat always says iso-8859-1!
        my $mismatch = ($extracted_encoding ne $encoding) && ($language ne "en");
        if ($mismatch && $self->{'verbosity'}) {
            my $plugin_name = ref ($self);
            my $outhandle = $self->{'outhandle'};
            gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
        }
    }

    return ($language, $encoding);
}
1001
# Uses textcat to work out the encoding and language of the text in
# $filename. All html tags are removed before processing (for HTMLPlug
# itself, or when the document was converted to HTML).
#
# Arguments:
#   $filename - path of the file to examine
#
# Returns a two-element list: (language, encoding).
#
# Side effect: if the file contains a 'charset=windows...' declaration,
# that value (with dashes turned into underscores) is stored in
# $self->{'input_encoding'}.
#
# Dies (after reporting through gsprintf) if the file cannot be opened.
sub get_language_encoding {
    my $self = shift (@_);
    my ($filename) = @_;
    my $outhandle = $self->{'outhandle'};
    my $unicode_format = "";

    # read in the whole file. The failure path always dies, whatever
    # gsprintf happens to return.
    my $fh;
    unless (open ($fh, '<', $filename)) {
        gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
        die "\n";
    }
    my $text;
    {
        # slurp mode, restored to the caller's setting on scope exit
        local $/ = undef;
        $text = <$fh>;
    }
    close $fh;
    $text = "" unless defined $text;

    # check if the first few bytes are a Byte Order Marker
    my $bom16 = substr ($text, 0, 2);
    if ($bom16 eq "\xff\xfe" || $bom16 eq "\xfe\xff") {
        # little or big endian 16bit unicode
        $unicode_format = "unicode";
    } elsif (substr ($text, 0, 3) eq "\xef\xbb\xbf") {
        # utf-8 coded FEFF bom
        $unicode_format = "utf8";
    }

    # VB scripting generated Word to HTML file.
    # The capture stops at the first closing quote so that later quotes
    # on the same line are not swallowed into the encoding name.
    if ($text =~ /charset=(windows[^\"\n]*)\"/i) {
        my $vbhtml_encoding = $1;
        $vbhtml_encoding =~ s/-+/_/g;
        $self->{'input_encoding'} = $vbhtml_encoding;
    }

    # remove <title>stuff</title> -- as titles tend often to be in English
    # for foreign language documents
    $text =~ s/<title>.*?<\/title>//is;

    # remove all HTML tags
    # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
    if (ref($self) eq 'HTMLPlug' ||
        (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')) {
        $text =~ s/<[^>]*>//sg;
    }

    # get the language/encoding guesses; used below as a reference to a
    # list of "language-encoding" strings, best guess first
    my $results = $self->{'textcat'}->classify(\$text);

    # if textcat returns 3 or less possibilities we'll use the
    # first one in the list - otherwise use the defaults
    if (scalar @$results > 3) {
        my $best_encoding = "";
        if ($unicode_format) { # in case the file had a BOM
            $best_encoding = $unicode_format;
        } else {
            # count how often each encoding suffix was guessed
            my %guessed_encodings = ();
            foreach my $result (@$results) {
                my ($enc) = $result =~ /([^\-]+)$/;
                next unless defined $enc;
                $guessed_encodings{$enc}++;
            }

            $guessed_encodings{""} = -1; # for default best_encoding of ""
            # sorted so that ties are resolved the same way on every run
            foreach my $enc (sort keys %guessed_encodings) {
                if ($guessed_encodings{$enc} >
                    $guessed_encodings{$best_encoding}) {
                    $best_encoding = $enc;
                }
            }
        }

        if ($self->{'input_encoding'} ne 'auto') {
            if ($self->{'extract_language'} && ($self->{'verbosity'}>2)) {
                gsprintf($outhandle,
                         "BasPlug: {BasPlug.could_not_extract_language}\n",
                         $filename, $self->{'default_language'});
            }
            return ($self->{'default_language'}, $self->{'input_encoding'});

        } else {
            if ($self->{'verbosity'}>2) {
                gsprintf($outhandle,
                         "BasPlug: {BasPlug.could_not_extract_language}\n",
                         $filename, $self->{'default_language'});
            }
            return ($self->{'default_language'}, $best_encoding);
        }
    }

    # format language/encoding: "<language>" or "<language>-<encoding>".
    # With no guesses at all both stay undefined and the defaults apply.
    my ($language, $encoding);
    if (@$results) {
        ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
    }
    if (!defined $language) {
        if ($self->{'verbosity'}>2) {
            gsprintf($outhandle,
                     "BasPlug: {BasPlug.could_not_extract_language}\n",
                     $filename, $self->{'default_language'});
        }
        $language = $self->{'default_language'};
    }
    if (!defined $encoding) {
        if ($self->{'verbosity'}>2) {
            gsprintf($outhandle,
                     "BasPlug: {BasPlug.could_not_extract_encoding}\n",
                     $filename, $self->{'default_encoding'});
        }
        $encoding = $self->{'default_encoding'};
    }

    # check for equivalents where textcat doesn't have some encodings...
    # eg MS versions of standard encodings
    if ($encoding =~ /^iso_8859_(\d+)/) {
        my $iso = $1 + 0; # which variant of the iso standard?
        my %windows_codepage_for = (
            1  => 'windows_1252', # Western Europe
            15 => 'windows_1252', # Western Europe
            2  => 'windows_1250', # Central Europe
            5  => 'windows_1251', # Cyrillic
            6  => 'windows_1256', # Arabic
            7  => 'windows_1253', # Greek
            8  => 'windows_1255', # Hebrew
            9  => 'windows_1254', # Turkish
        );
        # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
        if ($text =~ /[\x80-\x9f]/ && exists $windows_codepage_for{$iso}) {
            $encoding = $windows_codepage_for{$iso};
        }
    }

    if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
        !defined $encodings::encodings->{$encoding}) {
        if ($self->{'verbosity'}) {
            gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
                     $filename, $encoding, $self->{'default_encoding'});
        }
        $encoding = $self->{'default_encoding'};
    }

    return ($language, $encoding);
}
1145
# add any extra metadata that's been passed around from one
# plugin to another.
# extra_metadata uses add_utf8_metadata so it expects metadata values
# to already be in utf8
#
# Arguments:
#   $doc_obj    - document object receiving the metadata
#   $cursection - section of the document to attach the metadata to
#   $metadata   - hash reference: field name => value or array
#                 reference of values
sub extra_metadata {
    my $self = shift (@_);
    my ($doc_obj, $cursection, $metadata) = @_;

    foreach my $field (keys(%$metadata)) {
        # $metadata->{$field} may be an array reference
        my $value = $metadata->{$field};

        if ($field eq "gsdlassocfile_tobe") {
            # 'gsdlassocfile_tobe' is artificially introduced metadata
            # that is used to signal that certain additional files should
            # be tied to this document. Useful in situations where a
            # metadata pass in the plugin pipeline works out some files
            # need to be associated with a document, but the document hasn't
            # been formed yet.

            my $equiv_form = "";
            foreach my $gaf (@$value) {
                # each entry has the form "<filename>:<mimetype>:"
                my ($full_filename, $mimetype) = ($gaf =~ m/^(.*):(.*):$/);
                next unless defined $full_filename; # malformed entry

                # strip any directory part; a bare filename is its own tail
                my ($tail_filename) = ($full_filename =~ /^.*[\/\\](.+?)$/);
                $tail_filename = $full_filename unless defined $tail_filename;

                $doc_obj->associate_file($full_filename,$tail_filename,$mimetype);

                # the file extension selects the icon macro
                my ($doc_ext) = ($tail_filename =~ m/^.*\.(.*)$/);
                $doc_ext = "" unless defined $doc_ext;

                my $start_doclink = "<a href=\"_httpcollection_/index/assoc/{Or}{[parent(Top):archivedir],[archivedir]}/$tail_filename\">";
                my $srcicon = "_icon".$doc_ext."_";
                my $end_doclink = "</a>";

                $equiv_form .= " $start_doclink\{If\}{$srcicon,$srcicon,$doc_ext\}$end_doclink";
            }
            $doc_obj->add_utf8_metadata ($cursection, "equivlink", $equiv_form);
        }
        elsif (ref ($value) eq "ARRAY") {
            foreach my $single_value (@$value) {
                $doc_obj->add_utf8_metadata ($cursection, $field, $single_value);
            }
        } else {
            $doc_obj->add_utf8_metadata ($cursection, $field, $value);
        }
    }
}
1190
# initialise metadata extractors
# The acronym module is set up only when acronyms are going to be
# extracted or marked up.
sub initialise_extractors {
    my $self = shift (@_);

    if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
        acronym::initialise_acronyms();
    }
}
1199
# finalise metadata extractors
# The acronym module is shut down only when acronym extraction or
# markup was switched on.
sub finalise_extractors {
    my $self = shift (@_);

    if ($self->{'extract_acronyms'} || $self->{'markup_acronyms'}) {
        acronym::finalise_acronyms();
    }
}
1208
# FIRSTNNN: extract the first NNN characters as metadata
#
# $self->{'first'} is a comma separated list of sizes. For each size a
# "First<size>" metadata item is added to $thissection, holding the
# start of the whitespace-normalised text. When the text had to be cut,
# the trailing partial word is replaced by an ellipsis; text that
# already fits within the size is stored unchanged.
#
# Arguments:
#   $textref     - reference to the section text
#   $doc_obj     - document object receiving the metadata
#   $thissection - section the text belongs to
sub extract_first_NNNN_characters {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;

    # the whitespace clean-up does not depend on the size, so do it once
    my $flat_text = $$textref;
    $flat_text =~ s/^\s+//;
    $flat_text =~ s/\s+$//;
    $flat_text =~ s/\s+/ /gs;

    foreach my $size (split /,/, $self->{'first'}) {
        my $tmptext = substr ($flat_text, 0, $size);
        # only mark a cut when something really was cut off
        if (length ($tmptext) < length ($flat_text)) {
            $tmptext =~ s/\s\S*$/…/;
        }
        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
    }
}
1224
# extract email addresses from a section of text. Each distinct
# address is added once, in sorted order, as "emailAddress" metadata
# on $thissection. Progress is reported to outhandle based on the
# verbosity.
#
# Arguments:
#   $textref     - reference to the section text
#   $doc_obj     - document object receiving the metadata
#   $thissection - section the text belongs to
sub extract_email {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
        if ($self->{'verbosity'} > 2);

    # the domain must end in one of the listed top level domains or in
    # a two letter code; only lower case addresses are matched
    my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|net|[a-z][a-z]))/g);
    @email = sort @email;

    # remember which addresses have been added, so duplicates are skipped
    my $hashExistMail = {};
    foreach my $address (@email) {
        next if defined $hashExistMail->{$address};

        $hashExistMail->{$address} = 1;
        $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
        gsprintf($outhandle, " {BasPlug.extracting} $address\n")
            if ($self->{'verbosity'} > 3);
    }

    gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
        if ($self->{'verbosity'} > 2);
}
1266
# extract metadata
#
# Runs every automatic extractor that is switched on in $self over the
# sections of $doc_obj, in this order: email addresses, placenames,
# Kea keyphrases, first NNN characters, acronym extraction, acronym
# markup and historical years.
#
# Arguments:
#   $doc_obj - the document to extract metadata from (and add it to)
sub auto_extract_metadata {

    my $self = shift (@_);
    my ($doc_obj) = @_;

    # Walk the sections of the document in order, from the top section
    # onwards, calling $visit with each section and its text.
    my $for_each_section = sub {
        my ($visit) = @_;
        my $section = $doc_obj->get_top_section();
        while (defined $section) {
            $visit->($section, $doc_obj->get_text($section));
            $section = $doc_obj->get_next_section ($section);
        }
    };

    if ($self->{'extract_email'}) {
        $for_each_section->(sub {
            my ($thissection, $text) = @_;
            $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
        });
    }
    if ($self->{'extract_placenames'}) {
        $for_each_section->(sub {
            my ($thissection, $text) = @_;
            $self->extract_placenames (\$text, $doc_obj, $thissection) if $text =~ /./;
        });
    }

    #adding kea keyphrases
    if ($self->{'extract_keyphrases'}) {

        #loop through sections to gather whole doc
        my $text = "";
        $for_each_section->(sub {
            my ($thissection, $sectiontext) = @_;
            $text = $text.$sectiontext;
        });

        my $list;
        if ($self->{'extract_keyphrase_options'}) { #if kea options flag is set, call Kea with specified options
            $list = Kea::extract_KeyPhrases ($text, $self->{'extract_keyphrase_options'});
        } else { #otherwise call Kea with no options
            $list = Kea::extract_KeyPhrases ($text);
        }

        if ($list) {
            # if a list of kea keyphrases was returned (ie not empty)
            if ($self->{'verbosity'}) {
                gsprintf(STDERR, "{BasPlug.keyphrases}: $list\n");
            }

            #add metadata to top section
            my $thissection = $doc_obj->get_top_section();

            # add all key phrases as one metadata
            $doc_obj->add_metadata($thissection, "Keyphrases", $list);

            # add individual key phrases as multiple metadata
            foreach my $keyphrase (split(',', $list)) {
                # trim leading and trailing whitespace
                $keyphrase =~ s/^\s+|\s+$//g;
                $doc_obj->add_metadata($thissection, "Keyphrase", $keyphrase);
            }
        }
    } #end of kea

    if ($self->{'first'}) {
        $for_each_section->(sub {
            my ($thissection, $text) = @_;
            $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
        });
    }

    if ($self->{'extract_acronyms'}) {
        $for_each_section->(sub {
            my ($thissection, $text) = @_;
            $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
        });
    }

    if ($self->{'markup_acronyms'}) {
        # the marked up text replaces the section's original text
        $for_each_section->(sub {
            my ($thissection, $text) = @_;
            $text = $self->markup_acronyms ($text, $doc_obj, $thissection);
            $doc_obj->delete_text($thissection);
            $doc_obj->add_text($thissection, $text);
        });
    }

    if ($self->{'extract_historical_years'}) {
        $for_each_section->(sub {
            my ($thissection, $text) = @_;
            DateExtract::get_date_metadata($text, $doc_obj,
                                           $thissection,
                                           $self->{'no_bibliography'},
                                           $self->{'maximum_year'},
                                           $self->{'maximum_century'});
        });
    }
}
1373
# extract acronyms from a section in a document. progress is
# reported to outhandle based on the verbosity. An acronym whose
# string form is already stored as "Acronym" metadata on the section
# is skipped; every other one is written to file and added as
# "Acronym" metadata.
#
# Arguments:
#   $textref     - reference to the section text
#   $doc_obj     - document object receiving the metadata
#   $thissection - section the text belongs to
sub extract_acronyms {
    my $self = shift (@_);
    my ($textref, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n");
    }

    my $found_acronyms = &acronym::acronyms($textref);

    CANDIDATE:
    foreach my $candidate (@$found_acronyms) {

        # has this acronym been recorded on the section already?
        my $already_recorded = 0;
        my $recorded_list = $doc_obj->get_metadata($thissection, "Acronym");
        foreach my $recorded (@$recorded_list) {
            next unless $recorded eq $candidate->to_string();

            $already_recorded = 1;
            if ($self->{'verbosity'} >= 4) {
                gsprintf($outhandle, " {BasPlug.already_seen} " .
                         $candidate->to_string() . "\n");
            }
        }
        next CANDIDATE if $already_recorded;

        # first sighting: write it to the file ...
        $candidate->write_to_file();

        # ... and store the normal acronym metadata
        $doc_obj->add_utf8_metadata($thissection, "Acronym", $candidate->to_string());
        if ($self->{'verbosity'} > 3) {
            gsprintf($outhandle, " {BasPlug.adding} ".$candidate->to_string()."\n");
        }
    }

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n");
    }
}
1418
# mark up the acronyms in a piece of text and return the result.
# progress is reported to outhandle based on the verbosity.
#
# Arguments:
#   $text        - the text to mark up
#   $doc_obj     - document object (not used here)
#   $thissection - section the text belongs to (not used here)
sub markup_acronyms {
    my ($self, $text, $doc_obj, $thissection) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n");
    }

    # self is passed in to check for verbosity ...
    my $marked_up_text = &acronym::markup_acronyms($text, $self);

    if ($self->{'verbosity'} > 2) {
        gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n");
    }

    return $marked_up_text;
}
1435
# add this plugin's counters (processed, not processed, archives)
# to the running totals held in the $stats hash reference.
sub compile_stats {
    my ($self, $stats) = @_;

    foreach my $counter ('num_processed', 'num_not_processed', 'num_archives') {
        $stats->{$counter} += $self->{$counter};
    }

}
1445
# look for a cover image next to a document and attach it.
#
# The candidate is $filename with its extension replaced by ".jpg",
# then by ".JPG" (appended when $filename has no extension, so that a
# document is never attached as its own cover). The first candidate
# that exists is associated with the document as "cover.jpg" and the
# top section gets "hascover" metadata.
#
# Arguments:
#   $doc_obj  - document object to attach the cover image to
#   $filename - path of the document's source file
sub associate_cover_image {
    my $self = shift(@_);
    my ($doc_obj, $filename) = @_;

    my $top_section=$doc_obj->get_top_section();

    # drop the extension, if there is one
    my $cover_base = $filename;
    $cover_base =~ s/\.[^\\\/\.]+$//;

    foreach my $extension ("jpg", "JPG") {
        my $cover_filename = "$cover_base.$extension";
        next unless -e $cover_filename;

        $doc_obj->associate_file($cover_filename, "cover.jpg", "image/jpeg");
        $doc_obj->add_utf8_metadata($top_section, "hascover", 1);
        last;
    }
}
1464
1465	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: