Context Navigation

source: trunk/gsdl/perllib/plugins/BasPlug.pm@ 6812

Last change on this file since 6812 was 6584, checked in by kjdon, 20 years ago
Fiddled around with segmenting for chinese text. Haven't changed how the segmentation is done, or what character ranges are used. But when its done is now controlled by the collect.cfg. There is a new option, separate_cjk, values true or false, default false. Segmentation is only done if this is set to true. This is passed as a global option to all plugins by the import.pl script, so the user just needs to add it once to the config file, not as an option to all plugins. The queryaction uses this option too to determine whether or not to segment the query.
Property svn:keywords set to `Author Date Id Revision`
File size: 35.4 KB

Line
1	###########################################################################
2	#
3	# BasPlug.pm -- base class for all the import plugins
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package BasPlug;
27
28	eval {require bytes};
29
30	# suppress the annoying "subroutine redefined" warning that various
31	# plugins cause under perl 5.6
32	$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34	use Kea;
35	use parsargv;
36	use multiread;
37	use encodings;
38	use cnseg;
39	use acronym;
40	use textcat;
41	use doc;
42	use diagnostics;
43	use DateExtract;
44	use ghtml;
45	use gsprintf;
46	use printusage;
47
48
49	my $unicode_list =
50	[ { 'name' => "auto",
51	'desc' => "{BasPlug.input_encoding.auto}" },
52	{ 'name' => "ascii",
53	'desc' => "{BasPlug.input_encoding.ascii}" },
54	{ 'name' => "utf8",
55	'desc' => "{BasPlug.input_encoding.utf8}" },
56	{ 'name' => "unicode",
57	'desc' => "{BasPlug.input_encoding.unicode}" } ];
58
59	my $arguments =
60	[ { 'name' => "process_exp",
61	'desc' => "{BasPlug.process_exp}",
62	'type' => "regexp",
63	'deft' => "",
64	'reqd' => "no" },
65	{ 'name' => "block_exp",
66	'desc' => "{BasPlug.block_exp}",
67	'type' => "regexp",
68	'deft' => "",
69	'reqd' => "no" },
70	{ 'name' => "input_encoding",
71	'desc' => "{BasPlug.input_encoding}",
72	'type' => "enum",
73	'list' => $unicode_list,
74	'reqd' => "no" ,
75	'deft' => "auto" } ,
76	{ 'name' => "default_encoding",
77	'desc' => "{BasPlug.default_encoding}",
78	'type' => "enum",
79	'list' => $unicode_list,
80	'reqd' => "no",
81	'deft' => "utf8" },
82	{ 'name' => "extract_language",
83	'desc' => "{BasPlug.extract_language}",
84	'type' => "flag",
85	'reqd' => "no" },
86	{ 'name' => "default_language",
87	'desc' => "{BasPlug.default_language}",
88	'type' => "language",
89	'deft' => "en",
90	'reqd' => "no" },
91	{ 'name' => "extract_acronyms",
92	'desc' => "{BasPlug.extract_acronyms}",
93	'type' => "flag",
94	'reqd' => "no" },
95	{ 'name' => "markup_acronyms",
96	'desc' => "{BasPlug.markup_acronyms}",
97	'type' => "flag",
98	'reqd' => "no" },
99	{ 'name' => "first",
100	'desc' => "{BasPlug.first}",
101	'type' => "string",
102	'reqd' => "no" },
103	{ 'name' => "extract_email",
104	'desc' => "{BasPlug.extract_email}",
105	'type' => "flag",
106	'reqd' => "no" },
107	{ 'name' => "extract_historical_years",
108	'desc' => "{BasPlug.extract_historical_years}",
109	'type' => "flag",
110	'reqd' => "no" },
111	{ 'name' => "maximum_year",
112	'desc' => "{BasPlug.maximum_year}",
113	'type' => "int",
114	'deft' => (localtime)[5]+1900,
115	'reqd' => "no"},
116	{ 'name' => "maximum_century",
117	'desc' => "{BasPlug.maximum_century}",
118	'type' => "int",
119	'deft' => "-1",
120	'reqd' => "no" },
121	{ 'name' => "no_bibliography",
122	'desc' => "{BasPlug.no_bibliography}",
123	'type' => "flag",
124	'reqd' => "no"},
125	{ 'name' => "cover_image",
126	'desc' => "{BasPlug.cover_image}",
127	'type' => "flag",
128	'reqd' => "no" } ];
129
130	my $options = { 'name' => "BasPlug",
131	'desc' => "{BasPlug.desc}",
132	'abstract' => "yes",
133	'inherits' => "no",
134	'args' => $arguments };
135
136
137	sub gsprintf
138	{
139	return &gsprintf::gsprintf(@_);
140	}
141
142
143	sub get_arguments
144	{
145	local $self = shift(@_);
146	local $optionlistref = $self->{'option_list'};
147	local @optionlist = @$optionlistref;
148	local $pluginoptions = pop(@$optionlistref);
149	local $pluginarguments = $pluginoptions->{'args'};
150	return $pluginarguments;
151	}
152
153
154	sub print_xml_usage
155	{
156	local $self = shift(@_);
157	local $language = shift(@_);
158
159	&PrintUsage::print_xml_header();
160	$self->print_xml($language);
161	}
162
163
164	sub print_xml
165	{
166	local $self = shift(@_);
167	local $language = shift(@_);
168
169	local $optionlistref = $self->{'option_list'};
170	local @optionlist = @$optionlistref;
171	local $pluginoptions = pop(@$optionlistref);
172	return if (!defined($pluginoptions));
173
174	print STDERR "<PlugInfo>\n";
175	print STDERR " <Name>$pluginoptions->{'name'}</Name>\n";
176	print STDERR " <Desc>$pluginoptions->{'desc'}</Desc>\n";
177	print STDERR " <Abstract>$pluginoptions->{'abstract'}</Abstract>\n";
178	print STDERR " <Inherits>$pluginoptions->{'inherits'}</Inherits>\n";
179	print STDERR " <Arguments>\n";
180	if (defined($pluginoptions->{'args'})) {
181	&PrintUsage::print_options_xml($language, $pluginoptions->{'args'});
182	}
183
184	# Recurse up the plugin hierarchy
185	$self->print_xml($language);
186
187	print STDERR " </Arguments>\n";
188	print STDERR "</PlugInfo>\n";
189	}
190
191
192	sub print_txt_usage
193	{
194	local $self = shift(@_);
195	local $language = shift(@_);
196
197	# Print the usage message for a plugin (recursively)
198	local $descoffset = $self->determine_description_offset(0);
199	$self->print_plugin_usage($language, $descoffset, 1);
200	}
201
202
203	sub determine_description_offset
204	{
205	local $self = shift(@_);
206	local $maxoffset = shift(@_);
207
208	local $optionlistref = $self->{'option_list'};
209	local @optionlist = @$optionlistref;
210	local $pluginoptions = pop(@$optionlistref);
211	return $maxoffset if (!defined($pluginoptions));
212
213	# Find the length of the longest option string of this plugin
214	local $pluginargs = $pluginoptions->{'args'};
215	if (defined($pluginargs)) {
216	local $longest = &PrintUsage::find_longest_option_string($pluginargs);
217	if ($longest > $maxoffset) {
218	$maxoffset = $longest;
219	}
220	}
221
222	# Recurse up the plugin hierarchy
223	$maxoffset = $self->determine_description_offset($maxoffset);
224	$self->{'option_list'} = \@optionlist;
225	return $maxoffset;
226	}
227
228
229	sub print_plugin_usage
230	{
231	local $self = shift(@_);
232	local $language = shift(@_);
233	local $descoffset = shift(@_);
234	local $isleafclass = shift(@_);
235
236	local $optionlistref = $self->{'option_list'};
237	local @optionlist = @$optionlistref;
238	local $pluginoptions = pop(@$optionlistref);
239	return if (!defined($pluginoptions));
240
241	local $pluginname = $pluginoptions->{'name'};
242	local $pluginargs = $pluginoptions->{'args'};
243
244	# Produce the usage information using the data structure above
245	if ($isleafclass) {
246	&gsprintf(STDERR, " {common.usage}: plugin $pluginname [{common.options}]\n\n");
247	# print STDERR " usage: classify $classifiername [options]\n\n";
248	}
249
250	# Display the plugin options, if there are some
251	if (defined($pluginargs)) {
252	# Calculate the column offset of the option descriptions
253	local $optiondescoffset = $descoffset + 2; # 2 spaces between options & descriptions
254
255	if ($isleafclass) {
256	&gsprintf(STDERR, " {common.specific_options}:\n");
257	# print STDERR " specific options:\n";
258	}
259	else {
260	&gsprintf(STDERR, " {common.general_options}:\n", $pluginname);
261	# print STDERR " general options (from $classifiername):\n";
262	}
263
264	# Display the plugin options
265	&PrintUsage::print_options_txt($language, $pluginargs, $optiondescoffset);
266	}
267
268	# Recurse up the plugin hierarchy
269	$self->print_plugin_usage($language, $descoffset, 0);
270	$self->{'option_list'} = \@optionlist;
271	}
272
273
274	# sub print_general_usage {
275	# my ($plugin_name) = @_;
276
277	# print STDERR "\n usage: plugin $plugin_name [options]\n\n";
278
279	# print STDERR " -process_exp A perl regular expression to match against filenames.\n";
280	# print STDERR " Matching filenames will be processed by this plugin.\n";
281	# print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n";
282	# print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
283	# print STDERR " .htm or .html (case-insensitive).\n\n";
284
285	# print STDERR " -block_exp Files matching this regular expression will be blocked from\n";
286	# print STDERR " being passed to any later plugins in the list. This has no\n";
287	# print STDERR " real effect other than to prevent lots of warning messages\n";
288	# print STDERR " about input files you don't care about. Each plugin might\n";
289	# print STDERR " have a default block_exp. e.g. by default HTMLPlug blocks\n";
290	# print STDERR " any files with .gif, .jpg, .jpeg, .png or .css\n";
291	# print STDERR " file extensions.\n\n";
292
293
294	# print STDERR " -input_encoding The encoding of the source documents. Documents will be\n";
295	# print STDERR " converted from these encodings and stored internally as\n";
296	# print STDERR " utf8. The default input_encoding is 'auto'. Accepted values\n";
297	# print STDERR " are:\n";
298
299	# print STDERR " auto: Use text categorization algorithm to automatically\n";
300	# print STDERR " identify the encoding of each source document. This\n";
301	# print STDERR " will be slower than explicitly setting the encoding\n";
302	# print STDERR " but will work where more than one encoding is used\n";
303	# print STDERR " within the same collection.\n";
304
305	# print STDERR " ascii: Plain 7 bit ascii. This may be a bit faster than\n";
306	# print STDERR " using iso_8859_1. Beware of using this on a collection\n";
307	# print STDERR " of documents that may contain characters outside the\n";
308	# print STDERR " plain 7 bit ascii set though (e.g. German or French\n";
309	# print STDERR " documents containing accents), use iso_8859_1 instead.\n";
310
311	# print STDERR " utf8: either utf8 or unicode -- automatically detected\n";
312	# print STDERR " unicode: just unicode\n";
313
314	# my $e = $encodings::encodings;
315	# foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
316	# print STDERR " $enc: $e->{$enc}->{'name'}\n";
317	# }
318	# print STDERR "\n";
319	# print STDERR " -default_encoding Use this encoding if -input_encoding is set to 'auto' and\n";
320	# print STDERR " the text categorization algorithm fails to extract the\n";
321	# print STDERR " encoding or extracts an encoding unsupported by Greenstone.\n";
322	# print STDERR " The default is iso_8859_1.\n\n";
323
324	# print STDERR " -extract_language Identify the language of each document and set 'Language'\n";
325	# print STDERR " metadata. Note that this will be done automatically if\n";
326	# print STDERR " -input_encoding is 'auto'.\n\n";
327	# print STDERR " -default_language If Greenstone fails to work out what language a document is\n";
328	# print STDERR " the 'Language' metadata element will be set to this value.\n";
329	# print STDERR " The default is 'en' (ISO 639 language symbols are used:\n";
330	# print STDERR " en = English). Note that if -input_encoding is not set to\n";
331	# print STDERR " 'auto' and -extract_language is not set, all documents will\n";
332	# print STDERR " have their 'Language' metadata set to this value.\n\n";
333
334	# print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n";
335
336	# print STDERR " -markup_acronyms Add acronym metadata into document text\n\n";
337
338	# print STDERR " -first Comma separated list of first sizes to extract from the\n";
339	# print STDERR " text into a metadata field. The field is called 'FirstNNN'.\n\n";
340
341	# print STDERR " -extract_email Extract email addresses as metadata\n\n";
342
343	# print STDERR " -extract_historical_years Extract time-period information from historical\n";
344	# print STDERR " documents. This is stored as metadata with the document.\n";
345	# print STDERR " There is a search interface for this metadata, which you \n";
346	# print STDERR " can include in your collection by adding the statement:\n";
347	# print STDERR " format QueryInterface DateSearch\n";
348	# print STDERR " to your collection configuration file\n";
349	# print STDERR " -maximum_year The maximum historical date to be used as metadata (in a\n";
350	# print STDERR " Common Era date, such as 1950)\n";
351	# print STDERR " -maximum_century The maximum named century to be extracted as historical\n";
352	# print STDERR " metadata (e.g. 14 will extract all references up to the\n";
353	# print STDERR " 14th century)\n";
354	# print STDERR " -no_bibliography Do not try and block bibliographic dates when extracting\n";
355	# print STDERR " historical dates.\n";
356	# print STDERR " -cover_image Will look for a prefix.jpg file (where prefix is the same\n";
357	# print STDERR " prefix as the file being processed) and associate it as a\n";
358	# print STDERR " cover image\n\n";
359	# }
360
361	# sub print_usage {
362	# print STDERR "\nThis plugin has no plugin specific options\n\n";
363	# }
364
365	sub new {
366	my $class = shift (@_);
367	my $plugin_name = shift (@_);
368	my $self = {};
369	$self->{'plugin_type'} = "BasPlug";
370	my $enc = "^(";
371	map {$enc .= "$_\|";} keys %$encodings::encodings;
372	my $denc = $enc . "ascii\|utf8\|unicode)\$";
373	$enc .= "ascii\|utf8\|unicode\|auto)\$";
374
375	$self->{'outhandle'} = STDERR;
376	my $year = (localtime)[5]+1900;
377
378	$self->{'textcat'} = new textcat();
379
380	$self->{'num_processed'} = 0;
381	$self->{'num_not_processed'} = 0;
382	$self->{'num_blocked'} = 0;
383	$self->{'num_archives'} = 0;
384
385	# 14-05-02 To allow for proper inheritance of arguments - John Thompson
386	$self->{'option_list'} = [ $options ];
387
388	# general options available to all plugins
389	if (!parsargv::parse(\@_,
390	q^process_exp/.*/^, \$self->{'process_exp'},
391	q^block_exp/.*/^, \$self->{'block_exp'},
392	q^extract_language^, \$self->{'extract_language'},
393	q^extract_acronyms^, \$self->{'extract_acronyms'},
394	q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
395	q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
396	qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
397	qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
398	q^extract_email^, \$self->{'extract_email'},
399	q^markup_acronyms^, \$self->{'markup_acronyms'},
400	q^default_language/.{2}/en^, \$self->{'default_language'},
401	q^first/.*/^, \$self->{'first'},
402	q^extract_historical_years^, \$self->{'date_extract'},
403	qq^maximum_year/\\d{4}/$year^, \$self->{'max_year'},
404	q^no_bibliography^, \$self->{'no_biblio'},
405	qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'},
406	q^cover_image^, \$self->{'cover_image'},
407	q^separate_cjk^, \$self->{'separate_cjk'},
408	"allow_extra_options")) {
409
410	&gsprintf(STDERR, "\n{BasPlug.bad_general_option}\n", $plugin_name);
411	$self->print_txt_usage(""); # Use default resource bundle
412	die "\n";
413	}
414
415	return bless $self, $class;
416	}
417
418	# initialize BasPlug options
419	# if init() is overridden in a sub-class, remember to call BasPlug::init()
420	sub init {
421	my $self = shift (@_);
422	my ($verbosity, $outhandle, $failhandle) = @_;
423
424	# verbosity is passed through from the processor
425	$self->{'verbosity'} = $verbosity;
426
427	# as are the outhandle and failhandle
428	$self->{'outhandle'} = $outhandle if defined $outhandle;
429	$self->{'failhandle'} = $failhandle;
430
431	# set process_exp and block_exp to defaults unless they were
432	# explicitly set
433
434	if ((!$self->is_recursive()) and
435	(!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
436
437	$self->{'process_exp'} = $self->get_default_process_exp ();
438	if ($self->{'process_exp'} eq "") {
439	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
440	}
441	}
442
443	if ((!defined $self->{'block_exp'}) \|\| ($self->{'block_exp'} eq "")) {
444	$self->{'block_exp'} = $self->get_default_block_exp ();
445	}
446	}
447
448	sub begin {
449	my $self = shift (@_);
450	my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
451	$self->initialise_extractors();
452	}
453
454	sub end {
455	my ($self) = @_;
456	$self->finalise_extractors();
457	}
458
459	# this function should be overridden to return 1
460	# in recursive plugins
461	sub is_recursive {
462	my $self = shift (@_);
463
464	return 0;
465	}
466
467	sub get_default_block_exp {
468	my $self = shift (@_);
469
470	return "";
471	}
472
473	sub get_default_process_exp {
474	my $self = shift (@_);
475
476	return "";
477	}
478
479	# The BasPlug read() function. This function does all the right things
480	# to make general options work for a given plugin. It calls the process()
481	# function which does all the work specific to a plugin (like the old
482	# read functions used to do). Most plugins should define their own
483	# process() function and let this read() function keep control.
484	#
485	# recursive plugins (e.g. RecPlug) and specialized plugins like those
486	# capable of processing many documents within a single file (e.g.
487	# GMLPlug) should normally implement their own version of read()
488	#
489	# Return number of files processed, undef if can't process
490	# Note that $base_dir might be "" and that $file might
491	# include directories
492
493	sub read {
494	my $self = shift (@_);
495
496	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;
497
498	if ($self->is_recursive()) {
499	&gsprintf(STDERR, "{BasPlug.read_must_be_implemented}") && die "\n";
500	}
501
502	my $outhandle = $self->{'outhandle'};
503
504	my $filename = $file;
505	$filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
506
507	if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
508	$self->{'num_blocked'} ++;
509	return 0;
510	}
511	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
512	return undef;
513	}
514	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
515
516	# Do encoding stuff
517	my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
518
519	# create a new document
520	my $doc_obj = new doc ($filename, "indexed_doc");
521	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
522	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
523	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
524	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "$self->{'plugin_type'}", "1");
525	my ($filemeta) = $file =~ /([^\\\/]+)$/;
526	# how do we know what encoding the filename is in?
527	$doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
528	if ($self->{'cover_image'}) {
529	$self->associate_cover_image($doc_obj, $filename);
530	}
531
532	# read in file ($text will be in utf8)
533	my $text = "";
534	$self->read_file ($filename, $encoding, $language, \$text);
535
536	if (!length ($text)) {
537	my $plugin_name = ref ($self);
538	&gsprintf($outhandle, "$plugin_name: {BasPlug.file_has_no_text}\n", $filename) if $self->{'verbosity'};
539
540	my $failhandle = $self->{'failhandle'};
541	&gsprintf($failhandle, "$file: " . ref($self) . ": {BasPlug.empty_file}\n");
542	# print $failhandle "$file: " . ref($self) . ": file contains no text\n";
543	$self->{'num_not_processed'} ++;
544
545	return 0;
546	}
547
548	# include any metadata passed in from previous plugins
549	# note that this metadata is associated with the top level section
550	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
551
552	# do plugin specific processing of doc_obj
553	return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli));
554
555	# do any automatic metadata extraction
556	$self->auto_extract_metadata ($doc_obj);
557
558	# add an OID
559	# see if there is a plugin-specific set_OID function...
560	if (defined ($self->can(set_OID))) {
561	# it will need $doc_obj to set the Identifier metadata...
562	$self->set_OID($doc_obj);
563	} else {
564	# use the default set_OID() in doc.pm
565	$doc_obj->set_OID();
566	}
567
568	# process the document
569	$processor->process($doc_obj);
570
571	$self->{'num_processed'} ++;
572
573	return 1; # processed the file
574	}
575
576	# returns undef if file is rejected by the plugin
577	sub process {
578	my $self = shift (@_);
579	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
580
581	&gsprintf(STDERR, "BasPlug::process {common.must_be_implemented}\n") && die "\n";
582	# die "Basplug::process function must be implemented in sub-class\n";
583
584	return undef; # never gets here
585	}
586
587	# uses the multiread package to read in the entire file pointed to
588	# by filename and loads the resulting text into $$textref. Input text
589	# may be in any of the encodings handled by multiread, output text
590	# will be in utf8
591	sub read_file {
592	my $self = shift (@_);
593	my ($filename, $encoding, $language, $textref) = @_;
594
595	if (!-r $filename)
596	{
597	my $outhandle = $self->{'outhandle'};
598	&gsprintf($outhandle, "{BasPlug.read_denied}\n", $filename) if $self->{'verbosity'};
599	# print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
600	return;
601	}
602
603	$$textref = "";
604
605	open (FILE, $filename) \|\| (&gsprintf(STDERR, "BasPlug::read_file {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n");
606	# open (FILE, $filename) \|\| die "BasPlug::read_file could not open $filename for reading ($!)\n";
607
608	if ($encoding eq "ascii") {
609	undef $/;
610	$$textref = <FILE>;
611	$/ = "\n";
612	} else {
613	my $reader = new multiread();
614	$reader->set_handle ('BasPlug::FILE');
615	$reader->set_encoding ($encoding);
616	$reader->read_file ($textref);
617
618	#Now segments chinese if the separate_cjk option is set
619	if ($self->{'separate_cjk'}) {
620	# segment the Chinese words
621	$$textref = &cnseg::segment($$textref);
622	}
623	}
624
625	close FILE;
626	}
627
628	sub textcat_get_language_encoding {
629	my $self = shift (@_);
630	my ($filename) = @_;
631
632	my ($language, $encoding, $extracted_encoding);
633	if ($self->{'input_encoding'} eq "auto") {
634	# use textcat to automatically work out the input encoding and language
635	($language, $encoding) = $self->get_language_encoding ($filename);
636	} elsif ($self->{'extract_language'}) {
637	# use textcat to get language metadata
638	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
639	$encoding = $self->{'input_encoding'};
640	if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
641	my $plugin_name = ref ($self);
642	my $outhandle = $self->{'outhandle'};
643	&gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
644	# print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
645	# print $outhandle "appears to be encoded as $extracted_encoding.\n";
646	}
647	} else {
648	$language = $self->{'default_language'};
649	$encoding = $self->{'input_encoding'};
650	}
651	return ($language, $encoding);
652	}
653
654	# Uses textcat to work out the encoding and language of the text in
655	# $filename. All html tags are removed before processing.
656	# returns an array containing "language" and "encoding"
657	sub get_language_encoding {
658	my $self = shift (@_);
659	my ($filename) = @_;
660	my $outhandle = $self->{'outhandle'};
661
662	# read in file
663	open (FILE, $filename) \|\| (&gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
664	undef $/;
665	my $text = <FILE>;
666	$/ = "\n";
667	close FILE;
668
669	# remove <title>stuff</title> -- as titles tend often to be in English
670	# for foreign language documents
671	$text =~ s/<title>.*?<\/title>//i;
672
673	# remove all HTML tags
674	$text =~ s/<[^>]*>//sg;
675
676	# get the language/encoding
677	my $results = $self->{'textcat'}->classify(\$text);
678
679	# if textcat returns 3 or less possibilities we'll use the
680	# first one in the list - otherwise use the defaults
681	if (scalar @$results > 3) {
682	# changed 12 Feb 2003 by jrm21
683	# use the most popular encoding at least... otherwise we might
684	# generate invalid archive files!
685	my %guessed_encodings = ();
686	foreach my $result (@$results) {
687	$result =~ /([^\-]+)$/;
688	my $enc=$1;
689	if (!defined($guessed_encodings{$enc})) {
690	$guessed_encodings{$enc}=0;
691	}
692	$guessed_encodings{$enc}++;
693	}
694	my $best_encoding="";
695	$guessed_encodings{""}=-1;
696	foreach my $enc (keys %guessed_encodings) {
697	if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}){
698	$best_encoding=$enc;
699	}
700	}
701
702	if ($self->{'input_encoding'} ne 'auto') {
703	if ($self->{'extract_language'} && $self->{'verbosity'}) {
704	&gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
705	# print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
706	# print $outhandle "defaulting to $self->{'default_language'}\n";
707	}
708	return ($self->{'default_language'}, $self->{'input_encoding'});
709
710	} else {
711	if ($self->{'verbosity'}) {
712	&gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
713	# print $outhandle "BASPlug: WARNING: language could not be extracted from $filename - ";
714	# print $outhandle "defaulting to $self->{'default_language'}.\n";
715	}
716	return ($self->{'default_language'}, $best_encoding);
717	}
718	}
719
720	# format language/encoding
721	my ($language, $encoding) = $results->[0] =~ /^([^-])(?:-(.))?$/;
722	if (!defined $language) {
723	if ($self->{'verbosity'}) {
724	&gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_language}\n", $filename, $self->{'default_language'});
725	# print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - ";
726	# print $outhandle "defaulting to $self->{'default_language'}\n";
727	}
728	$language = $self->{'default_language'};
729	}
730	if (!defined $encoding) {
731	if ($self->{'verbosity'}) {
732	&gsprintf($outhandle, "BasPlug: {BasPlug.could_not_extract_encoding}\n", $filename, $self->{'default_encoding'});
733	# print $outhandle "BasPlug: WARNING: encoding could not be extracted from $filename - ";
734	# print $outhandle "defaulting to $self->{'default_encoding'}\n";
735	}
736	$encoding = $self->{'default_encoding'};
737	}
738
739	if ($encoding !~ /^(ascii\|utf8\|unicode)$/ &&
740	!defined $encodings::encodings->{$encoding}) {
741	if ($self->{'verbosity'}) {
742	&gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n", $filename, $encoding, $self->{'default_encoding'});
743	# print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - ";
744	# print $outhandle "using $self->{'default_encoding'}\n";
745	}
746	$encoding = $self->{'default_encoding'};
747	}
748
749	return ($language, $encoding);
750	}
751
752	# add any extra metadata that's been passed around from one
753	# plugin to another.
754	# extra_metadata uses add_utf8_metadata so it expects metadata values
755	# to already be in utf8
756	sub extra_metadata {
757	my $self = shift (@_);
758	my ($doc_obj, $cursection, $metadata) = @_;
759
760	foreach my $field (keys(%$metadata)) {
761	# $metadata->{$field} may be an array reference
762	if (ref ($metadata->{$field}) eq "ARRAY") {
763	map {
764	$doc_obj->add_utf8_metadata ($cursection, $field, $_);
765	} @{$metadata->{$field}};
766	} else {
767	$doc_obj->add_utf8_metadata ($cursection, $field, $metadata->{$field});
768	}
769	}
770	}
771
772	# initialise metadata extractors
773	sub initialise_extractors {
774	my $self = shift (@_);
775
776	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
777	&acronym::initialise_acronyms();
778	}
779	}
780
781	# finalise metadata extractors
782	sub finalise_extractors {
783	my $self = shift (@_);
784
785	if ($self->{'extract_acronyms'} \|\| $self->{'markup_acronyms'}) {
786	&acronym::finalise_acronyms();
787	}
788	}
789
790	# FIRSTNNN: extract the first NNN characters as metadata
791	sub extract_first_NNNN_characters {
792	my $self = shift (@_);
793	my ($textref, $doc_obj, $thissection) = @_;
794
795	foreach my $size (split /,/, $self->{'first'}) {
796	my $tmptext = $$textref;
797	$tmptext =~ s/^\s+//;
798	$tmptext =~ s/\s+$//;
799	$tmptext =~ s/\s+/ /gs;
800	$tmptext = substr ($tmptext, 0, $size);
801	$tmptext =~ s/\s\S*$/…/;
802	$doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
803	}
804	}
805
806	sub extract_email {
807	my $self = shift (@_);
808	my ($textref, $doc_obj, $thissection) = @_;
809	my $outhandle = $self->{'outhandle'};
810
811	# print $outhandle " extracting email addresses ...\n"
812	&gsprintf($outhandle, " {BasPlug.extracting_emails}...\n")
813	if ($self->{'verbosity'} > 2);
814
815	my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com\|org\|edu\|mil\|int\|net\|[a-z][a-z]))/g);
816	@email = sort @email;
817
818	my @email2 = ();
819	foreach my $address (@email) {
820	if (!(join(" ",@email2) =~ m/$address/ )) {
821	push @email2, $address;
822	$doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
823	# print $outhandle " extracting $address\n"
824	&gsprintf($outhandle, " {BasPlug.extracting} $address\n")
825	if ($self->{'verbosity'} > 3);
826	}
827	}
828	# print $outhandle " done extracting email addresses.\n"
829	&gsprintf($outhandle, " {BasPlug.done_email_extract}\n")
830	if ($self->{'verbosity'} > 2);
831	}
832
833	# extract metadata
834	sub auto_extract_metadata {
835
836	my $self = shift (@_);
837	my ($doc_obj) = @_;
838
839	if ($self->{'extract_email'}) {
840	my $thissection = $doc_obj->get_top_section();
841	while (defined $thissection) {
842	my $text = $doc_obj->get_text($thissection);
843	$self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
844	$thissection = $doc_obj->get_next_section ($thissection);
845	}
846	}
847
848
849	#adding kea keyphrases
850	if ($self->{'kea'}) {
851
852	my $thissection = $doc_obj->get_top_section();
853	my $text = "";
854	my @list;
855
856	while (defined $thissection) { #loop through sections to gather whole doc
857	my $sectiontext = $doc_obj->get_text($thissection);
858	$text = $text.$sectiontext;
859	$thissection = $doc_obj->get_next_section ($thissection);
860	}
861
862	if($self->{'kea_options'}) { #if kea options flag is set, call Kea with specified options
863	@list = &Kea::extract_KeyPhrases ($text, $self->{'kea_options'});
864	} else { #otherwise call Kea with no options
865	@list = &Kea::extract_KeyPhrases ($text);
866	}
867
868	if(@list){ #if a list of kea keyphrases was returned (ie not empty)
869	my $keyphrases = $list[0]; #first arg is keyphrase list
870	my $stems = $list[1]; #second arg is stemmed keyphrase list
871	&gsprintf(STDERR, "{BasPlug.keyphrases}: $keyphrases\n");
872	# print STDERR "keyphrases: $keyphrases\n";
873	&gsprintf(STDERR, "{BasPlug.stems}: $stems\n");
874	# print STDERR "stems: $stems\n";
875	$thissection = $doc_obj->get_top_section(); #add metadata to top section
876	$doc_obj->add_metadata($thissection, "kea", $keyphrases);
877	$doc_obj->add_metadata($thissection, "stems", $stems);
878	}
879	} #end of kea
880
881	if ($self->{'first'}) {
882	my $thissection = $doc_obj->get_top_section();
883	while (defined $thissection) {
884	my $text = $doc_obj->get_text($thissection);
885	$self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
886	$thissection = $doc_obj->get_next_section ($thissection);
887	}
888	}
889
890	if ($self->{'extract_acronyms'}) {
891	my $thissection = $doc_obj->get_top_section();
892	while (defined $thissection) {
893	my $text = $doc_obj->get_text($thissection);
894	$self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./;
895	$thissection = $doc_obj->get_next_section ($thissection);
896	}
897	}
898
899	if ($self->{'markup_acronyms'}) {
900	my $thissection = $doc_obj->get_top_section();
901	while (defined $thissection) {
902	my $text = $doc_obj->get_text($thissection);
903	$text = $self->markup_acronyms ($text, $doc_obj, $thissection);
904	$doc_obj->delete_text($thissection);
905	$doc_obj->add_text($thissection, $text);
906	$thissection = $doc_obj->get_next_section ($thissection);
907	}
908	}
909
910	if($self->{'date_extract'}) {
911	my $thissection = $doc_obj->get_top_section();
912	while (defined $thissection) {
913
914	my $text = $doc_obj->get_text($thissection);
915	&DateExtract::get_date_metadata($text, $doc_obj,
916	$thissection,
917	$self->{'no_biblio'},
918	$self->{'max_year'},
919	$self->{'max_century'});
920	$thissection = $doc_obj->get_next_section ($thissection);
921	}
922	}
923	}
924
925	# extract acronyms from a section in a document. progress is
926	# reported to outhandle based on the verbosity. both the Acronym
927	# and the AcronymKWIC metadata items are created.
928
929	sub extract_acronyms {
930	my $self = shift (@_);
931	my ($textref, $doc_obj, $thissection) = @_;
932	my $outhandle = $self->{'outhandle'};
933
934	# print $outhandle " extracting acronyms ...\n"
935	&gsprintf($outhandle, " {BasPlug.extracting_acronyms}...\n")
936	if ($self->{'verbosity'} > 2);
937
938	my $acro_array = &acronym::acronyms($textref);
939
940	foreach my $acro (@$acro_array) {
941
942	#check that this is the first time ...
943	my $seen_before = "false";
944	my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
945	foreach my $thisAcro (@$previous_data) {
946	if ($thisAcro eq $acro->to_string()) {
947	$seen_before = "true";
948	# print $outhandle " already seen ". $acro->to_string() . "\n"
949	&gsprintf($outhandle, " {BasPlug.already_seen} " . $acro->to_string() . "\n")
950	if ($self->{'verbosity'} >= 4);
951	}
952	}
953
954	if ($seen_before eq "false") {
955	#write it to the file ...
956	$acro->write_to_file();
957
958	#do the normal acronym
959	$doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string());
960	# print $outhandle " adding ". $acro->to_string() . "\n"
961	&gsprintf($outhandle, " {BasPlug.adding} " . $acro->to_string() . "\n")
962	if ($self->{'verbosity'} > 3);
963	}
964	}
965
966	# print $outhandle " done extracting acronyms. \n"
967	&gsprintf($outhandle, " {BasPlug.done_acronym_extract}\n")
968	if ($self->{'verbosity'} > 2);
969	}
970
971	sub markup_acronyms {
972	my $self = shift (@_);
973	my ($text, $doc_obj, $thissection) = @_;
974	my $outhandle = $self->{'outhandle'};
975
976	# print $outhandle " marking up acronyms ...\n"
977	&gsprintf($outhandle, " {BasPlug.marking_up_acronyms}...\n")
978	if ($self->{'verbosity'} > 2);
979
980	#self is passed in to check for verbosity ...
981	$text = &acronym::markup_acronyms($text, $self);
982
983	# print $outhandle " done marking up acronyms. \n"
984	&gsprintf($outhandle, " {BasPlug.done_acronym_markup}\n")
985	if ($self->{'verbosity'} > 2);
986
987	return $text;
988	}
989
990	sub compile_stats {
991	my $self = shift(@_);
992	my ($stats) = @_;
993
994	$stats->{'num_processed'} += $self->{'num_processed'};
995	$stats->{'num_not_processed'} += $self->{'num_not_processed'};
996	$stats->{'num_archives'} += $self->{'num_archives'};
997
998	}
999
1000	sub associate_cover_image {
1001	my $self = shift(@_);
1002	my ($doc_obj, $filename) = @_;
1003
1004	$filename =~ s/\.[^\\\/\.]+$/\.jpg/;
1005	if (-e $filename) {
1006	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1007	} else {
1008	$filename =~ s/jpg$/JPG/;
1009	if (-e $filename) {
1010	$doc_obj->associate_file($filename, "cover.jpg", "image/jpeg");
1011	}
1012	}
1013	}
1014
1015	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: