Context Navigation

source: trunk/gsdl/perllib/plugins/SplitPlug.pm@ 2484

Last change on this file since 2484 was 2484, checked in by say1, 23 years ago
Changed SplitPlug to allow control over the OID. Changed BibTexPlug to be more permissive in the Bibtex format it accepts. Changed BibTexPlug to use the BibTex key as the OID.
Property svn:keywords set to `Author Date Id Revision`
File size: 6.4 KB

Line
1	###########################################################################
2	#
3	# SplitPlug.pm - a plugin for splitting input files into segments that
4	# will then be individually processed.
5	#
6	#
7	# Copyright 2000 Gordon W. Paynter ([email protected])
8	# Copyright 2000 The New Zealand Digital Library Project
9	#
10	# A component of the Greenstone digital library software
11	# from the New Zealand Digital Library Project at the
12	# University of Waikato, New Zealand.
13	#
14	# This program is free software; you can redistribute it and/or modify
15	# it under the terms of the GNU General Public License as published by
16	# the Free Software Foundation; either version 2 of the License, or
17	# (at your option) any later version.
18	#
19	# This program is distributed in the hope that it will be useful,
20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	# GNU General Public License for more details.
23	#
24	# You should have received a copy of the GNU General Public License
25	# along with this program; if not, write to the Free Software
26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	#
28	###########################################################################
29
30
31	# SplitPlug is a plugin for splitting input files into segments that will
32	# then be individually processed.
33
34	# This plugin should not be called directly. Instead, if you need to
35	# process input files that contain several documents, you should write a
36	# plugin with a process function that will handle one of those documents
37	# and have it inherit from SplitPlug. See ReferPlug for an example.
38
39
40	package SplitPlug;
41
42	use BasPlug;
43	use util;
44
45
# Inheritance: SplitPlug derives from BasPlug.  The assignment sits in a
# BEGIN block so that @ISA is populated at compile time.
BEGIN {
    @ISA = qw(BasPlug);
}
50
# Constructor.
#
# Builds the underlying BasPlug object, then parses the plugin options for
# "split_exp" (any string matching /.*/) and stores the value in
# $self->{'split_exp'}.  "allow_extra_options" is passed to the option
# parser -- presumably so that options belonging to the inheriting plugin
# are tolerated here; confirm against parsargv.
#
# Arguments: the class name followed by the plugin's option list.
# Returns:   the new object, blessed into $class.
# Dies:      with "\n" (after printing a message to STDERR) if
#            parsargv::parse reports failure.
sub new {
    my ($class) = @_;

    # $self must be lexical.  It used to be an undeclared package global,
    # which every construction of a SplitPlug-derived plugin would share
    # and overwrite.
    #
    # Arrow syntax is used rather than "new BasPlug (...)" because indirect
    # object syntax is ambiguous inside a package that defines its own new().
    #
    # Note that @_ still holds $class as its first element, so BasPlug's
    # constructor receives the class name followed by the full original
    # argument list -- this matches what the code has always passed.
    my $self = BasPlug->new($class, @_);

    if (!parsargv::parse(\@_,
                         q^split_exp/.*/^, \$self->{'split_exp'},
                         "allow_extra_options")) {
        print STDERR "\nIncorrect options passed to $class.";
        print STDERR "\nCheck your collect.cfg configuration file\n";
        die "\n";
    }

    return bless $self, $class;
}
65
# Initialise the plugin after construction.
#
# Delegates to BasPlug::init with the verbosity and output handle, then
# fills in defaults for the two expressions this plugin relies on:
#   process_exp - taken from get_default_process_exp() when undefined or
#                 empty; a warning is issued if the default is empty too.
#   split_exp   - taken from get_default_split_exp() when not already set
#                 to a true value.
sub init {
    my ($self, $verbosity, $outhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle);

    my $configured_exp = $self->{'process_exp'};
    unless (defined $configured_exp && $configured_exp ne "") {
        $self->{'process_exp'} = $self->get_default_process_exp();
        warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n"
            if $self->{'process_exp'} eq "";
    }

    # set split_exp to default unless explicitly set
    if (!$self->{'split_exp'}) {
        $self->{'split_exp'} = $self->get_default_split_exp();
    }
}
87
# SplitPlug recurses over the segments it finds, so it always reports
# itself as recursive (returns 1).
sub is_recursive { return 1; }
92
# Default segment separator: the input text is split at blank lines, i.e.
# a newline, optional whitespace, and another newline.  The value is
# returned as regular-expression source text (backslashes are literal).
sub get_default_split_exp {
    my $blank_line_exp = '\n\s*\n';
    return $blank_line_exp;
}
97
98
# The read function opens a file and splits it into parts.
# Each part is sent to the process function.
#
# Arguments (after $self):
#   $pluginfo  - passed through unchanged to process()
#   $base_dir  - directory joined with $file to give the full filename
#   $file      - file name relative to $base_dir
#   $metadata  - metadata from previous plugins; attached to the top-level
#                section of every document created here
#   $processor - supplies the OIDtype and receives each finished document
#                through its process() method
#   $maxdocs   - accepted for interface compatibility; not used here
#
# Returns: the sum of the status values returned by process() for the
#          segments (number of document objects created); 0 if the file
#          is blocked by block_exp or contains no text; undef if the file
#          does not match process_exp or is not a plain file.
sub read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # Figure out the exact filename of this file (and maybe block it)
    my $filename = util::filename_cat($base_dir, $file);
    my $block_exp = $self->{'block_exp'};
    return 0 if defined $block_exp && $block_exp ne "" && $filename =~ /$block_exp/;
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        # "return undef" (not a bare return) is kept deliberately so that
        # callers see exactly the value they always have.
        return undef;
    }
    my $plugin_name = ref($self);
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    my ($language, $encoding);
    if ($self->{'input_encoding'} eq "auto") {
        # use textcat to automatically work out the input encoding and language
        ($language, $encoding) = $self->get_language_encoding($filename);

    } elsif ($self->{'extract_language'}) {
        # use textcat to get language metadata; the encoding it reports is
        # only used for the sanity-check warning below.  It is declared
        # lexically here -- it used to be an undeclared package global.
        my $extracted_encoding;
        ($language, $extracted_encoding) = $self->get_language_encoding($filename);
        $encoding = $self->{'input_encoding'};

        if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
            print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
            print $outhandle "appears to be encoded as $extracted_encoding.\n";
        }

    } else {
        $language = $self->{'default_language'};
        $encoding = $self->{'input_encoding'};
    }

    # Read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($filename, $encoding, \$text);

    if ($text !~ /\w/) {
        print $outhandle "$plugin_name: ERROR: $file contains no text\n"
            if $self->{'verbosity'};
        return 0;
    }

    # Split the text into several smaller segments
    my $split_exp = $self->{'split_exp'};
    my @segments = split(/$split_exp/, $text);
    print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
        if $self->{'verbosity'};

    # Process each segment in turn
    my $segment = 0;   # 1-based number of the segment being processed
    my $count   = 0;   # running total of process() status values
    my $id;            # "base" OID, taken from the first document created
    foreach my $segtext (@segments) {
        $segment++;

        # create a new document
        # (arrow call rather than "new doc (...)": this package defines its
        # own new(), which makes indirect object syntax ambiguous)
        my $doc_obj = doc->new($filename, "indexed_doc");
        $doc_obj->set_OIDtype($processor->{'OIDtype'});
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);

        # Calculate a "base" document ID.  This happens once, on the first
        # document; every segment's final OID is derived from it.
        if (!defined $id) {
            $doc_obj->set_OID();
            $id = $doc_obj->get_OID();
        }

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

        # do plugin specific processing of doc_obj
        print $outhandle "segment $segment - ";
        my $status = $self->process(\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
        if (!defined $status) {
            print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
                if ($verbosity >= 2);
            next;
        }
        $count += $status;

        # do any automatic metadata extraction
        $self->auto_extract_metadata($doc_obj);

        # add an OID
        $self->set_OID($doc_obj, $id, $segment);

        # process the document
        $processor->process($doc_obj);
    }

    # Return number of document objects produced
    return $count;
}
205
# Assign the final OID to the document built for one segment.
#
# The OID is the base id, the letter "s", and the segment number, e.g.
# base "HASH01ab" and segment 3 give "HASH01abs3".  It is a separate
# method, apparently so that inheriting plugins can override how the OID
# is chosen (the change log mentions "control over the OID").
#
# Arguments (after $self):
#   $doc_obj        - document object whose OID is to be set
#   $id             - base document id shared by all segments of the file
#   $segment_number - number of this segment within the file
#
# Returns: whatever $doc_obj->set_OID returns.
sub set_OID {
    my $self = shift(@_);
    my ($doc_obj, $id, $segment_number) = @_;

    # Use the parameter that was unpacked above.  The body used to refer to
    # an undeclared (always undefined) $segment, so every segment of a file
    # received the same OID "<id>s".
    $doc_obj->set_OID($id . "s" . $segment_number);
}
213
214	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: