Context Navigation

source: trunk/gsdl/perllib/plugins/SplitPlug.pm@ 2492

Last change on this file since 2492 was 2492, checked in by paynter, 23 years ago
Fixed trivial bug in the new set_OID function.
Property svn:keywords set to `Author Date Id Revision`
File size: 6.4 KB

Line
1	###########################################################################
2	#
3	# SplitPlug.pm - a plugin for splitting input files into segments that
4	# will then be individually processed.
5	#
6	#
7	# Copyright 2000 Gordon W. Paynter ([email protected])
8	# Copyright 2000 The New Zealand Digital Library Project
9	#
10	# A component of the Greenstone digital library software
11	# from the New Zealand Digital Library Project at the
12	# University of Waikato, New Zealand.
13	#
14	# This program is free software; you can redistribute it and/or modify
15	# it under the terms of the GNU General Public License as published by
16	# the Free Software Foundation; either version 2 of the License, or
17	# (at your option) any later version.
18	#
19	# This program is distributed in the hope that it will be useful,
20	# but WITHOUT ANY WARRANTY; without even the implied warranty of
21	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	# GNU General Public License for more details.
23	#
24	# You should have received a copy of the GNU General Public License
25	# along with this program; if not, write to the Free Software
26	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	#
28	###########################################################################
29
30
31	# SplitPlug is a plugin for splitting input files into segments that will
32	# then be individually processed.
33
34	# This plugin should not be called directly. Instead, if you need to
35	# process input files that contain several documents, you should write a
36	# plugin with a process function that will handle one of those documents
37	# and have it inherit from SplitPlug. See ReferPlug for an example.
38
39
40	package SplitPlug;
41
42	use BasPlug;
43	use util;
44
45
46	# SplitPlug is a sub-class of BasPlug.
47	sub BEGIN {
48	@ISA = ('BasPlug');
49	}
50
51	sub new {
52	my ($class) = @_;
53	$self = new BasPlug($class, @_);
54
55	if (!parsargv::parse(\@_,
56	q^split_exp/.*/^, \$self->{'split_exp'},
57	"allow_extra_options")) {
58	print STDERR "\nIncorrect options passed to $class.";
59	print STDERR "\nCheck your collect.cfg configuration file\n";
60	die "\n";
61	}
62
63	return bless $self, $class;
64	}
65
66	sub init {
67	my $self = shift (@_);
68	my ($verbosity, $outhandle) = @_;
69
70	$self->BasPlug::init($verbosity, $outhandle);
71
72	if ((!defined $self->{'process_exp'}) \|\| ($self->{'process_exp'} eq "")) {
73
74	$self->{'process_exp'} = $self->get_default_process_exp ();
75	if ($self->{'process_exp'} eq "") {
76	warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
77	}
78	}
79
80
81	# set split_exp to default unless explicitly set
82	if (!$self->{'split_exp'}) {
83	$self->{'split_exp'} = $self->get_default_split_exp ();
84	}
85
86	}
87
88	# This plugin recurs over the segments it finds
89	sub is_recursive {
90	return 1;
91	}
92
93	# By default, we split the input text at blank lines
94	sub get_default_split_exp {
95	return q^\n\s*\n^;
96	}
97
98
99	# The read function opens a file and splits it into parts.
100	# Each part is sent to the process function
101	#
102	# Returns: Number of document objects created (or undef if it fails)
103
104	sub read {
105	my $self = shift (@_);
106	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
107	my $outhandle = $self->{'outhandle'};
108	my $verbosity = $self->{'verbosity'};
109
110	# Figure out the exact filename of this file (and maybe block it)
111	my $filename = &util::filename_cat($base_dir, $file);
112	my $block_exp = $self->{'block_exp'};
113	return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
114	if ($filename !~ /$self->{'process_exp'}/ \|\| !-f $filename) {
115	return undef;
116	}
117	my $plugin_name = ref ($self);
118	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
119
120	my ($language, $encoding);
121	if ($self->{'input_encoding'} eq "auto") {
122	# use textcat to automatically work out the input encoding and language
123	($language, $encoding) = $self->get_language_encoding ($filename);
124
125	} elsif ($self->{'extract_language'}) {
126	# use textcat to get language metadata
127	($language, $extracted_encoding) = $self->get_language_encoding ($filename);
128	$encoding = $self->{'input_encoding'};
129
130	if ($extracted_encoding ne $encoding && $self->{'verbosity'}) {
131	print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
132	print $outhandle "appears to be encoded as $extracted_encoding.\n";
133	}
134
135	} else {
136	$language = $self->{'default_language'};
137	$encoding = $self->{'input_encoding'};
138	}
139
140	# Read in file ($text will be in utf8)
141	my $text = "";
142	$self->read_file ($filename, $encoding, \$text);
143
144	if ($text !~ /\w/) {
145	my $outhandle = $self->{'outhandle'};
146	print $outhandle "$plugin_name: ERROR: $file contains no text\n"
147	if $self->{'verbosity'};
148	return 0;
149	}
150
151
152	# Split the text into several smaller segments
153	my $split_exp = $self->{'split_exp'};
154	my @segments = split(/$split_exp/, $text);
155	print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
156	if $self->{'verbosity'};
157
158	# Process each segment in turn
159	my ($count, $segment, $segtext, $status, $id);
160	$segment = 0;
161	$count = 0;
162	foreach $segtext (@segments) {
163	$segment++;
164
165	# create a new document
166	my $doc_obj = new doc ($filename, "indexed_doc");
167	$doc_obj->set_OIDtype ($processor->{'OIDtype'});
168	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
169	$doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
170
171	# Calculate a "base" document ID.
172	if (!defined $id) {
173	$doc_obj->set_OID();
174	$id = $doc_obj->get_OID();
175	}
176
177	# include any metadata passed in from previous plugins
178	# note that this metadata is associated with the top level section
179	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
180
181	# do plugin specific processing of doc_obj
182	print $outhandle "segment $segment - ";
183	$status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
184	if (!defined $status) {
185	print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
186	if ($verbosity >= 2);
187	next;
188	}
189	$count += $status;
190
191	# do any automatic metadata extraction
192	$self->auto_extract_metadata ($doc_obj);
193
194	# add an OID
195	$self->set_OID($doc_obj, $id, $segment);
196
197
198	# process the document
199	$processor->process($doc_obj);
200	}
201
202	# Return number of document objects produced
203	return $count;
204	}
205
206	sub set_OID {
207	my $self = shift (@_);
208	my ($doc_obj, $id, $segment_number) = @_;
209
210	$doc_obj->set_OID($id . "s" . $segment_number);
211	}
212
213	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: