source: trunk/gsdl/perllib/plugins/SplitPlug.pm@ 1894

Last change on this file since 1894 was 1894, checked in by jrm21, 23 years ago

updated by copying BasPlug's new language/encoding stuff over for the read()
stuff.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.0 KB
Line 
1###########################################################################
2#
3# SplitPlug.pm - a plugin for splitting input files into segments that
4# will then be individually processed.
5#
6#
7# Copyright 2000 Gordon W. Paynter ([email protected])
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
31# SplitPlug is a plugin for splitting input files into segments that will
32# then be individually processed.
33
34# This plugin should not be called directly. Instead, if you need to
35# process input files that contain several documents, you should write a
36# plugin with a process function that will handle one of those documents
37# and have it inherit from SplitPlug. See ReferPlug for an example.
38
39
40package SplitPlug;
41
42use BasPlug;
43use util;
44
45
# SplitPlug derives from BasPlug.  The parent class is recorded in @ISA
# inside a BEGIN block so that it is already in place at compile time.
BEGIN {
    our @ISA = qw(BasPlug);
}
50
# Constructor.
#
# Builds the object through BasPlug's constructor and then lets
# parsargv::parse fill in the 'split_exp' setting from the plugin options.
#
# Arguments: the class name followed by the plugin options.
# Returns:   the new object, blessed into $class.
# Dies (after printing a message to STDERR) if parsargv::parse reports
# that the options are incorrect.
sub new {
    my ($class) = @_;

    # $self has to be a lexical: it used to be assigned without "my",
    # which overwrote the package-global $self on every construction.
    # @_ is passed on untouched (class name included), exactly as before.
    my $self = BasPlug->new($class, @_);

    if (!parsargv::parse(\@_,
                         q^split_exp/.*/^, \$self->{'split_exp'},
                         "allow_extra_options")) {
        print STDERR "\nIncorrect options passed to $class.";
        print STDERR "\nCheck your collect.cfg configuration file\n";
        die "\n";
    }

    return bless $self, $class;
}
65
# Initialise the plugin.
#
# Passes $verbosity and $outhandle on to BasPlug::init, then makes sure
# 'split_exp' holds a usable value: when nothing (or a false value) was
# configured, the result of get_default_split_exp is stored instead.
sub init {
    my ($self, $verbosity, $outhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle);

    # fall back to the default splitting expression when none was given
    if (!$self->{'split_exp'}) {
        $self->{'split_exp'} = $self->get_default_split_exp();
    }
}
78
# Reports this plugin as recursive (always returns 1): it recurs over
# the segments it finds in an input file.
sub is_recursive {
    my $recursive = 1;
    return $recursive;
}
83
# Default splitting expression: a newline, optional whitespace and
# another newline, i.e. the input text is split at blank lines.
# The value is returned as a pattern string, not a compiled regex.
sub get_default_split_exp {
    my $blank_line_pattern = '\n\s*\n';
    return $blank_line_pattern;
}
88
89
# The read function opens a file and splits it into parts using the
# 'split_exp' expression.  Each part becomes its own document object and
# is sent to the process function, then on to $processor.
#
# Arguments (after $self):
#   $pluginfo, $base_dir, $file, $metadata - passed through to process
#   $processor - object whose process method receives each finished document
#   $maxdocs   - accepted for interface compatibility; not used here
#
# Returns: undef if the file does not match process_exp or is not a
#          plain file; 0 if the file is blocked by block_exp or contains
#          no text; otherwise the sum of the status values returned by
#          process for the segments.

sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # Figure out the exact filename of this file (and maybe block it)
    my $filename = util::filename_cat($base_dir, $file);
    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    my $plugin_name = ref ($self);
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    my ($language, $encoding);
    if ($self->{'input_encoding'} eq "auto") {
        # use textcat to automatically work out the input encoding and language
        ($language, $encoding) = $self->get_language_encoding ($filename);

    } elsif ($self->{'extract_language'}) {
        # use textcat to get language metadata; the detected encoding is
        # only used for the warning below.  It is a lexical here - it
        # used to leak into an undeclared package global.
        my $extracted_encoding;
        ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
        $encoding = $self->{'input_encoding'};

        if ($extracted_encoding ne $encoding && $verbosity) {
            print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but ";
            print $outhandle "appears to be encoded as $extracted_encoding.\n";
        }

    } else {
        $language = $self->{'default_language'};
        $encoding = $self->{'input_encoding'};
    }

    # Read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, $encoding, \$text);

    if ($text !~ /\w/) {
        print $outhandle "$plugin_name: ERROR: $file contains no text\n"
            if $verbosity;
        return 0;
    }

    # Split the text into several smaller segments
    my $split_exp = $self->{'split_exp'};
    my @segments = split(/$split_exp/, $text);
    print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
        if $verbosity;

    # Process each segment in turn
    my $segment = 0;   # 1-based number of the segment being processed
    my $count = 0;     # running total of the status values from process
    my $id;            # "base" document ID, calculated once per file
    foreach my $segtext (@segments) {
        $segment++;

        # create a new document (explicit method call: this package has
        # its own "new", so the indirect-object form is best avoided)
        my $doc_obj = doc->new ($filename, "indexed_doc");
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);

        # Calculate a "base" document ID.
        if (!defined $id) {
            $doc_obj->set_OID();
            $id = $doc_obj->get_OID();
        }

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

        # do plugin specific processing of doc_obj
        # (this progress prefix is printed regardless of verbosity)
        print $outhandle "segment $segment - ";
        my $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
        if (!defined $status) {
            print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
                if ($verbosity >= 2);
            next;
        }
        $count += $status;

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # add an OID: the base ID plus "s" and the segment number
        $doc_obj->set_OID($id . "s" . $segment);

        # process the document
        $processor->process($doc_obj);
    }

    # Return number of document objects produced
    return $count;
}
194
195
1961;
Note: See TracBrowser for help on using the repository browser.