root/gsdl/trunk/perllib/plugins/SplitTextFile.pm @ 19493

Revision 19493, 9.5 KB (checked in by davidb, 11 years ago)

Introduction of new extrametafile to track which metadata.xml file a piece of metadata came from

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# SplitTextFile.pm - a plugin for splitting input files into segments that
4#                will then be individually processed.
5#
6#
7# Copyright 2000 Gordon W. Paynter (gwp@cs.waikato.ac.nz)
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
31# SplitTextFile is a plugin for splitting input files into segments that will
32# then be individually processed. 
33
34# This plugin should not be called directly.  Instead, if you need to
35# process input files that contain several documents, you should write a
36# plugin with a process function that will handle one of those documents
37# and have it inherit from SplitTextFile.  See ReferPlug for an example.
38
39
40package SplitTextFile;
41
42use ReadTextFile;
43use gsprintf 'gsprintf';
44use util;
45
46use strict;
47no strict 'refs'; # allow filehandles to be variables and viceversa
48
49# SplitTextFile is a sub-class of ReadTextFile
50sub BEGIN {
51    @SplitTextFile::ISA = ('ReadTextFile');
52}
53
54
55my $arguments =
56    [ { 'name' => "split_exp",
57    'desc' => "{SplitTextFile.split_exp}",
58    'type' => "regexp",
59    #'deft' => &get_default_split_exp(),
60    'deft' => "",
61    'reqd' => "no" } ];
62
63my $options = { 'name'     => "SplitTextFile",
64        'desc'     => "{SplitTextFile.desc}",
65        'abstract' => "yes",
66        'inherits' => "yes",
67            'args'     => $arguments };
68
69
70sub new {
71    my ($class) = shift (@_);
72    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
73    push(@$pluginlist, $class);
74
75    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
76    push(@{$hashArgOptLists->{"OptList"}},$options);
77
78    my $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
79
80    $self->{'textcat_store'} = {};
81    $self->{'metapass_srcdoc'} = {}; # which segments have valid metadata_srcdoc
82    return bless $self, $class;
83}
84
85sub init {
86    my $self = shift (@_);
87    my ($verbosity, $outhandle, $failhandle) = @_;
88
89    $self->ReadTextFile::init($verbosity, $outhandle, $failhandle);
90
91    # why is this is init and not in new??
92    if ((!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
93
94    $self->{'process_exp'} = $self->get_default_process_exp ();
95    if ($self->{'process_exp'} eq "") {
96        warn ref($self) . " Warning: plugin has no process_exp\n";
97    }
98    }
99
100
101    # set split_exp to default unless explicitly set
102    if (!$self->{'split_exp'}) {
103    $self->{'split_exp'} = $self->get_default_split_exp ();
104    }
105
106}
107
108# This plugin recurs over the segments it finds
109sub is_recursive {
110    return 1;
111}
112
113# By default, we split the input text at blank lines
114sub get_default_split_exp {
115    return q^\n\s*\n^;
116}
117
118sub metadata_read {
119    my $self = shift (@_); 
120    my ($pluginfo, $base_dir, $file, $block_hash,
121    $extrametakeys, $extrametadata, $extrametafile,
122    $processor, $maxdocs, $gli) = @_;
123
124    # returns 1 if matches process_exp, and has done blocking in the meantime
125    my $matched = $self->SUPER::metadata_read($pluginfo, $base_dir, $file,
126                          $block_hash,
127                          $extrametakeys,
128                          $extrametadata,
129                          $extrametafile,
130                          $processor, $maxdocs, $gli);
131    my $split_matched = undef;
132
133    if ($matched) {
134
135    my $outhandle = $self->{'outhandle'};
136    my $filename = &util::filename_cat($base_dir, $file);
137
138    my $plugin_name = ref ($self);
139    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
140
141    $self->{'metapass_srcdoc'}->{$file} = {};
142
143    # Do encoding stuff
144    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
145    my $le_rec = { 'language' => $language, 'encoding' => $encoding };
146    $self->{'textcat_store'}->{$file} = $le_rec;
147
148    # Read in file ($text will be in utf8)
149    my $text = "";
150    $self->read_file ($filename, $encoding, $language, \$text);
151
152 
153    if ($text !~ /\w/) {
154        gsprintf($outhandle, "$plugin_name: {ReadTextFile.file_has_no_text}\n",
155             $file)
156        if $self->{'verbosity'};
157       
158        my $failhandle = $self->{'failhandle'};
159        print $failhandle "$file: " . ref($self) . ": file contains no text\n";
160        $self->{'num_not_processed'} ++;
161
162        $self->{'textcat_store'}->{$file} = undef;
163
164        return 0;
165    }
166   
167   
168    # Split the text into several smaller segments
169    my $split_exp = $self->{'split_exp'};
170        my @tmp  = split(/$split_exp/i, $text);
171    my @segments =();
172    ## get rid of empty segments
173    foreach my $seg (@tmp){
174        if ($seg ne ""){
175        push @segments, $seg;
176        }
177    }
178
179    print $outhandle "SplitTextFile found " . (scalar @segments) . " documents in $filename\n"
180        if $self->{'verbosity'};
181   
182    $self->{'split_segments'}->{$file} = \@segments;
183    $split_matched = scalar(@segments);
184    }
185   
186    return $split_matched;
187}
188
189
190
191# The read function opens a file and splits it into parts.
192# Each part is sent to the process function
193#
194# Returns: Number of document objects created (or undef if it fails)
195
196sub read {
197    my $self = shift (@_);
198    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
199    my $outhandle = $self->{'outhandle'};
200    my $verbosity = $self->{'verbosity'};
201
202    # can we process this file??
203    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
204    return undef unless $self->can_process_this_file($filename_full_path);
205
206    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
207
208    my $le_rec = $self->{'textcat_store'}->{$file};
209    if (!defined $le_rec) {
210    # means no text was found;
211    return 0; # not processed but no point in passing it on
212    }
213
214    print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli);
215    print $outhandle "$self->{'plugin_type'} processing $file\n"
216        if $self->{'verbosity'} > 1;   
217
218    my $language = $le_rec->{'language'};
219    my $encoding = $le_rec->{'encoding'};
220    $self->{'textcat_store'}->{$file} = undef;
221
222    my $segments = $self->{'split_segments'}->{$file};
223    $self->{'split_segments'}->{$file} = undef;
224
225    # Process each segment in turn
226    my ($count, $segment, $segtext, $status, $id);
227    $segment = 0;
228    $count = 0;
229    foreach $segtext (@$segments) {
230        $segment++;
231
232    if (defined $self->{'metapass_srcdoc'}->{$file}->{$segment}) {
233        # metadata is attached to a srcdoc
234        next;
235    }
236
237    # create a new document
238    my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
239    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
240    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
241    my ($filemeta) = $file =~ /([^\\\/]+)$/;
242    $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
243    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
244    if ($self->{'cover_image'}) {
245        $self->associate_cover_image($doc_obj, $filename_full_path);
246    }
247    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
248    #$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "Split");
249
250    # Calculate a "base" document ID.
251    if (!defined $id) {
252        $id = $self->get_base_OID($doc_obj);
253    }
254   
255    # include any metadata passed in from previous plugins
256    # note that this metadata is associated with the top level section
257    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
258
259    # do plugin specific processing of doc_obj
260    print $outhandle "segment $segment\n" if ($self->{'verbosity'});
261    $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli);
262    if (!defined $status) {
263        print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
264        if ($verbosity >= 2);
265        next;
266    }
267    # If the plugin returned 0, it threw away this part
268    if ($status == 0) {
269        next;
270    }
271    $count += $status;
272
273    # do any automatic metadata extraction
274    $self->auto_extract_metadata ($doc_obj);
275
276    # add an OID
277    $self->add_OID($doc_obj, $id, $segment);
278
279    # process the document
280    $processor->process($doc_obj);
281
282    $self->{'num_processed'} ++;
283    }
284
285    delete $self->{'metapass_srcdoc'}->{$file};
286
287    # Return number of document objects produced
288    return $count;
289}
290
291sub get_base_OID {
292    my $self = shift(@_);
293    my ($doc_obj) = @_;
294
295    $self->SUPER::add_OID($doc_obj);
296    return $doc_obj->get_OID();
297}
298
299sub add_OID {
300    my $self = shift (@_);
301    my ($doc_obj, $id, $segment) = @_;
302
303    my $full_id = $id . "s" . $segment;
304    if ($self->{'OIDtype'} eq "assigned") {
305    my $identifier = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), $self->{'OIDmetadata'});
306    if (defined $identifier && $identifier ne "") {
307        $full_id = $identifier;
308        $full_id =~ s/\.//g; #remove any periods
309        if ($full_id =~ /^[\d]*$/) {
310        $full_id = "D" . $full_id;
311        print STDERR "OID only contains numbers, adding a D\n";
312        }
313    }
314    }
315    $doc_obj->set_OID($full_id);
316}
317
318
3191;
Note: See TracBrowser for help on using the browser.