source: trunk/gsdl/perllib/plugins/SplitPlug.pm@ 1676

Last change on this file since 1676 was 1676, checked in by paynter, 23 years ago

Plugins for processing files of bibliography records in BibTex and Refer
format. SplitPlug is a plugin for splitting one text file into many
Greenstone documents. ReferPlug and BibTexPlug (which both inherit from
SplitPlug) are for processing individual Refer and BibTex records
respectively.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.0 KB
RevLine 
[1676]1###########################################################################
2#
3# SplitPlug.pm - a plugin for splitting input files into segments that
4# will then be individually processed.
5#
6#
7# Copyright 2000 Gordon W. Paynter ([email protected])
8# Copyright 2000 The New Zealand Digital Library Project
9#
10# A component of the Greenstone digital library software
11# from the New Zealand Digital Library Project at the
12# University of Waikato, New Zealand.
13#
14# This program is free software; you can redistribute it and/or modify
15# it under the terms of the GNU General Public License as published by
16# the Free Software Foundation; either version 2 of the License, or
17# (at your option) any later version.
18#
19# This program is distributed in the hope that it will be useful,
20# but WITHOUT ANY WARRANTY; without even the implied warranty of
21# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22# GNU General Public License for more details.
23#
24# You should have received a copy of the GNU General Public License
25# along with this program; if not, write to the Free Software
26# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27#
28###########################################################################
29
30
31# SplitPlug is a plugin for splitting input files into segments that will
32# then be individually processed.
33
34# This plugin should not be called directly. Instead, if you need to
35# process input files that contain several documents, you should write a
36# plugin with a process function that will handle one of those documents
37# and have it inherit from SplitPlug. See ReferPlug for an example.
38
39
40package SplitPlug;
41
42use BasPlug;
43use util;
44
45
# Inheritance: SplitPlug derives from BasPlug. The assignment is done in a
# BEGIN block so that @ISA is populated at compile time.
BEGIN {
    @ISA = qw(BasPlug);
}
50
# Constructor.
#
# Builds the underlying BasPlug object, then parses the argument list for
# the "split_exp" option (validated against /.*/) and stores its value in
# $self->{'split_exp'}. "allow_extra_options" is passed to parsargv::parse;
# presumably it lets options intended for other classes pass through
# without error -- confirm against parsargv.
#
# Arguments: the class name followed by the plugin options.
# Returns:   the new object, blessed into $class.
# Dies:      if parsargv::parse reports that the options are invalid.
sub new {
    my ($class) = @_;

    # $self must be lexical: without "my" every construction overwrote the
    # package global $SplitPlug::self.
    #
    # NOTE(review): $class is not shifted off @_, so BasPlug::new (and
    # parsargv::parse below) see the class name as part of the option
    # list. This is kept as-is for compatibility -- confirm against
    # BasPlug::new before changing it.
    my $self = BasPlug->new($class, @_);

    if (!parsargv::parse(\@_,
                         q^split_exp/.*/^, \$self->{'split_exp'},
                         "allow_extra_options")) {
        print STDERR "\nIncorrect options passed to $class.";
        print STDERR "\nCheck your collect.cfg configuration file\n";
        die "\n";
    }

    return bless $self, $class;
}
65
# Initialise the plugin.
#
# Delegates to BasPlug::init with the given verbosity and output handle,
# then makes sure a split expression is available: if none was supplied
# (or it is empty/false), the value from get_default_split_exp() is used.
sub init {
    my ($self, $verbosity, $outhandle) = @_;

    $self->BasPlug::init($verbosity, $outhandle);

    # Fall back to the default split expression when none has been set.
    if (not $self->{split_exp}) {
        $self->{split_exp} = $self->get_default_split_exp();
    }

}
78
# Report that this plugin is recursive: it recurs over the segments it
# finds in an input file. Always returns 1.
sub is_recursive {
    my $recursive = 1;
    return $recursive;
}
83
# Default split expression: the input text is divided at blank lines,
# i.e. wherever a newline is followed by optional whitespace and another
# newline. The value is returned as a regular-expression source string.
sub get_default_split_exp {
    my $blank_line_exp = '\n\s*\n';
    return $blank_line_exp;
}
88
89
# The read function opens a file and splits it into parts.
# Each part is sent to the process function.
#
# Arguments (after $self):
#   $pluginfo, $base_dir, $file, $metadata
#               - used to locate the file and passed on to process()
#   $processor  - object whose process() method is handed each finished
#                 document object
#   $maxdocs    - accepted for interface compatibility; not used here
#
# Returns: the sum of the status values returned by process() for the
#          segments (the number of document objects created),
#          0 if the file matches block_exp or contains no word characters,
#          undef if the file does not match process_exp or is not a
#          plain file.

sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    # Figure out the exact filename of this file (and maybe block it)
    my $filename = util::filename_cat($base_dir, $file);
    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        # Kept as an explicit undef (rather than a bare return) so callers
        # see exactly the value they always have.
        return undef;
    }
    my $plugin_name = ref ($self);
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # Read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, \$text);

    if ($text !~ /\w/) {
        print $outhandle "$plugin_name: ERROR: $file contains no text\n"
            if $verbosity;
        return 0;
    }

    # Split the text into several smaller segments
    my $split_exp = $self->{'split_exp'};
    my @segments = split(/$split_exp/, $text);
    print $outhandle "SplitPlug found " . (scalar @segments) . " documents in $filename\n"
        if $verbosity;

    # Process each segment in turn
    my $segment = 0;   # 1-based number of the segment being processed
    my $count = 0;     # running total of the statuses returned by process()
    my $id;            # base OID, calculated once from the first document
    foreach my $segtext (@segments) {
        $segment++;

        # create a new document
        my $doc_obj = doc->new($filename, "indexed_doc");

        # Calculate a "base" document ID.
        if (!defined $id) {
            $doc_obj->set_OID();
            $id = $doc_obj->get_OID();
        }

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

        # do plugin specific processing of doc_obj
        # (progress output is gated on verbosity like every other message,
        # so a verbosity of 0 is silent)
        print $outhandle "segment $segment - "
            if $verbosity;
        my $status = $self->process (\$segtext, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
        if (!defined $status) {
            print $outhandle "WARNING - no plugin could process segment $segment of $file\n"
                if ($verbosity >= 2);
            next;
        }
        $count += $status;

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # add an OID: the base ID plus "s" and the segment number, so every
        # segment of the file gets a distinct identifier
        $doc_obj->set_OID($id . "s" . $segment);

        # process the document
        $processor->process($doc_obj);
    }

    # Return number of document objects produced
    return $count;
}
171
172
1731;
Note: See TracBrowser for help on using the repository browser.