source: trunk/gsdl/perllib/plugin.pm@ 6911

Last change on this file since 6911 was 6584, checked in by kjdon, 20 years ago

Fiddled around with segmenting for chinese text. Haven't changed how the
segmentation is done, or what character ranges are used.
But when it's done is now controlled by the collect.cfg. There is a new
option, separate_cjk, values true or false, default false. Segmentation
is only done if this is set to true. This is passed as a global option to
all plugins by the import.pl script, so the user just needs to add it
once to the config file, not as an option to all plugins.
The queryaction uses this option too to determine whether or not to segment
the query.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.6 KB
Line 
1###########################################################################
2#
3# plugin.pm -- functions to handle using plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package plugin;
27
28require util;
29use gsprintf;
30
# Running totals shared by the functions in this file.  read() increments
# 'num_not_processed' when no plugin accepts a file; write_stats() hands
# this hash to every plugin's compile_stats() and then reports all four
# counters.
my $stats = {
    'num_processed'     => 0,
    'num_blocked'       => 0,
    'num_not_processed' => 0,
    'num_archives'      => 0,
};
36
37
# Local shorthand for gsprintf::gsprintf: every argument is forwarded
# unchanged and whatever that call returns is handed back to the caller.
sub gsprintf {
    return &gsprintf::gsprintf(@_);
}
42
# Locate, load and instantiate every plugin named in $plugin_list.
#
# Arguments:
#   $plugin_list   - reference to a list of array references; in each one
#                    the first element is the plugin name and the remaining
#                    elements are the options given to its constructor.
#                    Entries whose name is undefined are skipped.
#   $verbosity     - verbosity level (2 when undefined).
#   $outhandle     - handle for general messages (STDERR when undefined).
#   $failhandle    - handle for failure messages (STDERR when undefined).
#   $globaloptions - reference to a list of options that should be passed
#                    to all plugins; they are appended after each plugin's
#                    own options.
#
# $verbosity, $outhandle, $failhandle and $globaloptions are stored in
# package globals on purpose: read() reports through them later.
#
# Neither $plugin_list nor $globaloptions is modified; the quoting needed
# for the constructor call is done on private copies, so the same lists can
# safely be passed in again.
#
# Returns a reference to the list of plugin objects, each already
# initialised with init($verbosity, $outhandle, $failhandle).  Dies if a
# plugin file cannot be found or a plugin object cannot be constructed.
sub load_plugins {
    my ($plugin_list) = shift @_;
    ($verbosity, $outhandle, $failhandle, $globaloptions) = @_; # globals
    my @plugin_objects = ();

    $verbosity = 2 unless defined $verbosity;
    $outhandle = STDERR unless defined $outhandle;
    $failhandle = STDERR unless defined $failhandle;

    # Quote copies of the global options rather than the caller's array.
    my @global_opts = defined $globaloptions ? @$globaloptions : ();
    my $globals = join(",", map { "\"$_\"" } @global_opts);

    foreach my $pluginoptions (@{ $plugin_list || [] }) {
        # Work on a copy so the caller's list keeps its plugin name.
        my ($pluginname, @plugin_opts) = @{ $pluginoptions || [] };
        next unless defined $pluginname;

        # find the plugin: a collection-specific copy takes precedence
        # over the one in the main installation
        my $colplugname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "perllib/plugins",
                                              "${pluginname}.pm");
        my $mainplugname = &util::filename_cat($ENV{'GSDLHOME'}, "perllib/plugins",
                                               "${pluginname}.pm");
        if (-e $colplugname) {
            require $colplugname;
        }
        elsif (-e $mainplugname) {
            require $mainplugname;
        }
        else {
            # Always abort here, whatever gsprintf happens to return.
            &gsprintf(STDERR, "{plugin.could_not_find_plugin}\n", $pluginname);
            die "\n";
        }

        # create a plugin object
        my ($plugobj);
        my $options = join(",", map { "\"$_\"" } @plugin_opts);
        if ($globals) {
            $options .= "," if @plugin_opts;
            $options .= "$globals";
        }
        # The options end up inside double-quoted strings in the eval below,
        # so stop '$' and '@' from being interpolated as variables.
        $options =~ s/\$/\\\$/g;
        $options =~ s/\@/\\\@/g;

        # NOTE(review): this is a string eval over option text that comes
        # from outside this file.  It is kept because existing option values
        # may rely on double-quote escape processing; an option containing a
        # double quote will still break the generated code.
        eval ("\$plugobj = new \$pluginname($options)");
        die "$@" if $@;

        # initialize plugin
        $plugobj->init($verbosity, $outhandle, $failhandle);

        # add this object to the list
        push (@plugin_objects, $plugobj);
    }

    return \@plugin_objects;
}
95
96
# Call begin() on every plugin object in @$pluginfo, in list order, passing
# each one the same ($pluginfo, $base_dir, $processor, $maxdocs) arguments.
# The values returned by the individual begin() calls are returned as a
# list (or their count in scalar context).
sub begin {
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    return map {
        my $plugin = $_;
        $plugin->begin($pluginfo, $base_dir, $processor, $maxdocs);
    } @$pluginfo;
}
102
# Offer $file to each plugin in @$pluginfo in turn until one handles it.
#
# Arguments:
#   $pluginfo  - reference to the list of plugin objects.
#   $base_dir, $file, $metadata, $processor, $aux
#              - forwarded unchanged to each plugin's read() method.
#   $maxdocs   - forwarded to the plugins; replaced by -1 when it is
#                undefined or contains no digit.
#   $gli       - when true, a "<File n='...'>" line is printed to STDERR
#                before processing (0 when undefined).
#
# Uses the package globals $verbosity, $outhandle and $failhandle that
# load_plugins() sets.
#
# Returns the first defined value returned by a plugin's read().  When
# every plugin returns undef, the failure is reported, the
# 'num_not_processed' counter is incremented and 0 is returned.  Dies if a
# ".kill" file exists in $ENV{'GSDLCOLLECTDIR'}.
sub read {
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli, $aux) = @_;

    $maxdocs = -1 unless defined $maxdocs && $maxdocs =~ /\d/;
    $gli = 0 unless defined $gli;

    my $rv = 0;

    # Announce to GLI that we are handling a file
    print STDERR "<File n='$file'>\n" if $gli;

    # the .kill file is a handy (if not very elegant) way of aborting
    # an import.pl or buildcol.pl process
    if (-e &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, ".kill")) {
        &gsprintf($outhandle, "{plugin.kill_file}\n");
        die "\n";
    }

    # pass this file by each of the plugins in turn until one
    # is found which will process it; the loop variable is lexical so it
    # no longer lives in a package global
    foreach my $plugobj (@$pluginfo) {
        $rv = $plugobj->read($pluginfo, $base_dir, $file,
                             $metadata, $processor, $maxdocs, $gli, $aux);
        return $rv if defined $rv;
    }

    # nobody wanted the file: warn on the output handle ...
    if ($verbosity >= 2) {
        &gsprintf($outhandle, "{plugin.no_plugin_could_process}\n", $file);
    }

    # ... and log just the file name (directory part stripped) to the
    # failure handle
    $file =~ s/.*?([^\\\/]+)$/$1/;
    # NOTE(review): $file is interpolated into the gsprintf message string
    # here, unlike the call above where it is passed as an argument --
    # confirm that file names containing '{' or '%' are handled as intended.
    &gsprintf($failhandle, "$file: {plugin.no_plugin_could_process_this_file}\n");
    $stats->{'num_not_processed'} ++;

    return 0;
}
142
# Write out some general stats that the plugins have compiled.  Note that
# the buildcol.pl process doesn't currently call this, so the stats are
# only output after import.pl.
#
# Arguments:
#   $pluginfo    - reference to the list of plugin objects; each one is
#                  handed the shared counters through compile_stats().
#   $statshandle - handle the summary lines are written to.
#   $faillog     - passed to the {plugin.see_faillog} message, which is
#                  written only when at least one document was rejected.
#   $gli         - when true, an "<ImportComplete ...>" line carrying the
#                  counters is also printed to STDERR (0 when undefined).
#
# The number of documents "considered" is the sum of the processed,
# blocked and not-processed counters.  Each message has a singular and a
# plural dictionary entry, selected by whether the count equals 1.
sub write_stats {
    my ($pluginfo, $statshandle, $faillog, $gli) = @_;

    $gli = 0 unless defined $gli;

    # let every plugin add its figures to the shared counters; the loop
    # variable is lexical so it no longer lives in a package global
    foreach my $plugobj (@$pluginfo) {
        $plugobj->compile_stats($stats);
    }

    my $total = $stats->{'num_processed'} + $stats->{'num_blocked'} +
        $stats->{'num_not_processed'};

    print STDERR "<ImportComplete considered='$total' processed='$stats->{'num_processed'}' blocked='$stats->{'num_blocked'}' ignored='$stats->{'num_not_processed'}'>\n" if $gli;

    # documents considered for processing
    if ($total == 1) {
        &gsprintf($statshandle, "* {plugin.one_considered}\n");
    }
    else {
        &gsprintf($statshandle, "* {plugin.n_considered}\n", $total);
    }

    # contents of ZIP/TAR archives, mentioned only when there were any
    if ($stats->{'num_archives'}) {
        if ($stats->{'num_archives'} == 1) {
            &gsprintf($statshandle, " ({plugin.including_archive})\n");
        }
        else {
            &gsprintf($statshandle, " ({plugin.including_archives})\n", $stats->{'num_archives'});
        }
    }

    # documents processed and included in the collection
    if ($stats->{'num_processed'} == 1) {
        &gsprintf($statshandle, "* {plugin.one_included}\n");
    }
    else {
        &gsprintf($statshandle, "* {plugin.n_included}\n", $stats->{'num_processed'});
    }

    # rejected documents, followed by a pointer to the fail log
    if ($stats->{'num_not_processed'}) {
        if ($stats->{'num_not_processed'} == 1) {
            &gsprintf($statshandle, "* {plugin.one_rejected}\n");
        }
        else {
            &gsprintf($statshandle, "* {plugin.n_rejected}\n", $stats->{'num_not_processed'});
        }
        &gsprintf($statshandle, " {plugin.see_faillog}\n", $faillog);
    }
}
198
# Call end($processor) on every plugin object in @$pluginfo, in list order.
# The values returned by the individual end() calls are returned as a list
# (or their count in scalar context).
sub end {
    my ($pluginfo, $processor) = @_;

    return map {
        my $plugin = $_;
        $plugin->end($processor);
    } @$pluginfo;
}
203
2041;
Note: See TracBrowser for help on using the repository browser.