source: trunk/gsdl/perllib/plugins/DBPlug.pm@ 10956

Last change on this file since 10956 was 10956, checked in by jrm21, 18 years ago

now catch and exit if we got an error while parsing/evaling any
metadata callback function.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1###########################################################################
2#
3# DBPlug.pm -- plugin to import records from a database
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2003 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# See <GSDLHOME>/etc/packages/example.dbi for an example config file!!
29#
30
31# Written by John McPherson for the NZDL project
32# Mar, Apr 2003
33
34package DBPlug;
35
36use strict;
37no strict 'refs'; # allow variable as a filehandle
38
39use BasPlug;
40use unicode;
41
42use DBI; # database independent stuff
43
44sub BEGIN {
45 @DBPlug::ISA = ('BasPlug');
46}
47
48my $arguments =
49 [ { 'name' => "process_exp",
50 'desc' => "{BasPlug.process_exp}",
51 'type' => "regexp",
52 'deft' => &get_default_process_exp(),
53 'reqd' => "no" }];
54
55my $options = { 'name' => "DBPlug",
56 'desc' => "{DBPlug.desc}",
57 'abstract' => "no",
58 'inherits' => "yes",
59 'args' => $arguments };
60
61sub new {
62 my ($class) = shift (@_);
63 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
64 push(@$pluginlist, $class);
65
66 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
67 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
68
69 my $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
70
71 return bless $self, $class;
72}
73
74sub get_default_process_exp {
75 my $self = shift (@_);
76
77 return q^(?i)\.dbi$^;
78}
79# we don't have a per-greenstone document process() function!
80sub process {
81
82}
83
84
85sub read {
86 my $self = shift (@_);
87 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
88
89 # see if we can handle the passed file...
90 my $filename = $file;
91 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
92 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
93 # this plugin can't process this file type...
94 return undef;
95 }
96
97 my $outhandle = $self->{'outhandle'};
98 my $verbosity = $self->{'verbosity'};
99
100 print $outhandle "DBPlug: processing $file\n"
101 if $self->{'verbosity'} > 1;
102
103 # calculate the document hash, for document ids
104 my $hash="";
105
106 my $osexe = &util::get_os_exe();
107 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
108 $ENV{'GSDLOS'},"hashfile$osexe");
109 if (-e "$hashfile_exe") {
110 $hash = `hashfile$osexe \"$filename\"`;
111 $hash =~ /:\s*([0-9a-f]+)/i;
112 $hash="HASH$1";
113 }
114
115
116 # default options - may be overridden by config file
117 my $language=undef;
118 my $encoding=undef;
119 my $dbplug_debug=0;
120 my $username='';
121 my $password='';
122
123 # these settings must be set by the config file:
124 my $db=undef;
125
126# get id of pages from "nonempty", get latest version number from "recent", and
127# then get pagename from "page" and content from "version" !
128
129 my $sql_query_prime = undef ;
130 my $sql_query = undef ;
131
132 my %db_to_greenstone_fields=();
133 my %callbacks=();
134
135 # read in config file.
136 if (!open (CONF, $filename)) {
137 print $outhandle "DBPlug: can't read $filename: $!\n";
138 return 0;
139 }
140 my $line;
141 my $statement="";
142 my $callback="";
143 while (defined($line=<CONF>)) {
144 chomp $line;
145 $line .= " "; # for multi-line statements - don't conjoin!
146 $line =~ s/\s*\#.*$//mg; # remove comments
147 $statement .= $line;
148
149 if ($line =~ /^\}\s*$/ && $callback) { # ends the callback
150 $callback .= $statement ; $statement = "";
151 # try to check that the function is "safe"
152 if ($callback =~ /\b(?:system|open|pipe|readpipe|qx|kill|eval|do|use|require|exec|fork)\b/ ||
153 $callback =~ /[\`]|\|\-/) {
154 # no backticks or functions that start new processes allowed
155 print $outhandle "DBPlug: bad function in callback\n";
156 return 0;
157 }
158 $callback =~ s/sub (\w+?)_callback/sub/;
159 my $fieldname = $1;
160 my $ret = eval "\$callbacks{'$fieldname'} = $callback ; 1";
161 if (!defined($ret)) {
162 print $outhandle "DBPlug: error eval'ing callback: $@\n";
163 exit(1);
164 }
165 $callback="";
166 print $outhandle "DBPlug: callback registered for '$fieldname'\n"
167 if $dbplug_debug;
168 } elsif ($callback) {
169 # add this line to the callback function
170 $callback .= $statement;
171 $statement = "";
172 } elsif ($statement =~ m/;\s*$/) { # ends with ";"
173 # check that it is safe
174 # assignment
175 if ($statement =~ m~(\$\w+)\s* = \s*
176 (\d # digits
177 | ".*?(?<!\\)" # " up to the next " not preceded by a \
178 | '.*?(?<!\\)' # ' up to the next ' not preceded by a \
179 )\s*;~x || # /x means ignore comments and whitespace in rx
180 $statement =~ m~(\%\w+)\s*=\s*(\([\w\s\"\',:=>]+\))\s*;~ ) {
181 # evaluate the assignment, return 1 on success "
182 if (!eval "$1=$2; 1") {
183 my $err=$@;
184 chomp $err;
185 $err =~ s/\.$//; # remove a trailing .
186 print $outhandle "DBPlug: error evaluating `$statement'\n";
187 print $outhandle " $err (in $filename)\n";
188 return 0; # there was an error reading the config file
189 }
190 } elsif ($statement =~ /sub \w+_callback/) {
191 # this is the start of a callback function definition
192 $callback = $statement;
193 $statement = "";
194 } else {
195 print $outhandle "DBPlug: skipping statement `$statement'\n";
196 }
197 $statement = "";
198 }
199 }
200 close CONF;
201
202 if (!defined($db)) {
203 print $outhandle "DBPlug: error: $filename does not specify a db!\n";
204 return 0;
205 }
206 if (!defined($sql_query)) {
207 print $outhandle "DBPlug: error: no SQL query specified!\n";
208 return 0;
209 }
210 # connect to database
211 my $dbhandle=DBI->connect($db, $username, $password);
212
213 if (!defined($dbhandle)) {
214 die "DBPlug: could not connect to database, exiting.\n";
215 }
216 if (defined($dbplug_debug) && $dbplug_debug==1) {
217 print $outhandle "DBPlug (debug): connected ok\n";
218 }
219
220 my $statement_hand;
221
222 # The user gave 2 sql statements to execute?
223 if ($sql_query_prime) {
224 $statement_hand=$dbhandle->prepare($sql_query_prime);
225 $statement_hand->execute;
226 if ($statement_hand->err) {
227 print $outhandle "Error: " . $statement_hand->errstr . "\n";
228 return undef;
229 }
230 }
231
232 $statement_hand=$dbhandle->prepare($sql_query);
233 $statement_hand->execute;
234 if ($statement_hand->err) {
235 print $outhandle "Error: " . $statement_hand->errstr . "\n";
236 return undef;
237 }
238
239 # get the array-ref for the field names and cast it to array
240 my @field_names;
241 @field_names=@{ $statement_hand->{NAME} };
242
243 foreach my $fieldname (@field_names) {
244 if (defined($db_to_greenstone_fields{$fieldname})) {
245 if (defined($dbplug_debug) && $dbplug_debug==1) {
246 print $outhandle "DBPlug (debug): mapping db field "
247 . "'$fieldname' to "
248 . $db_to_greenstone_fields{$fieldname} . "\n";
249 }
250 $fieldname=$db_to_greenstone_fields{$fieldname};
251 }
252 }
253
254 # get rows
255
256 my $count = 0;
257 my @row_array;
258
259 @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref?
260
261 while (scalar(@row_array)) {
262 if (defined($dbplug_debug) && $dbplug_debug==1) {
263 print $outhandle "DBPlug (debug): retrieved a row from query\n";
264 }
265
266 # create a new document
267 my $doc_obj = new doc ($filename, "indexed_doc");
268 $doc_obj->set_OIDtype ($processor->{'OIDtype'});
269 my $cursection = $doc_obj->get_top_section();
270
271 # if $language not set in config file, will use BasPlug's default
272 if (defined($language)) {
273 $doc_obj->add_utf8_metadata($cursection, "Language", $language);
274 }
275 # if $encoding not set in config file, will use BasPlug's default
276 if (defined($encoding)) {
277 # allow some common aliases
278 if ($encoding =~ m/^utf[-_]8$/i) {$encoding="utf8"}
279 $encoding =~ s/-/_/g; # greenstone uses eg iso_8859_1
280 $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);
281 }
282 $doc_obj->add_utf8_metadata($cursection,
283 "Source", &ghtml::dmsafe($db));
284 if ($self->{'cover_image'}) {
285 $self->associate_cover_image($doc_obj, $filename);
286 }
287 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
288
289 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "DB");
290
291 # include any metadata passed in from previous plugins
292 # note that this metadata is associated with the top level section
293 $self->extra_metadata ($doc_obj, $cursection,
294 $metadata);
295
296 # do any automatic metadata extraction
297 $self->auto_extract_metadata ($doc_obj);
298
299 my $unique_id=undef;
300
301 foreach my $fieldname (@field_names) {
302 my $fielddata=shift @row_array;
303
304 if (! defined($fielddata) ) {
305 next; # this field was "" or NULL
306 }
307 # use the specified encoding, defaulting to utf-8
308 if (defined($encoding) && $encoding ne "ascii"
309 && $encoding ne "utf8") {
310 $fielddata=&unicode::unicode2utf8(
311 &unicode::convert2unicode($encoding, \$fielddata)
312 );
313 }
314 # see if we have a ****_callback() function defined
315 if (exists $callbacks{$fieldname}) {
316 my $funcptr = $callbacks{$fieldname};
317 $fielddata = &$funcptr($fielddata);
318 }
319
320 if ($fieldname eq "text") {
321 # add as document text
322 $fielddata=~s@<@&lt;@g;
323 $fielddata=~s@>@&gt;@g; # for xml protection...
324 $fielddata=~s@_@\\_@g; # for macro language protection...
325 $doc_obj->add_utf8_text($cursection, $fielddata);
326 } elsif ($fieldname eq "Identifier") {
327 # use as greenstone's unique record id
328 if ($fielddata =~ /^\d+$/) {
329 # don't allow IDs that are completely numeric
330 $unique_id="id" . $fielddata;
331 } else {
332 $unique_id=$fielddata;
333 }
334 } else {
335 # add as document metadata
336 $fielddata=~s/\[/&#91;/g;
337 $fielddata=~s/\]/&#93;/g;
338 $doc_obj->add_utf8_metadata($cursection,
339 $fieldname, $fielddata);
340
341 }
342 }
343
344 if (!defined $unique_id) {
345 $doc_obj->set_OID($hash . "s$count");
346 } else {
347 # use our id from the database...
348 $doc_obj->set_OID($unique_id);
349 }
350
351
352 # process the document
353 $processor->process($doc_obj);
354
355
356 $count++;
357
358 # get next row
359 @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref?
360 } # end of row_array is not empty
361
362 # check "$sth->err" if empty array for error
363 if ($statement_hand->err) {
364 print $outhandle "DBPlug: received error: \"" .
365 $statement_hand->errstr . "\"\n";
366 }
367
368 # clean up connection to database
369 $statement_hand->finish();
370 $dbhandle->disconnect();
371
372 # num of input files, rather than documents created?
373 $self->{'num_processed'}++;
374
375 if (defined($dbplug_debug) && $dbplug_debug==1) {
376 print $outhandle "DBPlug: imported $count DB records as documents.\n";
377 }
378 $count;
379}
380
3811;
Note: See TracBrowser for help on using the repository browser.