source: gsdl/trunk/perllib/plugins/DBPlugin.pm@ 15880

Last change on this file since 15880 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
Line 
1###########################################################################
2#
3# DBPlugin.pm -- plugin to import records from a database
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2003 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# A plugin that imports records from a database. This uses perl's DBI module,
28# which includes back-ends for mysql, postgresql, comma separated values (CSV),
29# MS Excel, ODBC, sybase, etc... Extra modules may need to be installed to
30# use this. See <GSDLHOME>/etc/packages/example.dbi for an example config file.
31#
32
33# Written by John McPherson for the NZDL project
34# Mar, Apr 2003
35
36package DBPlugin;
37
38use strict;
39no strict 'refs'; # allow variable as a filehandle
40
41use AutoExtractMetadata;
42use unicode;
43
# Establish the inheritance chain at compile time: DBPlugin is a subclass
# of AutoExtractMetadata.
BEGIN {
    @DBPlugin::ISA = qw(AutoExtractMetadata);
}
47
# Descriptions of the command-line arguments this plugin adds.  The only
# one is process_exp, whose default comes from get_default_process_exp().
my $arguments = [
    {
        name => 'process_exp',
        desc => '{AutoExtractMetadata.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
];

# Plugin option record; new() pushes it onto the caller's "OptList".
my $options = {
    name     => 'DBPlugin',
    desc     => '{DBPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
60
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the parent constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option
#                      record to its "OptList" entry
#
# Returns the object built by AutoExtractMetadata's constructor, re-blessed
# into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;

    push @{ $hashArgOptLists->{'ArgList'} }, @{$arguments};
    push @{ $hashArgOptLists->{'OptList'} }, $options;

    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless($self, $class);
}
73
# Returns, as a string, the default regular expression used for the
# process_exp argument: it matches names ending in ".dbi", ignoring case.
# The (optional) invocant is accepted but not used.
sub get_default_process_exp {
    my ($self) = @_;

    my $dbi_suffix_exp = '(?i)\.dbi$';
    return $dbi_suffix_exp;
}
79
# Read a database configuration file, run the SQL query it describes and
# hand every returned row to $processor as a separate document.
#
# Arguments (after $self):
#   $pluginfo, $base_dir, $file, $metadata, $processor,
#   $maxdocs, $total_count, $gli
# All of them are forwarded unchanged to read_block(); this sub itself uses
# $file (progress message), $metadata (passed to extra_metadata) and
# $processor (OID settings, and process() for every document).
#
# The configuration file is parsed line by line and may contain:
#   * scalar assignments   $name = <digits> | "string" | 'string' ;
#   * hash assignments     %name = ( ... ) ;
#   * callbacks            sub <field>_callback { ... }
# Assignments are evaluated in this sub's lexical scope, so they can set
# any lexical variable visible at that point ($db, $sql_query, $username,
# %db_to_greenstone_fields, ...).
#
# Returns:
#   * $block_status from read_block() when that is undefined or 0
#   * 0 when the configuration file cannot be read or is rejected
#   * undef when preparing or executing an SQL statement fails
#   * otherwise the number of rows turned into documents
# Dies when the database connection cannot be made; exits the process when
# a callback definition fails to compile.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # see if we can handle the passed file...
    my ($block_status, $filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status == 0));

    my $outhandle = $self->{'outhandle'};

    # $verbosity and $hash are not read anywhere below.  They are kept
    # because the string evals further down see every lexical in scope:
    # under "use strict" a config file assigning to either name would stop
    # evaluating if the declaration disappeared.
    my $verbosity = $self->{'verbosity'};

    print $outhandle "DBPlugin: processing $file\n"
        if $self->{'verbosity'} > 1;

    require DBI; # database independent stuff

    my $hash = "0";

    # default options - may be overridden by config file
    my $language = undef;
    my $encoding = undef;
    my $dbplug_debug = 0;
    my $username = '';
    my $password = '';

    # these settings must be set by the config file:
    my $db = undef;

    # optional statement executed before the main query
    my $sql_query_prime = undef;
    # the query whose result rows become documents
    my $sql_query = undef;

    # database column name => greenstone metadata name
    my %db_to_greenstone_fields = ();
    # field name => code ref applied to that field's data
    my %callbacks = ();

    # read in config file.
    # A lexical handle is closed automatically on every return path below;
    # the three-argument open treats $filename purely as a file name.
    my $conf_fh;
    if (!open ($conf_fh, '<', $filename)) {
        print $outhandle "DBPlugin: can't read $filename: $!\n";
        return 0;
    }

    # SECURITY NOTE(review): the statements and callbacks found in the
    # config file are run through string eval.  The keyword blacklist below
    # is only a best-effort filter, so config files must come from a
    # trusted source.
    my $line;
    my $statement = "";   # text accumulated since the last complete statement
    my $callback = "";    # non-empty while inside a callback definition
    while (defined($line = <$conf_fh>)) {
        chomp $line;
        $line .= " "; # for multi-line statements - don't conjoin!
        $line =~ s/\s*\#.*$//mg; # remove comments
        $statement .= $line;

        if ($line =~ /^\}\s*$/ && $callback) { # ends the callback
            $callback .= $statement;
            $statement = "";
            # try to check that the function is "safe"
            if ($callback =~ /\b(?:system|open|pipe|readpipe|qx|kill|eval|do|use|require|exec|fork)\b/ ||
                $callback =~ /[\`]|\|\-/) {
                # no backticks or functions that start new processes allowed
                print $outhandle "DBPlugin: bad function in callback\n";
                return 0;
            }
            # turn "sub <field>_callback {...}" into an anonymous sub and
            # remember which field it belongs to
            $callback =~ s/sub (\w+?)_callback/sub/;
            my $fieldname = $1;
            my $ret = eval "\$callbacks{'$fieldname'} = $callback ; 1";
            if (!defined($ret)) {
                print $outhandle "DBPlugin: error eval'ing callback: $@\n";
                exit(1);
            }
            $callback = "";
            print $outhandle "DBPlugin: callback registered for '$fieldname'\n"
                if $dbplug_debug;
        } elsif ($callback) {
            # add this line to the callback function
            $callback .= $statement;
            $statement = "";
        } elsif ($statement =~ m/;\s*$/) { # ends with ";"
            # check that it is safe
            # assignment
            if ($statement =~ m~(\$\w+)\s* = \s*
                (\d+              # one or more digits
                 | ".*?(?<!\\)"   # double-quoted string, up to the first unescaped quote
                 | '.*?(?<!\\)'   # single-quoted string, up to the first unescaped quote
                )\s*;~x || # /x means ignore comments and whitespace in rx
                $statement =~ m~(\%\w+)\s*=\s*(\([\w\s\"\',:=>]+\))\s*;~ ) {
                # evaluate the assignment, return 1 on success
                if (!eval "$1=$2; 1") {
                    my $err = $@;
                    chomp $err;
                    $err =~ s/\.$//; # remove a trailing .
                    print $outhandle "DBPlugin: error evaluating `$statement'\n";
                    print $outhandle " $err (in $filename)\n";
                    return 0; # there was an error reading the config file
                }
            } elsif ($statement =~ /sub \w+_callback/) {
                # this is the start of a callback function definition
                $callback = $statement;
                $statement = "";
            } else {
                print $outhandle "DBPlugin: skipping statement `$statement'\n";
            }
            $statement = "";
        }
    }
    close($conf_fh);

    if (!defined($db)) {
        print $outhandle "DBPlugin: error: $filename does not specify a db!\n";
        return 0;
    }
    if (!defined($sql_query)) {
        print $outhandle "DBPlugin: error: no SQL query specified!\n";
        return 0;
    }

    # Normalise the encoding name once; it is the same for every row.
    if (defined($encoding)) {
        # allow some common aliases
        if ($encoding =~ m/^utf[-_]8$/i) { $encoding = "utf8"; }
        $encoding =~ s/-/_/g; # greenstone uses eg iso_8859_1
    }

    # connect to database
    my $dbhandle = DBI->connect($db, $username, $password);

    if (!defined($dbhandle)) {
        die "DBPlugin: could not connect to database, exiting.\n";
    }
    if (defined($dbplug_debug) && $dbplug_debug == 1) {
        print $outhandle "DBPlugin (debug): connected ok\n";
    }

    my $statement_hand;

    # The user gave 2 sql statements to execute?
    if ($sql_query_prime) {
        $statement_hand = $dbhandle->prepare($sql_query_prime);
        if (!defined($statement_hand)) {
            # prepare() itself failed, so there is no statement to execute
            my $err = $dbhandle->errstr;
            $err = "could not prepare statement" unless defined($err);
            print $outhandle "Error: " . $err . "\n";
            $dbhandle->disconnect();
            return undef;
        }
        $statement_hand->execute;
        if ($statement_hand->err) {
            print $outhandle "Error: " . $statement_hand->errstr . "\n";
            $statement_hand->finish();
            $dbhandle->disconnect();
            return undef;
        }
    }

    $statement_hand = $dbhandle->prepare($sql_query);
    if (!defined($statement_hand)) {
        my $err = $dbhandle->errstr;
        $err = "could not prepare statement" unless defined($err);
        print $outhandle "Error:" . $err . "\n";
        $dbhandle->disconnect();
        return undef;
    }
    $statement_hand->execute;
    if ($statement_hand->err) {
        print $outhandle "Error:" . $statement_hand->errstr . "\n";
        $statement_hand->finish();
        $dbhandle->disconnect();
        return undef;
    }

    # get the array-ref for the field names and copy it into an array
    my @field_names = @{ $statement_hand->{NAME} };

    # Rename columns that have a greenstone mapping.  The loop variable is
    # an alias, so assigning to it rewrites the entry in @field_names.
    foreach my $fieldname (@field_names) {
        if (defined($db_to_greenstone_fields{$fieldname})) {
            if (defined($dbplug_debug) && $dbplug_debug == 1) {
                print $outhandle "DBPlugin (debug): mapping db field "
                    . "'$fieldname' to "
                    . $db_to_greenstone_fields{$fieldname} . "\n";
            }
            $fieldname = $db_to_greenstone_fields{$fieldname};
        }
    }

    # get rows; fetchrow_array returns an empty list when there are no more
    # rows (or on error, which is checked after the loop)
    my $count = 0;

    while (my @row_array = $statement_hand->fetchrow_array) {
        if (defined($dbplug_debug) && $dbplug_debug == 1) {
            print $outhandle "DBPlugin (debug): retrieved a row from query\n";
        }

        # create a new document
        my $doc_obj = doc->new($filename, "indexed_doc");
        $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
        my $cursection = $doc_obj->get_top_section();

        # if $language not set in config file, will use BasePlugin's default
        if (defined($language)) {
            $doc_obj->add_utf8_metadata($cursection, "Language", $language);
        }
        # if $encoding not set in config file, will use BasePlugin's default
        if (defined($encoding)) {
            $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);
        }
        $self->set_Source_metadata($doc_obj, $db, $encoding);

        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");

        $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "DB");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $cursection, $metadata);

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        my $unique_id = undef;

        # @field_names and the row are in the same column order, so the
        # values are consumed from the front of the row one field at a time
        foreach my $fieldname (@field_names) {
            my $fielddata = shift @row_array;

            if (! defined($fielddata) ) {
                next; # this field was NULL
            }
            # use the specified encoding, defaulting to utf-8
            if (defined($encoding) && $encoding ne "ascii"
                && $encoding ne "utf8") {
                $fielddata = &unicode::unicode2utf8(
                    &unicode::convert2unicode($encoding, \$fielddata)
                );
            }
            # see if we have a ****_callback() function defined
            if (exists $callbacks{$fieldname}) {
                my $funcptr = $callbacks{$fieldname};
                $fielddata = $funcptr->($fielddata);
            }

            if ($fieldname eq "text") {
                # add as document text
                $fielddata =~ s@<@&lt;@g;
                $fielddata =~ s@>@&gt;@g; # for xml protection...
                $fielddata =~ s@_@\\_@g; # for macro language protection...
                $doc_obj->add_utf8_text($cursection, $fielddata);
            } elsif ($fieldname eq "Identifier") {
                # use as greenstone's unique record id
                if ($fielddata =~ /^\d+$/) {
                    # don't allow IDs that are completely numeric
                    $unique_id = "id" . $fielddata;
                } else {
                    $unique_id = $fielddata;
                }
            } else {
                # add as document metadata
                $fielddata =~ s/\[/&#91;/g;
                $fielddata =~ s/\]/&#93;/g;
                $doc_obj->add_utf8_metadata($cursection, $fieldname, $fielddata);
            }
        }

        if (!defined $unique_id) {
            # let the document pick its own id, then append the row number
            $doc_obj->set_OID();
            my $id = $doc_obj->get_OID();
            $doc_obj->set_OID($id . "s$count");
        } else {
            # use our id from the database...
            $doc_obj->set_OID($unique_id);
        }

        # process the document
        $processor->process($doc_obj);

        $count++;
    } # end of row_array is not empty

    # an empty row can also mean the fetch failed - check for that
    if ($statement_hand->err) {
        print $outhandle "DBPlugin: received error: \"" .
            $statement_hand->errstr . "\"\n";
    }

    # clean up connection to database
    $statement_hand->finish();
    $dbhandle->disconnect();

    # num of input files, rather than documents created?
    $self->{'num_processed'}++;

    if (defined($dbplug_debug) && $dbplug_debug == 1) {
        print $outhandle "DBPlugin: imported $count DB records as documents.\n";
    }
    return $count;
}
370
3711;
Note: See TracBrowser for help on using the repository browser.