source: gsdl/trunk/perllib/plugins/DatabasePlugin.pm@ 17744

Last change on this file since 17744 was 17744, checked in by kjdon, 15 years ago

changed the string keys to reflect the name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 11.3 KB
RevLine 
[4429]1###########################################################################
2#
[17720]3# DatabasePlugin.pm -- plugin to import records from a database
[4429]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2003 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
[12741]27# A plugin that imports records from a database. This uses perl's DBI module,
28# which includes back-ends for mysql, postgresql, comma separated values (CSV),
29# MS Excel, ODBC, sybase, etc... Extra modules may need to be installed to
30# use this. See <GSDLHOME>/etc/packages/example.dbi for an example config file.
[4429]31#
32
33# Written by John McPherson for the NZDL project
34# Mar, Apr 2003
35
[17720]36package DatabasePlugin;
[4429]37
[7683]38use strict;
39no strict 'refs'; # allow variable as a filehandle
40
[15872]41use AutoExtractMetadata;
[4429]42use unicode;
43
44sub BEGIN {
[17720]45 @DatabasePlugin::ISA = ('AutoExtractMetadata');
[4429]46}
47
[4744]48my $arguments =
49 [ { 'name' => "process_exp",
[16019]50 'desc' => "{BasePlugin.process_exp}",
[6408]51 'type' => "regexp",
[4744]52 'deft' => &get_default_process_exp(),
53 'reqd' => "no" }];
[4429]54
[17720]55my $options = { 'name' => "DatabasePlugin",
[17744]56 'desc' => "{DatabasePlugin.desc}",
[6408]57 'abstract' => "no",
[4744]58 'inherits' => "yes",
59 'args' => $arguments };
[4429]60
61sub new {
[10218]62 my ($class) = shift (@_);
63 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
64 push(@$pluginlist, $class);
[4429]65
[15872]66 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
67 push(@{$hashArgOptLists->{"OptList"}},$options);
[4429]68
[15872]69 my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists);
[4429]70
71 return bless $self, $class;
72}
73
74sub get_default_process_exp {
75 my $self = shift (@_);
76
77 return q^(?i)\.dbi$^;
78}
79
80sub read {
81 my $self = shift (@_);
[16392]82 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs,$total_count,$gli) = @_;
[13315]83
84 #see if we can handle the passed file...
[16392]85 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
86 return undef unless $self->can_process_this_file($filename_full_path);
87
[4429]88 my $outhandle = $self->{'outhandle'};
89 my $verbosity = $self->{'verbosity'};
90
[17720]91 print STDERR "<Processing n='$file' p='DatabasePlugin'>\n" if ($gli);
92 print $outhandle "DatabasePlugin: processing $file\n"
[4429]93 if $self->{'verbosity'} > 1;
[13315]94
[12687]95 require DBI; # database independent stuff
96
[4429]97 # calculate the document hash, for document ids
[13315]98 my $hash="0";
[4429]99
[13315]100
[4429]101 # default options - may be overridden by config file
102 my $language=undef;
103 my $encoding=undef;
104 my $dbplug_debug=0;
105 my $username='';
106 my $password='';
107
108 # these settings must be set by the config file:
109 my $db=undef;
110
[15872]111 # get id of pages from "nonempty", get latest version number from
112 # "recent", and then get pagename from "page" and content from "version" !
[4429]113
[10923]114 my $sql_query_prime = undef ;
[4429]115 my $sql_query = undef ;
116
117 my %db_to_greenstone_fields=();
[8119]118 my %callbacks=();
[4429]119
[13315]120
[4429]121 # read in config file.
[16392]122 if (!open (CONF, $filename_full_path)) {
[17720]123 print $outhandle "DatabasePlugin: can't read $filename_full_path: $!\n";
[7681]124 return 0;
[13315]125 }
126
[7683]127 my $line;
128 my $statement="";
129 my $callback="";
130 while (defined($line=<CONF>)) {
131 chomp $line;
[10923]132 $line .= " "; # for multi-line statements - don't conjoin!
[8119]133 $line =~ s/\s*\#.*$//mg; # remove comments
[7683]134 $statement .= $line;
[8119]135
136 if ($line =~ /^\}\s*$/ && $callback) { # ends the callback
137 $callback .= $statement ; $statement = "";
138 # try to check that the function is "safe"
139 if ($callback =~ /\b(?:system|open|pipe|readpipe|qx|kill|eval|do|use|require|exec|fork)\b/ ||
140 $callback =~ /[\`]|\|\-/) {
141 # no backticks or functions that start new processes allowed
[17720]142 print $outhandle "DatabasePlugin: bad function in callback\n";
[8119]143 return 0;
144 }
145 $callback =~ s/sub (\w+?)_callback/sub/;
[10956]146 my $fieldname = $1;
147 my $ret = eval "\$callbacks{'$fieldname'} = $callback ; 1";
148 if (!defined($ret)) {
[17720]149 print $outhandle "DatabasePlugin: error eval'ing callback: $@\n";
[10956]150 exit(1);
151 }
[8119]152 $callback="";
[17720]153 print $outhandle "DatabasePlugin: callback registered for '$fieldname'\n"
[10956]154 if $dbplug_debug;
[8119]155 } elsif ($callback) {
156 # add this line to the callback function
157 $callback .= $statement;
158 $statement = "";
159 } elsif ($statement =~ m/;\s*$/) { # ends with ";"
[7683]160 # check that it is safe
161 # assignment
[8119]162 if ($statement =~ m~(\$\w+)\s* = \s*
[7683]163 (\d # digits
164 | ".*?(?<!\\)" # " up to the next " not preceded by a \
165 | '.*?(?<!\\)' # ' up to the next ' not preceded by a \
[8119]166 )\s*;~x || # /x means ignore comments and whitespace in rx
167 $statement =~ m~(\%\w+)\s*=\s*(\([\w\s\"\',:=>]+\))\s*;~ ) {
168 # evaluate the assignment, return 1 on success "
169 if (!eval "$1=$2; 1") {
[7683]170 my $err=$@;
171 chomp $err;
172 $err =~ s/\.$//; # remove a trailing .
[17720]173 print $outhandle "DatabasePlugin: error evaluating `$statement'\n";
[16392]174 print $outhandle " $err (in $filename_full_path)\n";
[7683]175 return 0; # there was an error reading the config file
176 }
[8119]177 } elsif ($statement =~ /sub \w+_callback/) {
178 # this is the start of a callback function definition
[7683]179 $callback = $statement;
[8119]180 $statement = "";
181 } else {
[17720]182 print $outhandle "DatabasePlugin: skipping statement `$statement'\n";
[7683]183 }
184 $statement = "";
185 }
186 }
[7681]187 close CONF;
[4429]188
[13315]189
[4429]190 if (!defined($db)) {
[17720]191 print $outhandle "DatabasePlugin: error: $filename_full_path does not specify a db!\n";
[4429]192 return 0;
193 }
194 if (!defined($sql_query)) {
[17720]195 print $outhandle "DatabasePlugin: error: no SQL query specified!\n";
[4429]196 return 0;
197 }
198 # connect to database
199 my $dbhandle=DBI->connect($db, $username, $password);
200
201 if (!defined($dbhandle)) {
[17720]202 die "DatabasePlugin: could not connect to database, exiting.\n";
[4429]203 }
204 if (defined($dbplug_debug) && $dbplug_debug==1) {
[17720]205 print $outhandle "DatabasePlugin (debug): connected ok\n";
[4429]206 }
207
[10923]208 my $statement_hand;
209
210 # The user gave 2 sql statements to execute?
211 if ($sql_query_prime) {
212 $statement_hand=$dbhandle->prepare($sql_query_prime);
213 $statement_hand->execute;
214 if ($statement_hand->err) {
215 print $outhandle "Error: " . $statement_hand->errstr . "\n";
216 return undef;
217 }
218 }
219
[13315]220
[10923]221 $statement_hand=$dbhandle->prepare($sql_query);
[4429]222 $statement_hand->execute;
[10923]223 if ($statement_hand->err) {
[13315]224 print $outhandle "Error:" . $statement_hand->errstr . "\n";
[10923]225 return undef;
226 }
[4429]227
228 # get the array-ref for the field names and cast it to array
229 my @field_names;
230 @field_names=@{ $statement_hand->{NAME} };
231
232 foreach my $fieldname (@field_names) {
233 if (defined($db_to_greenstone_fields{$fieldname})) {
234 if (defined($dbplug_debug) && $dbplug_debug==1) {
[17720]235 print $outhandle "DatabasePlugin (debug): mapping db field "
[4429]236 . "'$fieldname' to "
237 . $db_to_greenstone_fields{$fieldname} . "\n";
238 }
239 $fieldname=$db_to_greenstone_fields{$fieldname};
240 }
241 }
242
243 # get rows
244
245 my $count = 0;
246 my @row_array;
247
248 @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref?
[17026]249
250 my $base_oid = undef;
[4429]251 while (scalar(@row_array)) {
252 if (defined($dbplug_debug) && $dbplug_debug==1) {
[17720]253 print $outhandle "DatabasePlugin (debug): retrieved a row from query\n";
[4429]254 }
255
256 # create a new document
[16392]257 my $doc_obj = new doc ($filename_full_path, "indexed_doc");
[17026]258
[4429]259 my $cursection = $doc_obj->get_top_section();
260
[15872]261 # if $language not set in config file, will use BasePlugin's default
[4429]262 if (defined($language)) {
263 $doc_obj->add_utf8_metadata($cursection, "Language", $language);
264 }
[15872]265 # if $encoding not set in config file, will use BasePlugin's default
[4429]266 if (defined($encoding)) {
[7701]267 # allow some common aliases
268 if ($encoding =~ m/^utf[-_]8$/i) {$encoding="utf8"}
269 $encoding =~ s/-/_/g; # greenstone uses eg iso_8859_1
[4429]270 $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);
271 }
[15872]272 $self->set_Source_metadata($doc_obj, $db, $encoding);
273
[4429]274 if ($self->{'cover_image'}) {
[16392]275 $self->associate_cover_image($doc_obj, $filename_full_path);
[4429]276 }
[7508]277 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
[4429]278
[8121]279 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "DB");
[5919]280
[4429]281 # include any metadata passed in from previous plugins
282 # note that this metadata is associated with the top level section
283 $self->extra_metadata ($doc_obj, $cursection,
284 $metadata);
285
286 # do any automatic metadata extraction
287 $self->auto_extract_metadata ($doc_obj);
288
289 my $unique_id=undef;
290
291 foreach my $fieldname (@field_names) {
292 my $fielddata=shift @row_array;
[10923]293
294 if (! defined($fielddata) ) {
295 next; # this field was "" or NULL
296 }
[4429]297 # use the specified encoding, defaulting to utf-8
298 if (defined($encoding) && $encoding ne "ascii"
299 && $encoding ne "utf8") {
300 $fielddata=&unicode::unicode2utf8(
301 &unicode::convert2unicode($encoding, \$fielddata)
302 );
303 }
[8119]304 # see if we have a ****_callback() function defined
305 if (exists $callbacks{$fieldname}) {
306 my $funcptr = $callbacks{$fieldname};
307 $fielddata = &$funcptr($fielddata);
308 }
309
[4429]310 if ($fieldname eq "text") {
311 # add as document text
312 $fielddata=~s@<@&lt;@g;
313 $fielddata=~s@>@&gt;@g; # for xml protection...
314 $fielddata=~s@_@\\_@g; # for macro language protection...
315 $doc_obj->add_utf8_text($cursection, $fielddata);
316 } elsif ($fieldname eq "Identifier") {
317 # use as greenstone's unique record id
318 if ($fielddata =~ /^\d+$/) {
319 # don't allow IDs that are completely numeric
320 $unique_id="id" . $fielddata;
321 } else {
322 $unique_id=$fielddata;
323 }
324 } else {
325 # add as document metadata
326 $fielddata=~s/\[/&#91;/g;
327 $fielddata=~s/\]/&#93;/g;
328 $doc_obj->add_utf8_metadata($cursection,
329 $fieldname, $fielddata);
330
331 }
332 }
333
[13315]334
[4429]335 if (!defined $unique_id) {
[17026]336 if (!defined $base_oid) {
337 $self->add_OID($doc_obj);
338 $base_oid = $doc_obj->get_OID();
339 }
340 $doc_obj->set_OID($base_oid."s$count");
[4429]341 } else {
342 # use our id from the database...
343 $doc_obj->set_OID($unique_id);
344 }
345
346
347 # process the document
348 $processor->process($doc_obj);
349
350 $count++;
351
352 # get next row
353 @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref?
354 } # end of row_array is not empty
355
356 # check "$sth->err" if empty array for error
357 if ($statement_hand->err) {
[17720]358 print $outhandle "DatabasePlugin: received error: \"" .
[7685]359 $statement_hand->errstr . "\"\n";
[4429]360 }
361
362 # clean up connection to database
363 $statement_hand->finish();
364 $dbhandle->disconnect();
365
366 # num of input files, rather than documents created?
367 $self->{'num_processed'}++;
368
[8119]369 if (defined($dbplug_debug) && $dbplug_debug==1) {
[17720]370 print $outhandle "DatabasePlugin: imported $count DB records as documents.\n";
[8119]371 }
372 $count;
[4429]373}
374
3751;
Note: See TracBrowser for help on using the repository browser.