source: gsdl/trunk/perllib/plugins/DBPlugin.pm@ 16104

Last change on this file since 16104 was 16104, checked in by kjdon, 16 years ago

tried to make the 'xxxplugin processing file' print statements more consistent. They are now done in read (or read_into_doc_obj) and not process

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
Line 
1###########################################################################
2#
3# DBPlugin.pm -- plugin to import records from a database
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2003 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# A plugin that imports records from a database. This uses perl's DBI module,
28# which includes back-ends for mysql, postgresql, comma separated values (CSV),
29# MS Excel, ODBC, sybase, etc... Extra modules may need to be installed to
30# use this. See <GSDLHOME>/etc/packages/example.dbi for an example config file.
31#
32
33# Written by John McPherson for the NZDL project
34# Mar, Apr 2003
35
36package DBPlugin;
37
38use strict;
39no strict 'refs'; # allow variable as a filehandle
40
41use AutoExtractMetadata;
42use unicode;
43
# Establish the inheritance chain at compile time: DBPlugin is-a
# AutoExtractMetadata.
BEGIN {
    @DBPlugin::ISA = qw(AutoExtractMetadata);
}
47
# Argument descriptions specific to this plugin.  The only one is
# process_exp, whose default comes from get_default_process_exp().
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
];

# Option record describing this plugin; 'args' points at the list above.
my $options = {
    'name'     => 'DBPlugin',
    'desc'     => '{DBPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
60
# new -- constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the object built by AutoExtractMetadata's constructor, re-blessed
# into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->method form: the indirect-object form
    # ("new AutoExtractMetadata(...)") depends on parser heuristics.
    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
73
# get_default_process_exp -- default value for the process_exp argument.
#
# Returns a regular expression (as a string) matching file names that end
# in ".dbi", case-insensitively.  Any arguments, including an invocant,
# are ignored, so it can be called as a plain function or as a method.
sub get_default_process_exp {
    return '(?i)\.dbi$';
}
79
# read -- import the rows selected by a .dbi config file, one Greenstone
# document per row.
#
# Arguments (after $self): $pluginfo, $base_dir, $file, $metadata,
# $processor, $maxdocs, $total_count, $gli.  Only $file, $metadata,
# $processor and $gli are used directly; the whole list is also passed to
# read_block(), which decides whether this plugin handles the file and
# supplies the full file name.
#
# The config file is parsed line by line.  Two kinds of statement are
# accepted:
#   * simple assignments ("$name = <digits|quoted string>;" or
#     "%name = ( ... );"), which are eval'ed in this sub's scope and so set
#     the lexicals declared below ($db, $sql_query, $sql_query_prime,
#     $username, $password, $language, $encoding, $dbplug_debug,
#     %db_to_greenstone_fields);
#   * "sub <field>_callback { ... }" definitions, which are stored in
#     %callbacks and applied to the data of the field called <field>.
# The names of those lexicals must therefore not be changed.
#
# Returns:
#   $block_status  when read_block() reports undef or 0
#   0              when the config file cannot be read, fails to evaluate,
#                  contains a rejected callback, or lacks $db / $sql_query
#   undef          when preparing or executing an SQL statement fails
#   the number of rows turned into documents otherwise
# Dies if the database connection cannot be made, and calls exit(1) if a
# callback definition fails to compile.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # see if we can handle the passed file...
    my ($block_status, $filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status == 0));

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    print STDERR "<Processing n='$file' p='DBPlugin'>\n" if ($gli);
    print $outhandle "DBPlugin: processing $file\n"
        if $verbosity > 1;

    require DBI; # database independent stuff

    # Currently unused.  Kept because config assignments are eval'ed in this
    # scope and may name any lexical declared here.
    my $hash = "0";

    # default options - may be overridden by config file
    my $language = undef;
    my $encoding = undef;
    my $dbplug_debug = 0;
    my $username = '';
    my $password = '';

    # these settings must be set by the config file:
    my $db = undef;

    # get id of pages from "nonempty", get latest version number from
    # "recent", and then get pagename from "page" and content from "version" !

    my $sql_query_prime = undef;
    my $sql_query = undef;

    my %db_to_greenstone_fields = ();
    my %callbacks = ();

    # read in config file.
    # Lexical handle + 3-arg open: the name is never interpreted as a mode,
    # and the handle is closed automatically on every early return below.
    my $conf_fh;
    if (!open ($conf_fh, '<', $filename)) {
        print $outhandle "DBPlugin: can't read $filename: $!\n";
        return 0;
    }

    my $line;
    my $statement = "";
    my $callback = "";
    while (defined($line = <$conf_fh>)) {
        chomp $line;
        $line =~ s/\s*\#.*$//mg; # remove comments
        # Pad AFTER stripping comments, so that a line ending in a comment
        # still gets its separator and is not joined onto the next line.
        $line .= " "; # for multi-line statements - don't conjoin!
        $statement .= $line;

        if ($line =~ /^\}\s*$/ && $callback) { # ends the callback
            $callback .= $statement;
            $statement = "";
            # try to check that the function is "safe"
            # NOTE(review): this is a best-effort blacklist only; the
            # callback text is still run through string eval below, so
            # config files must come from a trusted source.
            if ($callback =~ /\b(?:system|open|pipe|readpipe|qx|kill|eval|do|use|require|exec|fork)\b/ ||
                $callback =~ /[\`]|\|\-/) {
                # no backticks or functions that start new processes allowed
                print $outhandle "DBPlugin: bad function in callback\n";
                return 0;
            }
            # Turn "sub <field>_callback {...}" into an anonymous sub and
            # remember which field it belongs to.
            $callback =~ s/sub (\w+?)_callback/sub/;
            my $fieldname = $1;
            my $ret = eval "\$callbacks{'$fieldname'} = $callback ; 1";
            if (!defined($ret)) {
                print $outhandle "DBPlugin: error eval'ing callback: $@\n";
                exit(1);
            }
            $callback = "";
            print $outhandle "DBPlugin: callback registered for '$fieldname'\n"
                if $dbplug_debug;
        } elsif ($callback) {
            # add this line to the callback function
            $callback .= $statement;
            $statement = "";
        } elsif ($statement =~ m/;\s*$/) { # ends with ";"
            # check that it is safe
            # assignment
            if ($statement =~ m~(\$\w+)\s* = \s*
                (\d+ # digits
                | ".*?(?<!\\)" # " up to the next " not preceded by a \
                | '.*?(?<!\\)' # ' up to the next ' not preceded by a \
                )\s*;~x || # /x means ignore comments and whitespace in rx
                $statement =~ m~(\%\w+)\s*=\s*(\([\w\s\"\',:=>]+\))\s*;~ ) {
                # evaluate the assignment, return 1 on success
                # ($1 names one of the lexicals above, $2 is its new value)
                if (!eval "$1=$2; 1") {
                    my $err = $@;
                    chomp $err;
                    $err =~ s/\.$//; # remove a trailing .
                    print $outhandle "DBPlugin: error evaluating `$statement'\n";
                    print $outhandle " $err (in $filename)\n";
                    return 0; # there was an error reading the config file
                }
            } elsif ($statement =~ /sub \w+_callback/) {
                # this is the start of a callback function definition
                $callback = $statement;
                $statement = "";
            } else {
                print $outhandle "DBPlugin: skipping statement `$statement'\n";
            }
            $statement = "";
        }
    }
    close $conf_fh;

    if (!defined($db)) {
        print $outhandle "DBPlugin: error: $filename does not specify a db!\n";
        return 0;
    }
    if (!defined($sql_query)) {
        print $outhandle "DBPlugin: error: no SQL query specified!\n";
        return 0;
    }

    # Settings are final from here on, so work these out once.
    my $debug_on = (defined($dbplug_debug) && $dbplug_debug == 1);
    if (defined($encoding)) {
        # allow some common aliases
        if ($encoding =~ m/^utf[-_]8$/i) { $encoding = "utf8" }
        $encoding =~ s/-/_/g; # greenstone uses eg iso_8859_1
    }

    # connect to database
    my $dbhandle = DBI->connect($db, $username, $password);

    if (!defined($dbhandle)) {
        die "DBPlugin: could not connect to database, exiting.\n";
    }
    if ($debug_on) {
        print $outhandle "DBPlugin (debug): connected ok\n";
    }

    my $statement_hand;

    # The user gave 2 sql statements to execute?
    if ($sql_query_prime) {
        $statement_hand = $dbhandle->prepare($sql_query_prime);
        if (!defined($statement_hand)) {
            # prepare itself failed; there is no statement handle to ask
            print $outhandle "Error: " . $dbhandle->errstr . "\n";
            $dbhandle->disconnect();
            return undef;
        }
        $statement_hand->execute;
        if ($statement_hand->err) {
            print $outhandle "Error: " . $statement_hand->errstr . "\n";
            $dbhandle->disconnect();
            return undef;
        }
    }

    $statement_hand = $dbhandle->prepare($sql_query);
    if (!defined($statement_hand)) {
        print $outhandle "Error:" . $dbhandle->errstr . "\n";
        $dbhandle->disconnect();
        return undef;
    }
    $statement_hand->execute;
    if ($statement_hand->err) {
        print $outhandle "Error:" . $statement_hand->errstr . "\n";
        $dbhandle->disconnect();
        return undef;
    }

    # get the array-ref for the field names and cast it to array
    my @field_names = @{ $statement_hand->{NAME} };

    # Rename database columns to Greenstone field names where a mapping was
    # configured.  The loop variable aliases the array element, so the
    # assignment rewrites @field_names in place.
    foreach my $fieldname (@field_names) {
        if (defined($db_to_greenstone_fields{$fieldname})) {
            if ($debug_on) {
                print $outhandle "DBPlugin (debug): mapping db field "
                    . "'$fieldname' to "
                    . $db_to_greenstone_fields{$fieldname} . "\n";
            }
            $fieldname = $db_to_greenstone_fields{$fieldname};
        }
    }

    # get rows

    my $count = 0;

    # fetchrow_array gives an empty list once the rows run out (or on
    # error), which ends the loop.
    while (my @row_array = $statement_hand->fetchrow_array) {
        if ($debug_on) {
            print $outhandle "DBPlugin (debug): retrieved a row from query\n";
        }

        # create a new document
        my $doc_obj = doc->new($filename, "indexed_doc");
        $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
        my $cursection = $doc_obj->get_top_section();

        # if $language not set in config file, will use BasePlugin's default
        if (defined($language)) {
            $doc_obj->add_utf8_metadata($cursection, "Language", $language);
        }
        # if $encoding not set in config file, will use BasePlugin's default
        if (defined($encoding)) {
            $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);
        }
        $self->set_Source_metadata($doc_obj, $db, $encoding);

        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");

        $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "DB");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $cursection, $metadata);

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        my $unique_id = undef;

        # Column values arrive in the same order as @field_names.
        foreach my $fieldname (@field_names) {
            my $fielddata = shift @row_array;

            if (! defined($fielddata) ) {
                next; # this field was "" or NULL
            }
            # use the specified encoding, defaulting to utf-8
            if (defined($encoding) && $encoding ne "ascii"
                && $encoding ne "utf8") {
                $fielddata = unicode::unicode2utf8(
                    unicode::convert2unicode($encoding, \$fielddata)
                );
            }
            # see if we have a ****_callback() function defined
            if (exists $callbacks{$fieldname}) {
                my $funcptr = $callbacks{$fieldname};
                $fielddata = $funcptr->($fielddata);
            }

            if ($fieldname eq "text") {
                # add as document text
                $fielddata =~ s@<@&lt;@g;
                $fielddata =~ s@>@&gt;@g; # for xml protection...
                $fielddata =~ s@_@\\_@g;  # for macro language protection...
                $doc_obj->add_utf8_text($cursection, $fielddata);
            } elsif ($fieldname eq "Identifier") {
                # use as greenstone's unique record id
                if ($fielddata =~ /^\d+$/) {
                    # don't allow IDs that are completely numeric
                    $unique_id = "id" . $fielddata;
                } else {
                    $unique_id = $fielddata;
                }
            } else {
                # add as document metadata
                $fielddata =~ s/\[/&#91;/g;
                $fielddata =~ s/\]/&#93;/g;
                $doc_obj->add_utf8_metadata($cursection,
                                            $fieldname, $fielddata);
            }
        }

        if (!defined $unique_id) {
            # No "Identifier" column: take the generated OID and append
            # "s<row index>" to it.
            $doc_obj->set_OID();
            my $id = $doc_obj->get_OID();
            $doc_obj->set_OID($id . "s$count");
        } else {
            # use our id from the database...
            $doc_obj->set_OID($unique_id);
        }

        # process the document
        $processor->process($doc_obj);

        $count++;
    } # end of row_array is not empty

    # check "$sth->err" if empty array for error
    if ($statement_hand->err) {
        print $outhandle "DBPlugin: received error: \"" .
            $statement_hand->errstr . "\"\n";
    }

    # clean up connection to database
    $statement_hand->finish();
    $dbhandle->disconnect();

    # num of input files, rather than documents created?
    $self->{'num_processed'}++;

    if ($debug_on) {
        print $outhandle "DBPlugin: imported $count DB records as documents.\n";
    }
    return $count;
}
371
3721;
Note: See TracBrowser for help on using the repository browser.