source: trunk/gsdl/perllib/plugins/DBPlug.pm@ 13315

Last change on this file since 13315 was 13315, checked in by shaoqun, 17 years ago

uses the standard method to set doc ID and new param list for the read method

  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1###########################################################################
2#
3# DBPlug.pm -- plugin to import records from a database
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2003 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# A plugin that imports records from a database. This uses perl's DBI module,
28# which includes back-ends for mysql, postgresql, comma separated values (CSV),
29# MS Excel, ODBC, sybase, etc... Extra modules may need to be installed to
30# use this. See <GSDLHOME>/etc/packages/example.dbi for an example config file.
31#
32
33# Written by John McPherson for the NZDL project
34# Mar, Apr 2003
35
36package DBPlug;
37
38use strict;
39no strict 'refs'; # allow variable as a filehandle
40
41use BasPlug;
42use unicode;
43
44#use DBI; # database independent stuff
45
# Establish inheritance at compile time: DBPlug is a subclass of BasPlug.
BEGIN {
    @DBPlug::ISA = qw(BasPlug);
}
49
# Argument descriptions contributed by this plugin. Only "process_exp" is
# declared here; its default value comes from get_default_process_exp().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
];

# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => "DBPlug",
    'desc'     => "{DBPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
62
# Constructor.
#
# Parameters:
#   $class           - class name the object is blessed into
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the BasPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by BasPlug's constructor, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Use an explicit class-method call: the indirect-object form
    # "new BasPlug(...)" is parse-ambiguous in a package that defines
    # its own new().
    my $self = BasPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
75
# Returns the default regular expression (as a string) selecting the files
# this plugin handles: names ending in ".dbi", matched case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;    # not used; accepted so the sub also works as a method

    my $dbi_suffix_exp = q{(?i)\.dbi$};
    return $dbi_suffix_exp;
}
# There is no per-document process() step in this plugin: read() creates the
# documents itself, so this sub intentionally does nothing.
sub process {
    return;
}
85
# Read a ".dbi" config file, run the SQL query it specifies, and hand every
# returned row to $processor as a separate document.
#
# Parameters (only $file, $metadata and $processor are used directly here;
# the complete list is forwarded to read_block()):
#   $pluginfo, $base_dir, $file, $metadata, $processor,
#   $maxdocs, $total_count, $gli
#
# Returns:
#   - whatever read_block() reported, when that is undef or 0
#   - 0 if the config file cannot be read, contains a disallowed callback,
#     contains an assignment that fails to evaluate, or lacks $db/$sql_query
#   - undef if preparing or executing an SQL statement fails
#   - otherwise the number of rows imported as documents
# Dies if the database connection cannot be established, and exits the
# program if a callback definition fails to compile.
#
# NOTE(review): the config file is executed with string eval inside this sub.
# Only simple scalar/hash assignments and filtered callback subs are accepted,
# but a config file must still be treated as trusted input.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # see if we can handle the passed file...
    my ($block_status, $filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status==0));

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    print $outhandle "DBPlug: processing $file\n"
        if $verbosity > 1;

    require DBI; # database independent stuff

    # The lexicals declared below are assigned BY NAME from the config file
    # (via string eval further down), so their names are part of the config
    # file format and must not be changed.

    # Not used by this sub; kept declared so that a config file assigning to
    # it keeps evaluating cleanly under "use strict".
    my $hash="0";

    # default options - may be overridden by config file
    my $language=undef;
    my $encoding=undef;
    my $dbplug_debug=0;
    my $username='';
    my $password='';

    # these settings must be set by the config file:
    my $db=undef;

    # optional priming statement, executed before the main query
    my $sql_query_prime = undef;
    my $sql_query = undef;

    my %db_to_greenstone_fields=();
    my %callbacks=();

    # read in config file.
    # Three-argument open on a lexical handle: the filename cannot be
    # misread as an open mode, and the handle is closed automatically on
    # every early return below.
    my $conf_fh;
    if (!open ($conf_fh, '<', $filename)) {
        print $outhandle "DBPlug: can't read $filename: $!\n";
        return 0;
    }

    my $line;
    my $statement="";
    my $callback="";
    while (defined($line=<$conf_fh>)) {
        chomp $line;
        $line .= " "; # for multi-line statements - don't conjoin!
        $line =~ s/\s*\#.*$//mg; # remove comments
        $statement .= $line;

        if ($line =~ /^\}\s*$/ && $callback) { # ends the callback
            $callback .= $statement ; $statement = "";
            # try to check that the function is "safe"
            if ($callback =~ /\b(?:system|open|pipe|readpipe|qx|kill|eval|do|use|require|exec|fork)\b/ ||
                $callback =~ /[\`]|\|\-/) {
                # no backticks or functions that start new processes allowed
                print $outhandle "DBPlug: bad function in callback\n";
                return 0;
            }
            # turn "sub <field>_callback {...}" into an anonymous sub and
            # remember which field it belongs to
            $callback =~ s/sub (\w+?)_callback/sub/;
            my $fieldname = $1;
            my $ret = eval "\$callbacks{'$fieldname'} = $callback ; 1";
            if (!defined($ret)) {
                print $outhandle "DBPlug: error eval'ing callback: $@\n";
                exit(1);
            }
            $callback="";
            print $outhandle "DBPlug: callback registered for '$fieldname'\n"
                if $dbplug_debug;
        } elsif ($callback) {
            # add this line to the callback function
            $callback .= $statement;
            $statement = "";
        } elsif ($statement =~ m/;\s*$/) { # ends with ";"
            # check that it is safe
            # assignment: either a scalar set to a number or quoted string,
            # or a hash set to a simple parenthesised list
            if ($statement =~ m~(\$\w+)\s* = \s*
                (\d+                # one or more digits
                 | ".*?(?<!\\)"     # double-quoted string, up to an unescaped "
                 | '.*?(?<!\\)'     # single-quoted string, up to an unescaped '
                 )\s*;~x || # /x means ignore comments and whitespace in rx
                $statement =~ m~(\%\w+)\s*=\s*(\([\w\s\"\',:=>]+\))\s*;~ ) {
                # evaluate the assignment, return 1 on success
                if (!eval "$1=$2; 1") {
                    my $err=$@;
                    chomp $err;
                    $err =~ s/\.$//; # remove a trailing .
                    print $outhandle "DBPlug: error evaluating `$statement'\n";
                    print $outhandle " $err (in $filename)\n";
                    return 0; # there was an error reading the config file
                }
            } elsif ($statement =~ /sub \w+_callback/) {
                # this is the start of a callback function definition
                $callback = $statement;
                $statement = "";
            } else {
                print $outhandle "DBPlug: skipping statement `$statement'\n";
            }
            $statement = "";
        }
    }
    close $conf_fh;

    if (!defined($db)) {
        print $outhandle "DBPlug: error: $filename does not specify a db!\n";
        return 0;
    }
    if (!defined($sql_query)) {
        print $outhandle "DBPlug: error: no SQL query specified!\n";
        return 0;
    }

    # connect to database
    my $dbhandle=DBI->connect($db, $username, $password);

    if (!defined($dbhandle)) {
        die "DBPlug: could not connect to database, exiting.\n";
    }
    if (defined($dbplug_debug) && $dbplug_debug==1) {
        print $outhandle "DBPlug (debug): connected ok\n";
    }

    my $statement_hand;

    # The user gave 2 sql statements to execute?
    if ($sql_query_prime) {
        $statement_hand=$dbhandle->prepare($sql_query_prime);
        if (!defined($statement_hand)) {
            # prepare itself failed: there is no statement handle to execute
            print $outhandle "Error: " . $dbhandle->errstr . "\n";
            $dbhandle->disconnect();
            return undef;
        }
        $statement_hand->execute;
        if ($statement_hand->err) {
            print $outhandle "Error: " . $statement_hand->errstr . "\n";
            $dbhandle->disconnect();
            return undef;
        }
    }

    $statement_hand=$dbhandle->prepare($sql_query);
    if (!defined($statement_hand)) {
        print $outhandle "Error:" . $dbhandle->errstr . "\n";
        $dbhandle->disconnect();
        return undef;
    }
    $statement_hand->execute;
    if ($statement_hand->err) {
        print $outhandle "Error:" . $statement_hand->errstr . "\n";
        $dbhandle->disconnect();
        return undef;
    }

    # get the array-ref for the field names and cast it to array
    my @field_names=@{ $statement_hand->{NAME} };

    # Rename db columns to their configured greenstone field names.
    # $fieldname aliases the array element, so this edits @field_names
    # in place.
    foreach my $fieldname (@field_names) {
        if (defined($db_to_greenstone_fields{$fieldname})) {
            if (defined($dbplug_debug) && $dbplug_debug==1) {
                print $outhandle "DBPlug (debug): mapping db field "
                    . "'$fieldname' to "
                    . $db_to_greenstone_fields{$fieldname} . "\n";
            }
            $fieldname=$db_to_greenstone_fields{$fieldname};
        }
    }

    # if $encoding not set in config file, will use BasPlug's default
    if (defined($encoding)) {
        # allow some common aliases (same for every row, so done once here)
        if ($encoding =~ m/^utf[-_]8$/i) {$encoding="utf8"}
        $encoding =~ s/-/_/g; # greenstone uses eg iso_8859_1
    }

    # get rows

    my $count = 0;

    # fetchrow_array returns an empty list when there are no more rows
    # (or on error, which is checked after the loop)
    while (my @row_array=$statement_hand->fetchrow_array) {
        if (defined($dbplug_debug) && $dbplug_debug==1) {
            print $outhandle "DBPlug (debug): retrieved a row from query\n";
        }

        # create a new document
        my $doc_obj = doc->new($filename, "indexed_doc");
        $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});
        my $cursection = $doc_obj->get_top_section();

        # if $language not set in config file, will use BasPlug's default
        if (defined($language)) {
            $doc_obj->add_utf8_metadata($cursection, "Language", $language);
        }
        if (defined($encoding)) {
            $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);
        }
        $doc_obj->add_utf8_metadata($cursection,
                                    "Source", ghtml::dmsafe($db));
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");

        $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "DB");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $cursection, $metadata);

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        my $unique_id=undef;

        # columns arrive in the same order as @field_names
        foreach my $fieldname (@field_names) {
            my $fielddata=shift @row_array;

            if (! defined($fielddata) ) {
                next; # this field was "" or NULL
            }
            # use the specified encoding, defaulting to utf-8
            if (defined($encoding) && $encoding ne "ascii"
                && $encoding ne "utf8") {
                $fielddata=unicode::unicode2utf8(
                    unicode::convert2unicode($encoding, \$fielddata)
                );
            }
            # see if we have a ****_callback() function defined
            if (exists $callbacks{$fieldname}) {
                my $funcptr = $callbacks{$fieldname};
                $fielddata = $funcptr->($fielddata);
            }

            if ($fieldname eq "text") {
                # add as document text
                $fielddata=~s@<@&lt;@g;
                $fielddata=~s@>@&gt;@g; # for xml protection...
                $fielddata=~s@_@\\_@g; # for macro language protection...
                $doc_obj->add_utf8_text($cursection, $fielddata);
            } elsif ($fieldname eq "Identifier") {
                # use as greenstone's unique record id
                if ($fielddata =~ /^\d+$/) {
                    # don't allow IDs that are completely numeric
                    $unique_id="id" . $fielddata;
                } else {
                    $unique_id=$fielddata;
                }
            } else {
                # add as document metadata
                $fielddata=~s/\[/&#91;/g;
                $fielddata=~s/\]/&#93;/g;
                $doc_obj->add_utf8_metadata($cursection,
                                            $fieldname, $fielddata);
            }
        }

        if (!defined $unique_id) {
            # no Identifier column: take the generated OID and append the
            # row index so each row gets a distinct id
            $doc_obj->set_OID();
            my $id = $doc_obj->get_OID();
            $doc_obj->set_OID($id."s$count");
        } else {
            # use our id from the database...
            $doc_obj->set_OID($unique_id);
        }

        # process the document
        $processor->process($doc_obj);

        $count++;
    } # end of row_array is not empty

    # check "$sth->err" if empty array for error
    if ($statement_hand->err) {
        print $outhandle "DBPlug: received error: \"" .
            $statement_hand->errstr . "\"\n";
    }

    # clean up connection to database
    $statement_hand->finish();
    $dbhandle->disconnect();

    # num of input files, rather than documents created?
    $self->{'num_processed'}++;

    if (defined($dbplug_debug) && $dbplug_debug==1) {
        print $outhandle "DBPlug: imported $count DB records as documents.\n";
    }
    return $count;
}
376
3771;
Note: See TracBrowser for help on using the repository browser.