source: main/trunk/greenstone2/perllib/plugins/DatabasePlugin.pm@ 28782

Last change on this file since 28782 was 24548, checked in by ak19, 13 years ago

Part 2 of previous commit (r24547). Added new abstract plugin MetadataRead? that defines can_process_this_file_for_metadata that MetadataPlugin? subclasses can inherit (if MetadataRead? is listed first in the ISA inheritance list) and which will then override the one defined in BasePlugin?. For now committing MARC, ISIS and OAIPlugins which now additionally inherit from MetadataRead?. Other metadataPlugins also need to be committed.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.6 KB
Line 
1###########################################################################
2#
3# DatabasePlugin.pm -- plugin to import records from a database
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2003 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# A plugin that imports records from a database. This uses perl's DBI module,
28# which includes back-ends for mysql, postgresql, comma separated values (CSV),
29# MS Excel, ODBC, sybase, etc... Extra modules may need to be installed to
30# use this. See <GSDLHOME>/etc/packages/example.dbi for an example config file.
31#
32
33# Written by John McPherson for the NZDL project
34# Mar, Apr 2003
35
36package DatabasePlugin;
37
38use strict;
39no strict 'refs'; # allow variable as a filehandle
40
41use AutoExtractMetadata;
42use MetadataRead;
43use unicode;
44
45sub BEGIN {
46 @DatabasePlugin::ISA = ('MetadataRead', 'AutoExtractMetadata');
47}
48
49my $arguments =
50 [ { 'name' => "process_exp",
51 'desc' => "{BasePlugin.process_exp}",
52 'type' => "regexp",
53 'deft' => &get_default_process_exp(),
54 'reqd' => "no" }];
55
56my $options = { 'name' => "DatabasePlugin",
57 'desc' => "{DatabasePlugin.desc}",
58 'abstract' => "no",
59 'inherits' => "yes",
60 'args' => $arguments };
61
62sub new {
63 my ($class) = shift (@_);
64 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
65 push(@$pluginlist, $class);
66
67 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
68 push(@{$hashArgOptLists->{"OptList"}},$options);
69
70 my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists);
71
72 return bless $self, $class;
73}
74
75sub get_default_process_exp {
76 my $self = shift (@_);
77
78 return q^(?i)\.dbi$^;
79}
80
81sub read {
82 my $self = shift (@_);
83 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs,$total_count,$gli) = @_;
84
85 #see if we can handle the passed file...
86 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
87 return undef unless $self->can_process_this_file($filename_full_path);
88
89 my $outhandle = $self->{'outhandle'};
90 my $verbosity = $self->{'verbosity'};
91
92 print STDERR "<Processing n='$file' p='DatabasePlugin'>\n" if ($gli);
93 print $outhandle "DatabasePlugin: processing $file\n"
94 if $self->{'verbosity'} > 1;
95
96 require DBI; # database independent stuff
97
98 # calculate the document hash, for document ids
99 my $hash="0";
100
101
102 # default options - may be overridden by config file
103 my $language=undef;
104 my $encoding=undef;
105 my $dbplug_debug=0;
106 my $username='';
107 my $password='';
108
109 # these settings must be set by the config file:
110 my $db=undef;
111
112 # get id of pages from "nonempty", get latest version number from
113 # "recent", and then get pagename from "page" and content from "version" !
114
115 my $sql_query_prime = undef ;
116 my $sql_query = undef ;
117
118 my %db_to_greenstone_fields=();
119 my %callbacks=();
120
121
122 # read in config file.
123 if (!open (CONF, $filename_full_path)) {
124 print $outhandle "DatabasePlugin: can't read $filename_full_path: $!\n";
125 return 0;
126 }
127
128 my $line;
129 my $statement="";
130 my $callback="";
131 while (defined($line=<CONF>)) {
132 chomp $line;
133 $line .= " "; # for multi-line statements - don't conjoin!
134 $line =~ s/\s*\#.*$//mg; # remove comments
135 $statement .= $line;
136
137 if ($line =~ /^\}\s*$/ && $callback) { # ends the callback
138 $callback .= $statement ; $statement = "";
139 # try to check that the function is "safe"
140 if ($callback =~ /\b(?:system|open|pipe|readpipe|qx|kill|eval|do|use|require|exec|fork)\b/ ||
141 $callback =~ /[\`]|\|\-/) {
142 # no backticks or functions that start new processes allowed
143 print $outhandle "DatabasePlugin: bad function in callback\n";
144 return 0;
145 }
146 $callback =~ s/sub (\w+?)_callback/sub/;
147 my $fieldname = $1;
148 my $ret = eval "\$callbacks{'$fieldname'} = $callback ; 1";
149 if (!defined($ret)) {
150 print $outhandle "DatabasePlugin: error eval'ing callback: $@\n";
151 exit(1);
152 }
153 $callback="";
154 print $outhandle "DatabasePlugin: callback registered for '$fieldname'\n"
155 if $dbplug_debug;
156 } elsif ($callback) {
157 # add this line to the callback function
158 $callback .= $statement;
159 $statement = "";
160 } elsif ($statement =~ m/;\s*$/) { # ends with ";"
161 # check that it is safe
162 # assignment
163 if ($statement =~ m~(\$\w+)\s* = \s*
164 (\d # digits
165 | ".*?(?<!\\)" # " up to the next " not preceded by a \
166 | '.*?(?<!\\)' # ' up to the next ' not preceded by a \
167 )\s*;~x || # /x means ignore comments and whitespace in rx
168 $statement =~ m~(\%\w+)\s*=\s*(\([\w\s\"\',:=>]+\))\s*;~ ) {
169 # evaluate the assignment, return 1 on success "
170 if (!eval "$1=$2; 1") {
171 my $err=$@;
172 chomp $err;
173 $err =~ s/\.$//; # remove a trailing .
174 print $outhandle "DatabasePlugin: error evaluating `$statement'\n";
175 print $outhandle " $err (in $filename_full_path)\n";
176 return 0; # there was an error reading the config file
177 }
178 } elsif ($statement =~ /sub \w+_callback/) {
179 # this is the start of a callback function definition
180 $callback = $statement;
181 $statement = "";
182 } else {
183 print $outhandle "DatabasePlugin: skipping statement `$statement'\n";
184 }
185 $statement = "";
186 }
187 }
188 close CONF;
189
190
191 if (!defined($db)) {
192 print $outhandle "DatabasePlugin: error: $filename_full_path does not specify a db!\n";
193 return 0;
194 }
195 if (!defined($sql_query)) {
196 print $outhandle "DatabasePlugin: error: no SQL query specified!\n";
197 return 0;
198 }
199 # connect to database
200 my $dbhandle=DBI->connect($db, $username, $password);
201
202 if (!defined($dbhandle)) {
203 die "DatabasePlugin: could not connect to database, exiting.\n";
204 }
205 if (defined($dbplug_debug) && $dbplug_debug==1) {
206 print $outhandle "DatabasePlugin (debug): connected ok\n";
207 }
208
209 my $statement_hand;
210
211 # The user gave 2 sql statements to execute?
212 if ($sql_query_prime) {
213 $statement_hand=$dbhandle->prepare($sql_query_prime);
214 $statement_hand->execute;
215 if ($statement_hand->err) {
216 print $outhandle "Error: " . $statement_hand->errstr . "\n";
217 return undef;
218 }
219 }
220
221
222 $statement_hand=$dbhandle->prepare($sql_query);
223 $statement_hand->execute;
224 if ($statement_hand->err) {
225 print $outhandle "Error:" . $statement_hand->errstr . "\n";
226 return undef;
227 }
228
229 # get the array-ref for the field names and cast it to array
230 my @field_names;
231 @field_names=@{ $statement_hand->{NAME} };
232
233 foreach my $fieldname (@field_names) {
234 if (defined($db_to_greenstone_fields{$fieldname})) {
235 if (defined($dbplug_debug) && $dbplug_debug==1) {
236 print $outhandle "DatabasePlugin (debug): mapping db field "
237 . "'$fieldname' to "
238 . $db_to_greenstone_fields{$fieldname} . "\n";
239 }
240 $fieldname=$db_to_greenstone_fields{$fieldname};
241 }
242 }
243
244 # get rows
245
246 my $count = 0;
247 my @row_array;
248
249 @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref?
250
251 my $base_oid = undef;
252 while (scalar(@row_array)) {
253 if (defined($dbplug_debug) && $dbplug_debug==1) {
254 print $outhandle "DatabasePlugin (debug): retrieved a row from query\n";
255 }
256
257 # create a new document
258 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
259
260 my $cursection = $doc_obj->get_top_section();
261
262 # if $language not set in config file, will use BasePlugin's default
263 if (defined($language)) {
264 $doc_obj->add_utf8_metadata($cursection, "Language", $language);
265 }
266 # if $encoding not set in config file, will use BasePlugin's default
267 if (defined($encoding)) {
268 # allow some common aliases
269 if ($encoding =~ m/^utf[-_]8$/i) {$encoding="utf8"}
270 $encoding =~ s/-/_/g; # greenstone uses eg iso_8859_1
271 $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);
272 }
273
274 my $plugin_filename_encoding = $self->{'filename_encoding'};
275 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
276 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
277
278 if ($self->{'cover_image'}) {
279 $self->associate_cover_image($doc_obj, $filename_full_path);
280 }
281 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
282
283 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "DB");
284
285 # include any metadata passed in from previous plugins
286 # note that this metadata is associated with the top level section
287 $self->extra_metadata ($doc_obj, $cursection,
288 $metadata);
289
290 # do any automatic metadata extraction
291 $self->auto_extract_metadata ($doc_obj);
292
293 my $unique_id=undef;
294
295 foreach my $fieldname (@field_names) {
296 my $fielddata=shift @row_array;
297
298 if (! defined($fielddata) ) {
299 next; # this field was "" or NULL
300 }
301 # use the specified encoding, defaulting to utf-8
302 if (defined($encoding) && $encoding ne "ascii"
303 && $encoding ne "utf8") {
304 $fielddata=&unicode::unicode2utf8(
305 &unicode::convert2unicode($encoding, \$fielddata)
306 );
307 }
308 # see if we have a ****_callback() function defined
309 if (exists $callbacks{$fieldname}) {
310 my $funcptr = $callbacks{$fieldname};
311 $fielddata = &$funcptr($fielddata);
312 }
313
314 if ($fieldname eq "text") {
315 # add as document text
316 $fielddata=~s@<@&lt;@g;
317 $fielddata=~s@>@&gt;@g; # for xml protection...
318 $fielddata=~s@_@\\_@g; # for macro language protection...
319 $doc_obj->add_utf8_text($cursection, $fielddata);
320 } elsif ($fieldname eq "Identifier") {
321 # use as greenstone's unique record id
322 if ($fielddata =~ /^\d+$/) {
323 # don't allow IDs that are completely numeric
324 $unique_id="id" . $fielddata;
325 } else {
326 $unique_id=$fielddata;
327 }
328 } else {
329 # add as document metadata
330 $fielddata=~s/\[/&#91;/g;
331 $fielddata=~s/\]/&#93;/g;
332 $doc_obj->add_utf8_metadata($cursection,
333 $fieldname, $fielddata);
334
335 }
336 }
337
338
339 if (!defined $unique_id) {
340 if (!defined $base_oid) {
341 $self->add_OID($doc_obj);
342 $base_oid = $doc_obj->get_OID();
343 }
344 $doc_obj->set_OID($base_oid."s$count");
345 } else {
346 # use our id from the database...
347 $doc_obj->set_OID($unique_id);
348 }
349
350
351 # process the document
352 $processor->process($doc_obj);
353
354 $count++;
355
356 # get next row
357 @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref?
358 } # end of row_array is not empty
359
360 # check "$sth->err" if empty array for error
361 if ($statement_hand->err) {
362 print $outhandle "DatabasePlugin: received error: \"" .
363 $statement_hand->errstr . "\"\n";
364 }
365
366 # clean up connection to database
367 $statement_hand->finish();
368 $dbhandle->disconnect();
369
370 # num of input files, rather than documents created?
371 $self->{'num_processed'}++;
372
373 if (defined($dbplug_debug) && $dbplug_debug==1) {
374 print $outhandle "DatabasePlugin: imported $count DB records as documents.\n";
375 }
376 $count;
377}
378
3791;
Note: See TracBrowser for help on using the repository browser.