source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/DatabasePlugin.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

File size: 11.6 KB
Line 
1###########################################################################
2#
3# DatabasePlugin.pm -- plugin to import records from a database
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2003 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# A plugin that imports records from a database. This uses perl's DBI module,
28# which includes back-ends for mysql, postgresql, comma separated values (CSV),
29# MS Excel, ODBC, sybase, etc... Extra modules may need to be installed to
30# use this. See <GSDLHOME>/etc/packages/example.dbi for an example config file.
31#
32
33# Written by John McPherson for the NZDL project
34# Mar, Apr 2003
35
36package DatabasePlugin;
37
38use strict;
39no strict 'refs'; # allow variable as a filehandle
40
41use AutoExtractMetadata;
42use MetadataRead;
43use unicode;
44
45sub BEGIN {
46 @DatabasePlugin::ISA = ('MetadataRead', 'AutoExtractMetadata');
47}
48
49my $arguments =
50 [ { 'name' => "process_exp",
51 'desc' => "{BasePlugin.process_exp}",
52 'type' => "regexp",
53 'deft' => &get_default_process_exp(),
54 'reqd' => "no" }];
55
56my $options = { 'name' => "DatabasePlugin",
57 'desc' => "{DatabasePlugin.desc}",
58 'abstract' => "no",
59 'inherits' => "yes",
60 'args' => $arguments };
61
62sub new {
63 my ($class) = shift (@_);
64 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
65 push(@$pluginlist, $class);
66
67 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
68 push(@{$hashArgOptLists->{"OptList"}},$options);
69
70 my $self = new AutoExtractMetadata($pluginlist, $inputargs, $hashArgOptLists);
71
72 return bless $self, $class;
73}
74
75sub get_default_process_exp {
76 my $self = shift (@_);
77
78 return q^(?i)\.dbi$^;
79}
80
81sub read {
82 my $self = shift (@_);
83 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs,$total_count,$gli) = @_;
84
85 #see if we can handle the passed file...
86 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
87 return undef unless $self->can_process_this_file($filename_full_path);
88
89 my $outhandle = $self->{'outhandle'};
90 my $verbosity = $self->{'verbosity'};
91
92 print STDERR "<Processing n='$file' p='DatabasePlugin'>\n" if ($gli);
93 print $outhandle "DatabasePlugin: processing $file\n"
94 if $self->{'verbosity'} > 1;
95
96 require DBI; # database independent stuff
97
98 # calculate the document hash, for document ids
99 my $hash="0";
100
101
102 # default options - may be overridden by config file
103 my $language=undef;
104 my $encoding=undef;
105 my $dbplug_debug=0;
106 my $username='';
107 my $password='';
108
109 # these settings must be set by the config file:
110 my $db=undef;
111
112 # get id of pages from "nonempty", get latest version number from
113 # "recent", and then get pagename from "page" and content from "version" !
114
115 my $sql_query_prime = undef ;
116 my $sql_query = undef ;
117
118 my %db_to_greenstone_fields=();
119 my %callbacks=();
120
121
122 # read in config file.
123 if (!open (CONF, $filename_full_path)) {
124 print $outhandle "DatabasePlugin: can't read $filename_full_path: $!\n";
125 return 0;
126 }
127
128 my $line;
129 my $statement="";
130 my $callback="";
131 while (defined($line=<CONF>)) {
132 chomp $line;
133 $line .= " "; # for multi-line statements - don't conjoin!
134 $line =~ s/\s*\#.*$//mg; # remove comments
135 $statement .= $line;
136
137 if ($line =~ /^\}\s*$/ && $callback) { # ends the callback
138 $callback .= $statement ; $statement = "";
139 # try to check that the function is "safe"
140 if ($callback =~ /\b(?:system|open|pipe|readpipe|qx|kill|eval|do|use|require|exec|fork)\b/ ||
141 $callback =~ /[\`]|\|\-/) {
142 # no backticks or functions that start new processes allowed
143 print $outhandle "DatabasePlugin: bad function in callback\n";
144 return 0;
145 }
146 $callback =~ s/sub (\w+?)_callback/sub/;
147 my $fieldname = $1;
148 my $ret = eval "\$callbacks{'$fieldname'} = $callback ; 1";
149 if (!defined($ret)) {
150 print $outhandle "DatabasePlugin: error eval'ing callback: $@\n";
151 exit(1);
152 }
153 $callback="";
154 print $outhandle "DatabasePlugin: callback registered for '$fieldname'\n"
155 if $dbplug_debug;
156 } elsif ($callback) {
157 # add this line to the callback function
158 $callback .= $statement;
159 $statement = "";
160 } elsif ($statement =~ m/;\s*$/) { # ends with ";"
161 # check that it is safe
162 # assignment
163 if ($statement =~ m~(\$\w+)\s* = \s*
164 (\d # digits
165 | ".*?(?<!\\)" # " up to the next " not preceded by a \
166 | '.*?(?<!\\)' # ' up to the next ' not preceded by a \
167 )\s*;~x || # /x means ignore comments and whitespace in rx
168 $statement =~ m~(\%\w+)\s*=\s*(\([\w\s\"\',:=>]+\))\s*;~ ) {
169 # evaluate the assignment, return 1 on success "
170 if (!eval "$1=$2; 1") {
171 my $err=$@;
172 chomp $err;
173 $err =~ s/\.$//; # remove a trailing .
174 print $outhandle "DatabasePlugin: error evaluating `$statement'\n";
175 print $outhandle " $err (in $filename_full_path)\n";
176 return 0; # there was an error reading the config file
177 }
178 } elsif ($statement =~ /sub \w+_callback/) {
179 # this is the start of a callback function definition
180 $callback = $statement;
181 $statement = "";
182 } else {
183 print $outhandle "DatabasePlugin: skipping statement `$statement'\n";
184 }
185 $statement = "";
186 }
187 }
188 close CONF;
189
190
191 if (!defined($db)) {
192 print $outhandle "DatabasePlugin: error: $filename_full_path does not specify a db!\n";
193 return 0;
194 }
195 if (!defined($sql_query)) {
196 print $outhandle "DatabasePlugin: error: no SQL query specified!\n";
197 return 0;
198 }
199 # connect to database
200 my $dbhandle=DBI->connect($db, $username, $password);
201
202 if (!defined($dbhandle)) {
203 die "DatabasePlugin: could not connect to database, exiting.\n";
204 }
205 if (defined($dbplug_debug) && $dbplug_debug==1) {
206 print $outhandle "DatabasePlugin (debug): connected ok\n";
207 }
208
209 my $statement_hand;
210
211 # The user gave 2 sql statements to execute?
212 if ($sql_query_prime) {
213 $statement_hand=$dbhandle->prepare($sql_query_prime);
214 $statement_hand->execute;
215 if ($statement_hand->err) {
216 print $outhandle "Error: " . $statement_hand->errstr . "\n";
217 return undef;
218 }
219 }
220
221
222 $statement_hand=$dbhandle->prepare($sql_query);
223 $statement_hand->execute;
224 if ($statement_hand->err) {
225 print $outhandle "Error:" . $statement_hand->errstr . "\n";
226 return undef;
227 }
228
229 # get the array-ref for the field names and cast it to array
230 my @field_names;
231 @field_names=@{ $statement_hand->{NAME} };
232
233 foreach my $fieldname (@field_names) {
234 if (defined($db_to_greenstone_fields{$fieldname})) {
235 if (defined($dbplug_debug) && $dbplug_debug==1) {
236 print $outhandle "DatabasePlugin (debug): mapping db field "
237 . "'$fieldname' to "
238 . $db_to_greenstone_fields{$fieldname} . "\n";
239 }
240 $fieldname=$db_to_greenstone_fields{$fieldname};
241 }
242 }
243
244 # get rows
245
246 my $count = 0;
247 my @row_array;
248
249 @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref?
250
251 my $base_oid = undef;
252 while (scalar(@row_array)) {
253 if (defined($dbplug_debug) && $dbplug_debug==1) {
254 print $outhandle "DatabasePlugin (debug): retrieved a row from query\n";
255 }
256
257 # create a new document
258 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'});
259
260 my $cursection = $doc_obj->get_top_section();
261
262 # if $language not set in config file, will use BasePlugin's default
263 if (defined($language)) {
264 $doc_obj->add_utf8_metadata($cursection, "Language", $language);
265 }
266 # if $encoding not set in config file, will use BasePlugin's default
267 if (defined($encoding)) {
268 # allow some common aliases
269 if ($encoding =~ m/^utf[-_]8$/i) {$encoding="utf8"}
270 $encoding =~ s/-/_/g; # greenstone uses eg iso_8859_1
271 $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);
272 }
273
274 my $plugin_filename_encoding = $self->{'filename_encoding'};
275 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
276 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding);
277
278 if ($self->{'cover_image'}) {
279 $self->associate_cover_image($doc_obj, $filename_full_path);
280 }
281 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
282
283 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "DB");
284
285 # include any metadata passed in from previous plugins
286 # note that this metadata is associated with the top level section
287 $self->extra_metadata ($doc_obj, $cursection,
288 $metadata);
289
290 # do any automatic metadata extraction
291 $self->auto_extract_metadata ($doc_obj);
292
293 my $unique_id=undef;
294
295 foreach my $fieldname (@field_names) {
296 my $fielddata=shift @row_array;
297
298 if (! defined($fielddata) ) {
299 next; # this field was "" or NULL
300 }
301 # use the specified encoding, defaulting to utf-8
302 if (defined($encoding) && $encoding ne "ascii"
303 && $encoding ne "utf8") {
304 $fielddata=&unicode::unicode2utf8(
305 &unicode::convert2unicode($encoding, \$fielddata)
306 );
307 }
308 # see if we have a ****_callback() function defined
309 if (exists $callbacks{$fieldname}) {
310 my $funcptr = $callbacks{$fieldname};
311 $fielddata = &$funcptr($fielddata);
312 }
313
314 if ($fieldname eq "text") {
315 # add as document text
316 $fielddata=~s@<@&lt;@g;
317 $fielddata=~s@>@&gt;@g; # for xml protection...
318 $fielddata=~s@_@\\_@g; # for macro language protection...
319 $doc_obj->add_utf8_text($cursection, $fielddata);
320 } elsif ($fieldname eq "Identifier") {
321 # use as greenstone's unique record id
322 if ($fielddata =~ /^\d+$/) {
323 # don't allow IDs that are completely numeric
324 $unique_id="id" . $fielddata;
325 } else {
326 $unique_id=$fielddata;
327 }
328 } else {
329 # add as document metadata
330 $fielddata=~s/\[/&#91;/g;
331 $fielddata=~s/\]/&#93;/g;
332 $doc_obj->add_utf8_metadata($cursection,
333 $fieldname, $fielddata);
334
335 }
336 }
337
338
339 if (!defined $unique_id) {
340 if (!defined $base_oid) {
341 $self->add_OID($doc_obj);
342 $base_oid = $doc_obj->get_OID();
343 }
344 $doc_obj->set_OID($base_oid."s$count");
345 } else {
346 # use our id from the database...
347 $doc_obj->set_OID($unique_id);
348 }
349
350
351 # process the document
352 $processor->process($doc_obj);
353
354 $count++;
355
356 # get next row
357 @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref?
358 } # end of row_array is not empty
359
360 # check "$sth->err" if empty array for error
361 if ($statement_hand->err) {
362 print $outhandle "DatabasePlugin: received error: \"" .
363 $statement_hand->errstr . "\"\n";
364 }
365
366 # clean up connection to database
367 $statement_hand->finish();
368 $dbhandle->disconnect();
369
370 # num of input files, rather than documents created?
371 $self->{'num_processed'}++;
372
373 if (defined($dbplug_debug) && $dbplug_debug==1) {
374 print $outhandle "DatabasePlugin: imported $count DB records as documents.\n";
375 }
376 $count;
377}
378
3791;
Note: See TracBrowser for help on using the repository browser.