source: trunk/gsdl/perllib/plugins/DBPlug.pm@ 9853

Last change on this file since 9853 was 9853, checked in by kjdon, 19 years ago

fixed up maxdocs - now pass an extra parameter to the read function

  • Property svn:keywords set to Author Date Id Revision
File size: 10.4 KB
Line 
1###########################################################################
2#
3# DBPlug.pm -- plugin to import records from a database
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2003 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27#
28# See <GSDLHOME>/etc/packages/example.dbi for an example config file!!
29#
30
31# Written by John McPherson for the NZDL project
32# Mar, Apr 2003
33
34package DBPlug;
35
36use strict;
37no strict 'refs'; # allow variable as a filehandle
38
39use BasPlug;
40use unicode;
41use parsargv;
42
43use DBI; # database independent stuff
44
# Establish inheritance at compile time: DBPlug is a kind of BasPlug.
BEGIN {
    @DBPlug::ISA = qw(BasPlug);
}
48
# Argument descriptions specific to this plugin.  The only one is
# process_exp, whose default comes from get_default_process_exp().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
];

# Option record for this plugin; new() appends it to the option list
# of each plugin object it builds.
my $options = {
    'name'     => "DBPlug",
    'desc'     => "{DBPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
61
# Constructor: builds the object through BasPlug's constructor, tags it
# as a DBPlug, appends this plugin's option record to its option list
# and blesses it into the requested class.
sub new {
    # NOTE(review): the class name is read from @_ without being shifted
    # off, so the BasPlug constructor is handed the class name twice
    # (once explicitly, once as the first element of @_).  Kept as-is;
    # confirm against BasPlug::new whether this is intended.
    my $class = $_[0];
    my $self  = BasPlug->new($class, @_);

    $self->{'plugin_type'} = "DBPlug";

    my $inherited_options = $self->{'option_list'};
    push @{$inherited_options}, $options;

    # no plugin-specific options
#    if (!parsargv::parse(\@_, "allow_extra_options")) {
#        $self->print_txt_usage("");  # Use default resource bundle
#        die "\n";
#    }

    bless $self, $class;
    return $self;
}
78
# Returns the default regular expression (as a string) selecting the
# files this plugin handles: names ending in ".dbi", case-insensitively.
# The optional object argument is accepted but not used.
sub get_default_process_exp {
    my ($self) = @_;

    my $default_exp = q{(?i)\.dbi$};
    return $default_exp;
}
# Deliberately a no-op: this plugin has no per-greenstone-document
# process() step; read() builds the documents itself and hands each one
# to the processor.
sub process {
    return;
}
88
89
# read() -- import the rows returned by a database query as documents.
#
# $file (joined onto $base_dir when $base_dir contains a word character)
# names a configuration file.  The file is parsed line by line; simple
# scalar/hash assignments and "sub <field>_callback { ... }" definitions
# found in it are run with string eval INSIDE THIS SUBROUTINE, so they
# assign directly to the lexical variables declared below ($db,
# $sql_query, $username, $password, $language, $encoding, $dbplug_debug,
# %db_to_greenstone_fields).  Those names are therefore part of the
# config-file format and must not be renamed.
#
# Arguments:
#   $pluginfo, $maxdocs, $total_count, $gli -- accepted but not used in
#                 this subroutine
#   $base_dir   -- directory prefix for $file
#   $file       -- configuration file name
#   $metadata   -- passed to $self->extra_metadata() for every document
#   $processor  -- each document is handed to $processor->process(), and
#                 $processor->{'OIDtype'} sets the document OID type
#
# Returns:
#   undef  if the file name does not match process_exp or is not a file
#   0      if the config file cannot be read or evaluated, lacks $db or
#          $sql_query, or the query cannot be prepared/executed
#   otherwise the number of rows turned into documents
# Dies if the database connection cannot be established.
#
# SECURITY NOTE: the config file is executed with string eval after only
# a pattern-based check; it must come from a trusted source.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # see if we can handle the passed file...
    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        # this plugin can't process this file type...
        return undef;
    }

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    print $outhandle "DBPlug: processing $file\n"
        if $verbosity > 1;

    # calculate the document hash, for document ids
    my $hash="";

    my $osexe = &util::get_os_exe();
    my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
                                           $ENV{'GSDLOS'},"hashfile$osexe");
    if (-e "$hashfile_exe") {
        # The program is run by bare name (found via the command search
        # path), with the file name passed through the shell in quotes.
        my $hash_output = `hashfile$osexe \"$filename\"`;
        if (defined($hash_output) && $hash_output =~ /:\s*([0-9a-f]+)/i) {
            $hash = "HASH$1";
        } else {
            # Only read $1 after a successful match, so a capture left
            # over from an earlier match can never leak into the id.
            print $outhandle "DBPlug: warning: could not compute a hash for $filename\n";
            $hash = "HASH";
        }
    }


    # default options - may be overridden by config file
    my $language=undef;
    my $encoding=undef;
    my $dbplug_debug=0;
    my $username='';
    my $password='';

    # these settings must be set by the config file:
    my $db=undef;

# get id of pages from "nonempty", get latest version number from "recent", and
# then get pagename from "page" and content from "version" !

    my $sql_query = undef ;

    my %db_to_greenstone_fields=();
    my %callbacks=();

    # read in config file.
    # Three-argument open with a lexical handle: the file name can never
    # be taken as an open mode, and the handle is private to this call.
    my $conf_fh;
    if (!open ($conf_fh, '<', $filename)) {
        print $outhandle "DBPlug: can't read $filename: $!\n";
        return 0;
    }
    my $line;
    my $statement="";
    my $callback="";
    while (defined($line=<$conf_fh>)) {
        chomp $line;
        # Everything from a '#' to the end of the line is dropped, even
        # when the '#' sits inside a quoted string.
        $line =~ s/\s*\#.*$//mg; # remove comments
        # Lines are joined with no separator between them.
        $statement .= $line;

        if ($line =~ /^\}\s*$/ && $callback) { # ends the callback
            $callback .= $statement ; $statement = "";
            # try to check that the function is "safe"
            if ($callback =~ /\b(?:system|open|pipe|readpipe|qx|kill|eval|do|use|require|exec|fork)\b/ ||
                $callback =~ /[\`]|\|\-/) {
                # no backticks or functions that start new processes allowed
                print $outhandle "DBPlug: bad function in callback\n";
                close $conf_fh;
                return 0;
            }
            # Turn "sub <field>_callback" into an anonymous sub and
            # remember which field it belongs to.
            my $fieldname;
            if ($callback =~ s/sub (\w+?)_callback/sub/) {
                $fieldname = $1;
            }
            if (!defined($fieldname)) {
                print $outhandle "DBPlug: could not find the field name of a callback\n";
            } elsif (!eval "\$callbacks{'$fieldname'} = $callback ; 1") {
                # Report a callback that does not compile instead of
                # dropping it silently; the import carries on without it.
                my $err=$@;
                chomp $err;
                print $outhandle "DBPlug: error evaluating callback for `$fieldname'\n";
                print $outhandle " $err (in $filename)\n";
            }
            $callback="";
        } elsif ($callback) {
            # add this line to the callback function
            $callback .= $statement;
            $statement = "";
        } elsif ($statement =~ m/;\s*$/) { # ends with ";"
            # check that it is safe
            # assignment
            # NOTE(review): this pattern check is not a parser; it is
            # not a guarantee that the evaluated text is harmless.
            if ($statement =~ m~(\$\w+)\s* = \s*
                (\d+ # digits
                | ".*?(?<!\\)" # " up to the next " not preceded by a \
                | '.*?(?<!\\)' # ' up to the next ' not preceded by a \
                )\s*;~x || # /x means ignore comments and whitespace in rx
                $statement =~ m~(\%\w+)\s*=\s*(\([\w\s\"\',:=>]+\))\s*;~ ) {
                # evaluate the assignment, return 1 on success "
                if (!eval "$1=$2; 1") {
                    my $err=$@;
                    chomp $err;
                    $err =~ s/\.$//; # remove a trailing .
                    print $outhandle "DBPlug: error evaluating `$statement'\n";
                    print $outhandle " $err (in $filename)\n";
                    close $conf_fh;
                    return 0; # there was an error reading the config file
                }
            } elsif ($statement =~ /sub \w+_callback/) {
                # this is the start of a callback function definition
                $callback = $statement;
                $statement = "";
            } else {
                print $outhandle "DBPlug: skipping statement `$statement'\n";
            }
            $statement = "";
        }
    }
    close $conf_fh;

    if (!defined($db)) {
        print $outhandle "DBPlug: error: $filename does not specify a db!\n";
        return 0;
    }
    if (!defined($sql_query)) {
        print $outhandle "DBPlug: error: no SQL query specified!\n";
        return 0;
    }
    # connect to database
    my $dbhandle=DBI->connect($db, $username, $password);

    if (!defined($dbhandle)) {
        die "DBPlug: could not connect to database, exiting.\n";
    }
    if (defined($dbplug_debug) && $dbplug_debug==1) {
        print $outhandle "DBPlug (debug): connected ok\n";
    }

    # prepare() yields undef on failure; check it before calling
    # methods on the result.
    my $statement_hand=$dbhandle->prepare($sql_query);
    if (!defined($statement_hand)) {
        print $outhandle "DBPlug: received error: \"" .
            $dbhandle->errstr . "\"\n";
        $dbhandle->disconnect();
        return 0;
    }
    if (!$statement_hand->execute) {
        print $outhandle "DBPlug: received error: \"" .
            $statement_hand->errstr . "\"\n";
        $statement_hand->finish();
        $dbhandle->disconnect();
        return 0;
    }

    # get the array-ref for the field names and cast it to array
    my @field_names;
    @field_names=@{ $statement_hand->{NAME} };

    # The loop variable aliases the array element, so the assignment at
    # the bottom renames the entry of @field_names in place.
    foreach my $fieldname (@field_names) {
        if (defined($db_to_greenstone_fields{$fieldname})) {
            if (defined($dbplug_debug) && $dbplug_debug==1) {
                print $outhandle "DBPlug (debug): mapping db field "
                    . "'$fieldname' to "
                    . $db_to_greenstone_fields{$fieldname} . "\n";
            }
            $fieldname=$db_to_greenstone_fields{$fieldname};
        }
    }

# print "DBPlug: names: " . join (", ", @field_names) . ".\n";
    # get rows

    my $count = 0;
    my @row_array;

    @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref?

    while (scalar(@row_array)) {
        if (defined($dbplug_debug) && $dbplug_debug==1) {
            print $outhandle "DBPlug (debug): retrieved a row from query\n";
        }

        # create a new document
        # (direct method call: this package defines its own new(), which
        # makes the indirect-object form "new doc (...)" ambiguous)
        my $doc_obj = doc->new($filename, "indexed_doc");
        $doc_obj->set_OIDtype ($processor->{'OIDtype'});
        my $cursection = $doc_obj->get_top_section();

        # if $language not set in config file, will use BasPlug's default
        if (defined($language)) {
            $doc_obj->add_utf8_metadata($cursection, "Language", $language);
        }
        # if $encoding not set in config file, will use BasPlug's default
        if (defined($encoding)) {
            # allow some common aliases
            if ($encoding =~ m/^utf[-_]8$/i) {$encoding="utf8"}
            $encoding =~ s/-/_/g; # greenstone uses eg iso_8859_1
            $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);
        }
        $doc_obj->add_utf8_metadata($cursection,
                                    "Source", &ghtml::dmsafe($db));
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");

        $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "DB");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $cursection,
                               $metadata);

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        my $unique_id=undef;

        # Columns are consumed in the same order as the field names.
        foreach my $fieldname (@field_names) {
            my $fielddata=shift @row_array;
            # use the specified encoding, defaulting to utf-8
            if (defined($encoding) && $encoding ne "ascii"
                && $encoding ne "utf8") {
                $fielddata=&unicode::unicode2utf8(
                    &unicode::convert2unicode($encoding, \$fielddata)
                );
            }
            # see if we have a ****_callback() function defined
            if (exists $callbacks{$fieldname}) {
                my $funcptr = $callbacks{$fieldname};
                $fielddata = $funcptr->($fielddata);
            }

            if ($fieldname eq "text") {
                # add as document text
                $fielddata=~s@<@&lt;@g;
                $fielddata=~s@>@&gt;@g; # for xml protection...
                $fielddata=~s@_@\\_@g; # for macro language protection...
                $doc_obj->add_utf8_text($cursection, $fielddata);
            } elsif ($fieldname eq "Identifier") {
                # use as greenstone's unique record id
                if ($fielddata =~ /^\d+$/) {
                    # don't allow IDs that are completely numeric
                    $unique_id="id" . $fielddata;
                } else {
                    $unique_id=$fielddata;
                }
            } else {
                # add as document metadata
                $fielddata=~s/\[/&#91;/g;
                $fielddata=~s/\]/&#93;/g;
                $doc_obj->add_utf8_metadata($cursection,
                                            $fieldname, $fielddata);

            }
        }

        if (!defined $unique_id) {
            # no Identifier column: derive an id from the config file's
            # hash and the row number
            $doc_obj->set_OID($hash . "s$count");
        } else {
            # use our id from the database...
            $doc_obj->set_OID($unique_id);
        }


        # process the document
        $processor->process($doc_obj);


        $count++;

        # get next row
        @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref?
    } # end of row_array is not empty

    # check "$sth->err" if empty array for error
    if ($statement_hand->err) {
        print $outhandle "DBPlug: received error: \"" .
            $statement_hand->errstr . "\"\n";
    }

    # clean up connection to database
    $statement_hand->finish();
    $dbhandle->disconnect();

    # num of input files, rather than documents created?
    $self->{'num_processed'}++;

    if (defined($dbplug_debug) && $dbplug_debug==1) {
        print $outhandle "DBPlug: imported $count DB records as documents.\n";
    }
    return $count;
}
358
3591;
Note: See TracBrowser for help on using the repository browser.