root/gsdl/trunk/perllib/plugins/DatabasePlugin.pm @ 18327

Revision 18327, 11.3 KB (checked in by ak19, 11 years ago)

Extra parameter to new doc(): the renaming method to be used on the file (base64 or URL encoding).

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# DatabasePlugin.pm -- plugin to import records from a database
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 2003 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# A plugin that imports records from a database. This uses perl's DBI module,
28# which includes back-ends for mysql, postgresql, comma separated values (CSV),
29# MS Excel, ODBC, sybase, etc... Extra modules may need to be installed to
30# use this. See <GSDLHOME>/etc/packages/example.dbi for an example config file.
31#
32
33# Written by John McPherson for the NZDL project
34# Mar, Apr 2003
35
36package DatabasePlugin;
37
38use strict;
39no strict 'refs'; # allow variable as a filehandle
40
41use AutoExtractMetadata;
42use unicode;
43
# Set up inheritance at compile time: DatabasePlugin derives from
# AutoExtractMetadata.
BEGIN {
    @DatabasePlugin::ISA = qw(AutoExtractMetadata);
}
47
# Arguments contributed by this plugin: only the expression selecting which
# files it processes, defaulting to the value of get_default_process_exp().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
];

# Option record for this plugin; new() pushes it onto the shared "OptList".
my $options = {
    'name'     => "DatabasePlugin",
    'desc'     => "{DatabasePlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
60
# Constructor.
#
# Appends this class name to $pluginlist and this plugin's argument and
# option definitions to the "ArgList"/"OptList" entries of $hashArgOptLists,
# builds the object with the AutoExtractMetadata constructor, and re-blesses
# the result into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = AutoExtractMetadata->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless($self, $class);
}
73
# Returns the default process_exp: a case-insensitive pattern matching
# filenames that end in ".dbi". Any arguments passed in are ignored.
sub get_default_process_exp {
    return q{(?i)\.dbi$};
}
79
# Import the records described by a .dbi configuration file.
#
# The configuration file is parsed line by line. Simple assignments to the
# lexical variables declared below (e.g. $db, $sql_query, $username,
# %db_to_greenstone_fields) and "sub <field>_callback { ... }" definitions
# are executed with string eval. The configured SQL is then run through DBI
# and every returned row becomes one document handed to
# $processor->process().
#
# Arguments (after $self): $pluginfo, $base_dir, $file, $block_hash,
# $metadata, $processor, $maxdocs, $total_count, $gli. Only $base_dir, $file,
# $metadata, $processor and $gli are used here.
#
# Returns:
#   undef  - this plugin cannot process the file, or an SQL statement failed
#   0      - the config file could not be read or is invalid/incomplete
#   count  - the number of rows turned into documents
# Dies if the database connection cannot be made, and calls exit(1) if a
# callback definition fails to compile.
#
# NOTE(review): the config file content is executed via string eval, and the
# callback "safety" check is only a word blacklist. Treat .dbi files as
# trusted code.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs,$total_count,$gli) = @_;

    # see if we can handle the passed file...
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    print STDERR "<Processing n='$file' p='DatabasePlugin'>\n" if ($gli);
    print $outhandle "DatabasePlugin: processing $file\n"
        if $self->{'verbosity'} > 1;

    require DBI; # database independent stuff

    # The string evals below compile in this sub's lexical scope, so the
    # names of the variables declared here form the config file's interface.
    # Do not rename or remove them (even the ones that look unused).

    # calculate the document hash, for document ids
    my $hash="0";

    # default options - may be overridden by config file
    my $language=undef;
    my $encoding=undef;
    my $dbplug_debug=0;
    my $username='';
    my $password='';

    # these settings must be set by the config file:
    my $db=undef;

    # get id of pages from "nonempty", get latest version number from
    # "recent", and then get pagename from "page" and content from "version" !

    my $sql_query_prime = undef ;
    my $sql_query = undef ;

    my %db_to_greenstone_fields=();
    my %callbacks=();

    # read in config file (lexical handle, explicit read mode).
    my $conf_fh;
    if (!open ($conf_fh, '<', $filename_full_path)) {
        print $outhandle "DatabasePlugin: can't read $filename_full_path: $!\n";
        return 0;
    }

    my $line;
    my $statement="";   # text accumulated since the last completed statement
    my $callback="";    # non-empty while inside a callback definition
    while (defined($line=<$conf_fh>)) {
        chomp $line;
        $line .= " "; # for multi-line statements - don't conjoin!
        # remove comments; note this strips from the first '#' to end of
        # line, including a '#' that appears inside a quoted string
        $line =~ s/\s*\#.*$//mg;
        $statement .= $line;

        if ($line =~ /^\}\s*$/ && $callback) { # ends the callback
            $callback .= $statement ; $statement = "";
            # try to check that the function is "safe"
            if ($callback  =~ /\b(?:system|open|pipe|readpipe|qx|kill|eval|do|use|require|exec|fork)\b/ ||
                $callback =~ /[\`]|\|\-/) {
                # no backticks or functions that start new processes allowed
                print $outhandle "DatabasePlugin: bad function in callback\n";
                close($conf_fh);
                return 0;
            }
            # turn "sub <field>_callback {...}" into an anonymous sub and
            # remember which field it belongs to
            $callback =~ s/sub (\w+?)_callback/sub/;
            my $fieldname = $1;
            my $ret = eval "\$callbacks{'$fieldname'} = $callback ; 1";
            if (!defined($ret)) {
                print $outhandle "DatabasePlugin: error eval'ing callback: $@\n";
                exit(1);
            }
            $callback="";
            print $outhandle "DatabasePlugin: callback registered for '$fieldname'\n"
                if $dbplug_debug;
        } elsif ($callback) {
            # add this line to the callback function
            $callback .= $statement;
            $statement = "";
        } elsif ($statement =~ m/;\s*$/) { # ends with ";"
            # check that it is safe
            # assignment: a scalar set to a number or a quoted string, or a
            # hash set to a simple parenthesised list
            if ($statement =~ m~(\$\w+)\s* = \s*
                (\d+    # digits
                 | ".*?(?<!\\)" # " up to the next " not preceded by a \
                 | '.*?(?<!\\)' # ' up to the next ' not preceded by a \
                )\s*;~x ||      # /x means ignore comments and whitespace in rx
                $statement =~ m~(\%\w+)\s*=\s*(\([\w\s\"\',:=>]+\))\s*;~ ) {
                # evaluate the assignment, return 1 on success "
                if (!eval "$1=$2; 1") {
                    my $err=$@;
                    chomp $err;
                    $err =~ s/\.$//; # remove a trailing .
                    print $outhandle "DatabasePlugin: error evaluating `$statement'\n";
                    print $outhandle " $err (in $filename_full_path)\n";
                    close($conf_fh);
                    return 0; # there was an error reading the config file
                }
            } elsif ($statement =~ /sub \w+_callback/) {
                # this is the start of a callback function definition
                $callback = $statement;
                $statement = "";
            } else {
                print $outhandle "DatabasePlugin: skipping statement `$statement'\n";
            }
            $statement = "";
        }
    }
    close($conf_fh);

    if (!defined($db)) {
        print $outhandle "DatabasePlugin: error: $filename_full_path does not specify a db!\n";
        return 0;
    }
    if (!defined($sql_query)) {
        print $outhandle "DatabasePlugin: error: no SQL query specified!\n";
        return 0;
    }

    # connect to database
    my $dbhandle=DBI->connect($db, $username, $password);

    if (!defined($dbhandle)) {
        die "DatabasePlugin: could not connect to database, exiting.\n";
    }
    if (defined($dbplug_debug) && $dbplug_debug==1) {
        print $outhandle "DatabasePlugin (debug): connected ok\n";
    }

    my $statement_hand;

    # The user gave 2 sql statements to execute?
    if ($sql_query_prime) {
        $statement_hand=$dbhandle->prepare($sql_query_prime);
        # prepare() yields undef on failure; don't call methods on it
        if (!defined($statement_hand)) {
            print $outhandle "Error: " . $dbhandle->errstr . "\n";
            $dbhandle->disconnect();
            return undef;
        }
        $statement_hand->execute;
        if ($statement_hand->err) {
            print $outhandle "Error: " . $statement_hand->errstr . "\n";
            $dbhandle->disconnect();
            return undef;
        }
    }

    $statement_hand=$dbhandle->prepare($sql_query);
    if (!defined($statement_hand)) {
        print $outhandle "Error:" . $dbhandle->errstr . "\n";
        $dbhandle->disconnect();
        return undef;
    }
    $statement_hand->execute;
    if ($statement_hand->err) {
        print $outhandle "Error:" . $statement_hand->errstr . "\n";
        $dbhandle->disconnect();
        return undef;
    }

    # get the array-ref for the field names and cast it to array
    my @field_names;
    @field_names=@{ $statement_hand->{NAME} };

    # Rename database fields to their configured Greenstone names. The loop
    # variable aliases the array elements, so this rewrites @field_names.
    foreach my $fieldname (@field_names) {
        if (defined($db_to_greenstone_fields{$fieldname})) {
            if (defined($dbplug_debug) && $dbplug_debug==1) {
                print $outhandle "DatabasePlugin (debug): mapping db field "
                    . "'$fieldname' to "
                    . $db_to_greenstone_fields{$fieldname} . "\n";
            }
            $fieldname=$db_to_greenstone_fields{$fieldname};
        }
    }

    # get rows

    my $count = 0;
    my @row_array;

    @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref?

    my $base_oid = undef;
    while (scalar(@row_array)) {
        if (defined($dbplug_debug) && $dbplug_debug==1) {
            print $outhandle "DatabasePlugin (debug): retrieved a row from query\n";
        }

        # create a new document
        my $doc_obj = doc->new($filename_full_path, "indexed_doc", $self->{'file_rename_method'});

        my $cursection = $doc_obj->get_top_section();

        # if $language not set in config file, will use BasePlugin's default
        if (defined($language)) {
            $doc_obj->add_utf8_metadata($cursection, "Language", $language);
        }
        # if $encoding not set in config file, will use BasePlugin's default
        if (defined($encoding)) {
            # allow some common aliases
            if ($encoding =~ m/^utf[-_]8$/i) {$encoding="utf8"}
            $encoding =~ s/-/_/g; # greenstone uses eg iso_8859_1
            $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);
        }
        $self->set_Source_metadata($doc_obj, $db, $encoding);

        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename_full_path);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");

        $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "DB");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $cursection,
                               $metadata);

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        my $unique_id=undef;

        # consume the row one column at a time, in field order
        foreach my $fieldname (@field_names) {
            my $fielddata=shift @row_array;

            if (! defined($fielddata) ) {
                next; # this field was "" or NULL
            }
            # use the specified encoding, defaulting to utf-8
            if (defined($encoding) && $encoding ne "ascii"
                && $encoding ne "utf8") {
                $fielddata=&unicode::unicode2utf8(
                    &unicode::convert2unicode($encoding, \$fielddata)
                );
            }
            # see if we have a ****_callback() function defined
            # (looked up by the field name after any mapping above)
            if (exists $callbacks{$fieldname}) {
                my $funcptr = $callbacks{$fieldname};
                $fielddata = $funcptr->($fielddata);
            }

            if ($fieldname eq "text") {
                # add as document text
                $fielddata=~s@<@&lt;@g;
                $fielddata=~s@>@&gt;@g; # for xml protection...
                $fielddata=~s@_@\\_@g; # for macro language protection...
                $doc_obj->add_utf8_text($cursection, $fielddata);
            } elsif ($fieldname eq "Identifier") {
                # use as greenstone's unique record id
                if ($fielddata =~ /^\d+$/) {
                    # don't allow IDs that are completely numeric
                    $unique_id="id" . $fielddata;
                } else {
                    $unique_id=$fielddata;
                }
            } else {
                # add as document metadata
                $fielddata=~s/\[/&#91;/g;
                $fielddata=~s/\]/&#93;/g;
                $doc_obj->add_utf8_metadata($cursection,
                                            $fieldname, $fielddata);
            }
        }

        if (!defined $unique_id) {
            # no Identifier column: derive the id from the OID of the first
            # such document plus the running row counter
            if (!defined $base_oid) {
                $self->add_OID($doc_obj);
                $base_oid = $doc_obj->get_OID();
            }
            $doc_obj->set_OID($base_oid."s$count");
        } else {
            # use our id from the database...
            $doc_obj->set_OID($unique_id);
        }

        # process the document
        $processor->process($doc_obj);

        $count++;

        # get next row
        @row_array=$statement_hand->fetchrow_array; # fetchrow_hashref?
    } # end of row_array is not empty

    # check "$sth->err" if empty array for error
    if ($statement_hand->err) {
        print $outhandle "DatabasePlugin: received error: \"" .
            $statement_hand->errstr . "\"\n";
    }

    # clean up connection to database
    $statement_hand->finish();
    $dbhandle->disconnect();

    # num of input files, rather than documents created?
    $self->{'num_processed'}++;

    if (defined($dbplug_debug) && $dbplug_debug==1) {
        print $outhandle "DatabasePlugin: imported $count DB records as documents.\n";
    }
    return $count;
}
374
3751;
Note: See TracBrowser for help on using the browser.