1 | ###########################################################################
|
---|
2 | #
|
---|
3 | # DBPlug.pm -- plugin to import records from a database
|
---|
4 | #
|
---|
5 | # A component of the Greenstone digital library software
|
---|
6 | # from the New Zealand Digital Library Project at the
|
---|
7 | # University of Waikato, New Zealand.
|
---|
8 | #
|
---|
9 | # Copyright (C) 2003 New Zealand Digital Library Project
|
---|
10 | #
|
---|
11 | # This program is free software; you can redistribute it and/or modify
|
---|
12 | # it under the terms of the GNU General Public License as published by
|
---|
13 | # the Free Software Foundation; either version 2 of the License, or
|
---|
14 | # (at your option) any later version.
|
---|
15 | #
|
---|
16 | # This program is distributed in the hope that it will be useful,
|
---|
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
19 | # GNU General Public License for more details.
|
---|
20 | #
|
---|
21 | # You should have received a copy of the GNU General Public License
|
---|
22 | # along with this program; if not, write to the Free Software
|
---|
23 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
24 | #
|
---|
25 | ###########################################################################
|
---|
26 |
|
---|
27 | #
|
---|
28 | # See <GSDLHOME>/etc/packages/example.dbi for an example config file!!
|
---|
29 | #
|
---|
30 |
|
---|
31 | # Written by John McPherson for the NZDL project
|
---|
32 | # Mar, Apr 2003
|
---|
33 |
|
---|
34 | package DBPlug;
|
---|
35 |
|
---|
36 | use BasPlug;
|
---|
37 | use unicode;
|
---|
38 | use parsargv;
|
---|
39 |
|
---|
40 | use DBI; # database independent stuff
|
---|
41 |
|
---|
# Set up inheritance at compile time: DBPlug is a subclass of BasPlug.
BEGIN {
    @ISA = qw(BasPlug);
}
|
---|
45 |
|
---|
# Argument descriptions for this plugin.  The only argument is
# process_exp, whose default comes from get_default_process_exp().
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
];

# Option record describing the plugin as a whole; new() appends it to
# the object's option_list.
my $options = {
    'name'     => 'DBPlug',
    'desc'     => '{DBPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
|
---|
58 |
|
---|
# Constructor.  Builds the underlying BasPlug object, marks it as a
# DBPlug, appends this plugin's option record to the object's option
# list and blesses the result into the requested class.
sub new {
    my ($class) = @_;

    # @_ is not shifted above, so it still starts with the class name;
    # the class name is therefore handed to BasPlug twice, as before.
    my $self = BasPlug->new($class, @_);
    $self->{'plugin_type'} = "DBPlug";

    my $registered_options = $self->{'option_list'};
    push @{$registered_options}, $options;

    # no plugin-specific options
    #    if (!parsargv::parse(\@_, "allow_extra_options")) {
    #        $self->print_txt_usage("");  # Use default resource bundle
    #        die "\n";
    #    }

    return bless $self, $class;
}
|
---|
75 |
|
---|
# Returns the default regular expression (as a string) for the files
# this plugin accepts: names ending in ".dbi", matched
# case-insensitively.  Any arguments are ignored, so it can be called
# either as a method or as a plain function.
sub get_default_process_exp {
    return '(?i)\.dbi$';
}
|
---|
# Deliberately a no-op: this plugin has no per-document process() step.
# read() builds the documents itself and passes each one to the
# processor.
sub process {
    return;
}
|
---|
85 |
|
---|
86 |
|
---|
# Reads a ".dbi" config file, runs the SQL query it specifies and turns
# every row of the result into one document that is handed to
# $processor->process().
#
# The config file is Perl code evaluated inside this sub.  It may assign
# the lexicals declared below:
#   $db, $sql_query           - required
#   $username, $password      - default to ''
#   $language, $encoding      - optional, added as document metadata
#   $dbplug_debug             - set to 1 for debug output
#   %db_to_greenstone_fields  - maps database column names to field names
# If a text_callback() sub is defined (for instance by the config file),
# it is applied to the data of the "text" field.
#
# Column handling (after any renaming through %db_to_greenstone_fields):
#   "text"       - becomes the document text
#   "Identifier" - becomes the document OID (prefixed with "id" if numeric)
#   anything else is stored as metadata under the field name
#
# Arguments (after $self):
#   $pluginfo  - not used by this plugin
#   $base_dir  - prepended to $file when it contains a word character
#   $file      - candidate config file
#   $metadata  - passed on to extra_metadata() for every document
#   $processor - supplies 'OIDtype' and receives each document
#   $maxdocs   - not used by this plugin
#
# Returns undef when the file does not match process_exp or is not a
# plain file, 0 when the config file is unusable or the query cannot be
# prepared/executed, otherwise the number of rows processed.
# Dies when the database connection cannot be established.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    # see if we can handle the passed file...
    my $filename = $file;
    $filename = util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        # this plugin can't process this file type...
        return undef;
    }

    my $outhandle = $self->{'outhandle'};
    my $verbosity = $self->{'verbosity'};

    print $outhandle "DBPlug: processing $file\n"
        if $verbosity > 1;

    # calculate the document hash, for document ids
    my $hash = "";

    my $osexe = util::get_os_exe();
    my $hashfile_exe = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                          $ENV{'GSDLOS'}, "hashfile$osexe");
    if (-e $hashfile_exe) {
        my $hash_output = `hashfile$osexe \"$filename\"`;
        # Only use the capture when the match succeeded; otherwise $1
        # would hold whatever an earlier match left behind.
        if (defined($hash_output) && $hash_output =~ /:\s*([0-9a-f]+)/i) {
            $hash = "HASH$1";
        }
        else {
            $hash = "HASH";
        }
    }

    # default options - may be overridden by config file
    my $language = undef;
    my $encoding = undef;
    my $dbplug_debug = 0;
    my $username = '';
    my $password = '';

    # these settings must be set by the config file:
    my $db = undef;

    # Original author's note (describes an example query):
    # get id of pages from "nonempty", get latest version number from
    # "recent", and then get pagename from "page" and content from
    # "version" !
    my $sql_query = undef;

    my %db_to_greenstone_fields = ();

    # read in config file (without shelling out, so this also works where
    # there is no "cat" and for file names containing spaces).
    my $config_code;
    if (open(my $config_fh, '<', $filename)) {
        local $/ = undef;    # slurp the whole file
        $config_code = <$config_fh>;
        close($config_fh);
    }
    else {
        print $outhandle "DBPlug: error: could not read $filename: $!\n";
        return 0;
    }
    $config_code = '' unless defined($config_code);

    # NOTE(review): the config file is executed as Perl code.  String
    # eval is required so that it can assign the lexicals declared above,
    # which means only trusted .dbi files must be placed in the import
    # directory.
    eval $config_code;
    if ($@) {
        print $outhandle "DBPlug: error: could not evaluate $filename: $@\n";
        return 0;
    }

    if (!defined($db)) {
        print $outhandle "DBPlug: error: $filename does not specify a db!\n";
        return 0;
    }
    if (!defined($sql_query)) {
        print $outhandle "DBPlug: error: no SQL query specified!\n";
        return 0;
    }

    my $debug = (defined($dbplug_debug) && $dbplug_debug == 1);

    # connect to database
    my $dbhandle = DBI->connect($db, $username, $password);

    if (!defined($dbhandle)) {
        die "DBPlug: could not connect to database, exiting.\n";
    }
    if ($debug) {
        print $outhandle "DBPlug (debug): connected ok\n";
    }

    # prepare() and execute() return undef on failure; report that
    # instead of failing later with an unrelated message.
    my $statement_hand = $dbhandle->prepare($sql_query);
    if (!defined($statement_hand)) {
        print $outhandle "DBPlug: error: could not prepare SQL query: "
            . ($dbhandle->errstr || "unknown error") . "\n";
        $dbhandle->disconnect();
        return 0;
    }
    if (!$statement_hand->execute) {
        print $outhandle "DBPlug: error: could not execute SQL query: "
            . ($statement_hand->errstr || "unknown error") . "\n";
        $statement_hand->finish();
        $dbhandle->disconnect();
        return 0;
    }

    # get the array-ref for the field names and copy it into an array
    my @field_names = @{ $statement_hand->{NAME} };

    # $fieldname aliases the array element, so assigning to it renames
    # the entry in @field_names.
    foreach my $fieldname (@field_names) {
        if (defined($db_to_greenstone_fields{$fieldname})) {
            if ($debug) {
                print $outhandle "DBPlug (debug): mapping db field "
                    . "'$fieldname' to "
                    . $db_to_greenstone_fields{$fieldname} . "\n";
            }
            $fieldname = $db_to_greenstone_fields{$fieldname};
        }
    }

    # get rows; fetchrow_array returns an empty list when there are no
    # more rows (or on error, which is checked after the loop)
    my $count = 0;

    while (my @row_array = $statement_hand->fetchrow_array) {
        if ($debug) {
            print $outhandle "DBPlug (debug): retrieved a row from query\n";
        }

        # create a new document
        my $doc_obj = doc->new($filename, "indexed_doc");
        $doc_obj->set_OIDtype($processor->{'OIDtype'});
        my $cursection = $doc_obj->get_top_section();

        if (defined($language)) {
            # if not set in config file, will use BasPlug's default
            $doc_obj->add_utf8_metadata($cursection, "Language", $language);
        }
        if (defined($encoding)) {
            # if not set in config file, will use BasPlug's default
            $doc_obj->add_utf8_metadata($cursection, "Encoding", $encoding);
        }
        $doc_obj->add_utf8_metadata($cursection,
                                    "Source", ghtml::dmsafe($db));
        if ($self->{'cover_image'}) {
            $self->associate_cover_image($doc_obj, $filename);
        }
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(),
                                    "$self->{'plugin_type'}", "1");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata($doc_obj, $cursection, $metadata);

        # do any automatic metadata extraction
        $self->auto_extract_metadata($doc_obj);

        my $unique_id = undef;

        foreach my $fieldname (@field_names) {
            my $fielddata = shift @row_array;

            # use the specified encoding, defaulting to utf-8
            if (defined($encoding) && $encoding ne "ascii"
                && $encoding ne "utf8") {
                $fielddata = unicode::unicode2utf8(
                    unicode::convert2unicode($encoding, \$fielddata)
                );
            }

            if ($fieldname eq "text") {
                # see if we have a text_callback() function defined
                if (defined(&text_callback)) {
                    $fielddata = text_callback($fielddata);
                }
                # add as document text
                $fielddata =~ s/</&lt;/g;
                $fielddata =~ s/>/&gt;/g;    # for xml protection...
                $fielddata =~ s/_/\\_/g;     # for macro language protection...
                $doc_obj->add_utf8_text($cursection, $fielddata);
            }
            elsif ($fieldname eq "Identifier") {
                # use as greenstone's unique record id
                if ($fielddata =~ /^\d+$/) {
                    # don't allow IDs that are completely numeric
                    $unique_id = "id" . $fielddata;
                }
                else {
                    $unique_id = $fielddata;
                }
            }
            else {
                # add as document metadata, with square brackets written
                # as numeric character entities
                $fielddata =~ s/\[/&#91;/g;
                $fielddata =~ s/\]/&#93;/g;
                $doc_obj->add_utf8_metadata($cursection,
                                            $fieldname, $fielddata);
            }
        }

        if (!defined($unique_id)) {
            # no Identifier column: derive an id from the config file's
            # hash and the row number
            $doc_obj->set_OID($hash . "s$count");
        }
        else {
            # use our id from the database...
            $doc_obj->set_OID($unique_id);
        }

        # process the document
        $processor->process($doc_obj);

        $count++;
    }

    # an empty row list also ends the loop on error, so check for one
    if ($statement_hand->err) {
        print $outhandle "received error: \""
            . $statement_hand->errstr . "\"\n";
    }

    # clean up connection to database
    $statement_hand->finish();
    $dbhandle->disconnect();

    # num of input files, rather than documents created?
    $self->{'num_processed'}++;

    return $count;
}
|
---|
288 |
|
---|
289 | 1;
|
---|