source: gsdl/trunk/perllib/dbutil.pm@ 17110

Last change on this file since 17110 was 17105, checked in by mdewsnip, 16 years ago

Not sure why "gdbm-txtgz" was made the default, particularly since collections built with this don't work for me. Changing default back to "gdbm".

File size: 12.3 KB
Line 
1###########################################################################
2#
3# dbutil.pm -- utility functions for writing to different databases
4# Copyright (C) 2008 DL Consulting Ltd
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package dbutil;
27
28use strict;
29
30
# Opens a write handle onto a collection's info database, delegating to the
# backend selected by $infodb_type.
#
# Arguments:
#   $infodb_type      - "sqlite", "gdbm-txtgz", or anything else (handled as GDBM)
#   $infodb_file_path - path of the database file to be written
#
# Returns whatever the selected backend's opener returns.
sub open_infodb_write_handle
{
    my ($infodb_type, $infodb_file_path) = @_;

    return open_infodb_write_handle_sqlite($infodb_file_path)
        if ($infodb_type eq "sqlite");

    return open_infodb_write_handle_gdbm_txtgz($infodb_file_path)
        if ($infodb_type eq "gdbm-txtgz");

    # An empty or unrecognised infodb type falls through to plain GDBM
    return open_infodb_write_handle_gdbm($infodb_file_path);
}
48
49
# Closes a write handle previously opened for the given infodb type,
# delegating to the matching backend.
#
# Arguments:
#   $infodb_type   - "sqlite", "gdbm-txtgz", or anything else (handled as GDBM)
#   $infodb_handle - the handle to close
#
# Returns whatever the selected backend's closer returns.
sub close_infodb_write_handle
{
    my ($infodb_type, $infodb_handle) = @_;

    return close_infodb_write_handle_sqlite($infodb_handle)
        if ($infodb_type eq "sqlite");

    return close_infodb_write_handle_gdbm_txtgz($infodb_handle)
        if ($infodb_type eq "gdbm-txtgz");

    # An empty or unrecognised infodb type falls through to plain GDBM
    return close_infodb_write_handle_gdbm($infodb_handle);
}
67
68
# Names the infodb type to use when a collection does not choose one.
sub get_default_infodb_type
{
    # GDBM remains the default so that existing collections keep working
    # unchanged. A collection selects a different backend by setting
    # "infodbtype" in its collect.cfg file.
    my $default_infodb_type = "gdbm";

    return $default_infodb_type;
}
75
76
# Works out the path of a collection's info database file for the given
# infodb type, delegating to the matching backend.
#
# Arguments:
#   $infodb_type           - "sqlite", "gdbm-txtgz", or anything else (handled as GDBM)
#   $collection_name       - name of the collection
#   $infodb_directory_path - directory that holds the database file
#
# Returns whatever the selected backend's path builder returns.
sub get_infodb_file_path
{
    my ($infodb_type, $collection_name, $infodb_directory_path) = @_;

    if ($infodb_type eq "sqlite")
    {
        return get_infodb_file_path_sqlite($collection_name, $infodb_directory_path);
    }

    if ($infodb_type eq "gdbm-txtgz")
    {
        return get_infodb_file_path_gdbm_txtgz($collection_name, $infodb_directory_path);
    }

    # An empty or unrecognised infodb type falls through to plain GDBM
    return get_infodb_file_path_gdbm($collection_name, $infodb_directory_path);
}
95
96
# Reads an info database file into the supplied hash reference, delegating
# to the backend selected by $infodb_type.
#
# Arguments:
#   $infodb_type      - "sqlite", "gdbm-txtgz", or anything else (handled as GDBM)
#   $infodb_file_path - path of the database file to read
#   $infodb_map       - hash reference handed to the backend reader
#
# Returns whatever the selected backend's reader returns.
sub read_infodb_file
{
    my ($infodb_type, $infodb_file_path, $infodb_map) = @_;

    return read_infodb_file_sqlite($infodb_file_path, $infodb_map)
        if ($infodb_type eq "sqlite");

    return read_infodb_file_gdbm_txtgz($infodb_file_path, $infodb_map)
        if ($infodb_type eq "gdbm-txtgz");

    # An empty or unrecognised infodb type falls through to plain GDBM
    return read_infodb_file_gdbm($infodb_file_path, $infodb_map);
}
115
116
# Writes one entry to an open info database handle, delegating to the
# backend selected by $infodb_type.
#
# Arguments:
#   $infodb_type   - "sqlite", "gdbm-txtgz", or anything else (handled as GDBM)
#   $infodb_handle - write handle obtained from open_infodb_write_handle
#   $infodb_key    - key of the entry
#   $infodb_map    - hash reference mapping each metadata element name to an
#                    array reference of values
#
# Returns whatever the selected backend's writer returns.
sub write_infodb_entry
{
    my ($infodb_type, $infodb_handle, $infodb_key, $infodb_map) = @_;

    return write_infodb_entry_sqlite($infodb_handle, $infodb_key, $infodb_map)
        if ($infodb_type eq "sqlite");

    return write_infodb_entry_gdbm_txtgz($infodb_handle, $infodb_key, $infodb_map)
        if ($infodb_type eq "gdbm-txtgz");

    # An empty or unrecognised infodb type falls through to plain GDBM
    return write_infodb_entry_gdbm($infodb_handle, $infodb_key, $infodb_map);
}
136
137
138
139# -----------------------------------------------------------------------------
140# GDBM TXT-GZ IMPLEMENTATION
141# -----------------------------------------------------------------------------
142
# Opens a write handle whose output is gzip-compressed into $infodb_file_path.
#
# The infodb is kept in a GDBM-neutral form: a compressed text file, ready
# for txt2db to be run on it later (i.e. by the runtime system, the first
# time the collection is accessed). This makes it easier to distribute
# pre-built collections to various architectures.
#
# NB: even if two architectures are both little endian (e.g. Intel and ARM
# processors), GDBM does *not* guarantee that a database generated on one
# will work on the other.
#
# Returns the pipe handle, or undef if the pipe could not be opened.
sub open_infodb_write_handle_gdbm_txtgz
{
    my ($infodb_file_path) = @_;

    # Greenstone ships with gzip for Windows, on $PATH
    my $gzip_pipe_spec = "| gzip - > \"$infodb_file_path\"";

    my $txtgz_handle;
    open($txtgz_handle, $gzip_pipe_spec) or return undef;

    return $txtgz_handle;
}
166
167
# Closes a handle opened by open_infodb_write_handle_gdbm_txtgz.
#
# Returns the result of close().
sub close_infodb_write_handle_gdbm_txtgz
{
    my ($txtgz_handle) = @_;

    return close($txtgz_handle);
}
174
175
# Builds the path of the compressed-text infodb file for a collection:
# the tail of the collection name plus ".txt.gz", inside the given directory.
sub get_infodb_file_path_gdbm_txtgz
{
    my ($collection_name, $infodb_directory_path) = @_;

    my $collection_tail = util::get_dirsep_tail($collection_name);

    return util::filename_cat($infodb_directory_path, $collection_tail . ".txt.gz");
}
184
185
# Reads a gzip-compressed text infodb file into $infodb_map.
#
# Arguments:
#   $infodb_file_path - path of the .txt.gz file to read
#   $infodb_map       - hash reference; receives one entry per record, keyed
#                       by the text found between "[" and "]" on the record's
#                       header line, with the record's remaining lines
#                       (newlines included) as the value
#
# A record ends at a line consisting of exactly 70 hyphens.
# Dies if the pipe from gzip cannot be opened. Returns the result of close().
sub read_infodb_file_gdbm_txtgz
{
    my $infodb_file_path = shift(@_);
    my $infodb_map = shift(@_);

    # --stdout is essential: without it gzip decompresses the file in place
    # (replacing the .txt.gz on disk) and writes nothing to the pipe.
    # The shell-string form of the pipe is kept, as in the rest of this file.
    my $cmd = "gzip --decompress --stdout \"$infodb_file_path\"";

    # A lexical handle avoids sharing a global bareword handle with other readers
    open (my $pipe_in, "$cmd |")
        || die "Error: Couldn't open pipe from gzip: $!\n $cmd\n";

    my $infodb_line = "";
    my $infodb_key = "";
    my $infodb_value = "";
    while (defined ($infodb_line = <$pipe_in>))
    {
        if ($infodb_line =~ /^\[([^\]]+)\]$/)
        {
            # Header line: start of a new record
            $infodb_key = $1;
        }
        elsif ($infodb_line =~ /^-{70}$/)
        {
            # Separator line: store the finished record and reset
            $infodb_map->{$infodb_key} = $infodb_value;
            $infodb_key = "";
            $infodb_value = "";
        }
        else
        {
            $infodb_value .= $infodb_line;
        }
    }

    close ($pipe_in);
}
219
220
# Writes one infodb record in the text format consumed by txt2db:
#
#   [key]
#   <element>value        (one line per value of each element)
#   ------...------       (70 hyphens terminate the record)
#
# Arguments:
#   $infodb_handle - handle to print to
#   $infodb_key    - key of the record
#   $infodb_map    - hash reference mapping each element name to an array
#                    reference of values; it is not modified
#
# Returns the result of the final print.
sub write_infodb_entry_gdbm_txtgz
{
    my $infodb_handle = shift(@_);
    my $infodb_key = shift(@_);
    my $infodb_map = shift(@_);

    print $infodb_handle "[$infodb_key]\n";
    foreach my $infodb_value_key (keys(%$infodb_map))
    {
        foreach my $infodb_value (@{$infodb_map->{$infodb_value_key}})
        {
            # The loop variable aliases the caller's array element, so escape
            # a copy rather than rewriting the caller's metadata in place
            my $output_value = $infodb_value;
            if ($output_value =~ /-{70,}/)
            {
                # if value contains 70 or more hyphens in a row we need to escape them
                # to prevent txt2db from treating them as a separator
                $output_value =~ s/-/&\#045;/g;
            }
            print $infodb_handle "<$infodb_value_key>" . $output_value . "\n";
        }
    }
    print $infodb_handle '-' x 70, "\n";
}
244
245
246
247# -----------------------------------------------------------------------------
248# GDBM IMPLEMENTATION
249# -----------------------------------------------------------------------------
250
# Opens a pipe into the txt2db executable, which is given $infodb_file_path
# as its argument.
#
# The executable is looked up under $GSDLHOME/bin/$GSDLOS.
#
# Returns the pipe handle, or undef if txt2db does not exist or the pipe
# could not be opened.
sub open_infodb_write_handle_gdbm
{
    my ($infodb_file_path) = @_;

    my $txt2db_name = "txt2db" . util::get_os_exe();
    my $txt2db_exe = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, $txt2db_name);

    # Nothing to pipe into if the converter is missing
    return undef unless (-e "$txt2db_exe");

    my $txt2db_handle;
    open($txt2db_handle, "| \"$txt2db_exe\" \"$infodb_file_path\"") or return undef;

    return $txt2db_handle;
}
264
265
# Closes a handle opened by open_infodb_write_handle_gdbm.
#
# Returns the result of close().
sub close_infodb_write_handle_gdbm
{
    my ($txt2db_handle) = @_;

    return close($txt2db_handle);
}
272
273
# Builds the path of the GDBM infodb file for a collection: the tail of the
# collection name plus ".ldb" when util::is_little_endian() is true and
# ".bdb" otherwise, inside the given directory.
sub get_infodb_file_path_gdbm
{
    my ($collection_name, $infodb_directory_path) = @_;

    my $gdbm_extension;
    if (util::is_little_endian())
    {
        $gdbm_extension = ".ldb";
    }
    else
    {
        $gdbm_extension = ".bdb";
    }

    my $gdbm_file_name = util::get_dirsep_tail($collection_name) . $gdbm_extension;

    return util::filename_cat($infodb_directory_path, $gdbm_file_name);
}
283
284
# Reads a GDBM infodb file into $infodb_map by parsing the text output of
# the db2txt program (which must be runnable as "db2txt").
#
# Arguments:
#   $infodb_file_path - path of the GDBM database file to read
#   $infodb_map       - hash reference; receives one entry per record, keyed
#                       by the text found between "[" and "]" on the record's
#                       header line, with the record's remaining lines
#                       (newlines included) as the value
#
# A record ends at a line consisting of exactly 70 hyphens.
# Dies if the pipe from db2txt cannot be opened. Returns the result of close().
sub read_infodb_file_gdbm
{
    my $infodb_file_path = shift(@_);
    my $infodb_map = shift(@_);

    # A lexical handle avoids sharing a global bareword handle with other readers
    open (my $pipe_in, "db2txt \"$infodb_file_path\" |") || die "couldn't open pipe from db2txt\n";

    my $infodb_line = "";
    my $infodb_key = "";
    my $infodb_value = "";
    while (defined ($infodb_line = <$pipe_in>))
    {
        if ($infodb_line =~ /^\[([^\]]+)\]$/)
        {
            # Header line: start of a new record
            $infodb_key = $1;
        }
        elsif ($infodb_line =~ /^-{70}$/)
        {
            # Separator line: store the finished record and reset
            $infodb_map->{$infodb_key} = $infodb_value;
            $infodb_key = "";
            $infodb_value = "";
        }
        else
        {
            $infodb_value .= $infodb_line;
        }
    }

    close ($pipe_in);
}
314
315
# Writes one infodb record to a handle opened by open_infodb_write_handle_gdbm.
#
# Once the handle is set up, the record format is the same as for the
# "gdbm-txtgz" backend, so the work is handed to that writer.
sub write_infodb_entry_gdbm
{
    my ($infodb_handle, $infodb_key, $infodb_map) = @_;

    return write_infodb_entry_gdbm_txtgz($infodb_handle, $infodb_key, $infodb_map);
}
321
322
323
324# -----------------------------------------------------------------------------
325# SQLITE IMPLEMENTATION
326# -----------------------------------------------------------------------------
327
# Opens a pipe into the sqlite3 executable for $infodb_file_path and primes
# it with the schema and an open transaction.
#
# The executable is looked up under $GSDLHOME/bin/$GSDLOS.
#
# Returns the pipe handle, or undef if sqlite3 does not exist or the pipe
# could not be opened.
sub open_infodb_write_handle_sqlite
{
    my ($infodb_file_path) = @_;

    my $sqlite3_name = "sqlite3" . util::get_os_exe();
    my $sqlite3_exe = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, $sqlite3_name);

    return undef unless (-e "$sqlite3_exe");

    my $sqlite_handle;
    open($sqlite_handle, "| \"$sqlite3_exe\" \"$infodb_file_path\"") or return undef;

    my @setup_statements = (
        "CREATE TABLE IF NOT EXISTS data (key TEXT PRIMARY KEY, value TEXT);\n",
        "CREATE TABLE IF NOT EXISTS document_metadata (id INTEGER PRIMARY KEY, docOID TEXT, element TEXT, value TEXT);\n",

        # This index is crucial for efficiency when importing large amounts of data
        "CREATE INDEX IF NOT EXISTS dmd ON document_metadata(docOID);\n",

        # Very important for efficiency: without a transaction each command
        # would be actioned one at a time
        "BEGIN TRANSACTION;\n",
    );
    foreach my $setup_statement (@setup_statements)
    {
        print $sqlite_handle $setup_statement;
    }

    return $sqlite_handle;
}
350
351
# Finishes a SQLite write session started by open_infodb_write_handle_sqlite
# and closes the handle.
#
# Returns the result of close().
sub close_infodb_write_handle_sqlite
{
    my ($sqlite_handle) = @_;

    print $sqlite_handle
        # Ends the transaction that was begun right after opening the file
        "END TRANSACTION;\n",
        # This index is crucial for efficient queries on the database!
        "CREATE INDEX IF NOT EXISTS dme ON document_metadata(element);\n";

    return close($sqlite_handle);
}
364
365
# Builds the path of the SQLite infodb file for a collection: the tail of
# the collection name plus ".db", inside the given directory.
sub get_infodb_file_path_sqlite
{
    my ($collection_name, $infodb_directory_path) = @_;

    my $sqlite_file_name = util::get_dirsep_tail($collection_name) . ".db";

    return util::filename_cat($infodb_directory_path, $sqlite_file_name);
}
375
376
# Placeholder for reading a SQLite infodb file into $infodb_map.
#
# Not implemented yet: the arguments are unpacked but nothing is read and
# the map is left untouched.
sub read_infodb_file_sqlite
{
    my $infodb_file_path = shift;
    my $infodb_map = shift;

    # !! TO IMPLEMENT
}
384
385
# Emits the SQL statements that store one infodb entry.
#
# Arguments:
#   $infodb_handle - handle to print the SQL to
#   $infodb_key    - key of the entry
#   $infodb_map    - hash reference mapping each metadata element name to an
#                    array reference of values
#
# The whole entry is always written to the "data" table. For document
# entries, the metadata of interest is also written, one row per value, to
# the "document_metadata" table (for use by the dynamic classifiers).
sub write_infodb_entry_sqlite
{
    my ($infodb_handle, $infodb_key, $infodb_map) = @_;

    # Serialise the entry as "<element>value" lines for the "data" table
    my $serialized_entry = "";
    foreach my $element_name (keys(%$infodb_map))
    {
        foreach my $element_value (@{$infodb_map->{$element_name}})
        {
            $serialized_entry .= "<$element_name>" . $element_value . "\n";
        }
    }

    my $quoted_key = sqlite_safe($infodb_key);
    print $infodb_handle sprintf("INSERT OR REPLACE INTO data (key, value) VALUES ('%s', '%s');\n",
                                 $quoted_key, sqlite_safe($serialized_entry));

    # Only entries whose key contains no "." and whose doctype is "doc" are
    # treated as documents
    if ($infodb_key !~ /\./ && $serialized_entry =~ /\<doctype\>doc\n/)
    {
        # Clear out rows from any earlier write of this document
        print $infodb_handle sprintf("DELETE FROM document_metadata WHERE docOID='%s';\n", $quoted_key);

        # Most of the automatically added document metadata is of no interest here
        my %is_automatic_metadata = map { $_ => 1 } qw(
            archivedir assocfilepath childtype contains docnum doctype
            Encoding FileSize hascover hastxt lastmodified metadataset thistype
        );

        foreach my $element_name (keys(%$infodb_map))
        {
            next if ($is_automatic_metadata{$element_name});
            next if ($element_name =~ /^metadatafreq\-/);
            next if ($element_name =~ /^metadatalist\-/);

            foreach my $element_value (@{$infodb_map->{$element_name}})
            {
                print $infodb_handle sprintf("INSERT INTO document_metadata (docOID, element, value) VALUES ('%s', '%s', '%s');\n",
                                             $quoted_key, sqlite_safe($element_name), sqlite_safe($element_value));
            }
        }
    }
}
437
438
# Makes a value safe to embed inside a single-quoted SQL string literal by
# doubling every single quote it contains.
#
# Returns the escaped copy; the argument itself is not modified.
sub sqlite_safe
{
    my ($escaped_value) = @_;

    $escaped_value =~ s{'}{''}g;

    return $escaped_value;
}
448
449
4501;
Note: See TracBrowser for help on using the repository browser.