source: gsdl/trunk/perllib/dbutil.pm@ 17103

Last change on this file since 17103 was 17087, checked in by davidb, 16 years ago

Introduction of a new GDBM alternative to archives.inf as a step towards full incremental building. The information traditionally stored in archives.inf, PLUS additional information that will help with working out which files have changed since the last build and which doc-id each hashed to, is stored in two GDBM databases. For now these databases aren't read, but in the future ArchivesInfPlugin will be upgraded to use them in support of incremental building.

File size: 12.4 KB
Line 
1###########################################################################
2#
3# dbutil.pm -- utility functions for writing to different databases
4# Copyright (C) 2008 DL Consulting Ltd
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package dbutil;
27
28use strict;
29
30
# Open a write handle onto an info database of the requested type.
#
#   $infodb_type      - "sqlite", "gdbm", or anything else (including the
#                       empty string) for the gzipped-text fallback
#   $infodb_file_path - path of the database file to write
#
# Returns whatever the type-specific opener returns.
sub open_infodb_write_handle
{
    my ($infodb_type, $infodb_file_path) = @_;

    return open_infodb_write_handle_sqlite($infodb_file_path) if ($infodb_type eq "sqlite");
    return open_infodb_write_handle_gdbm($infodb_file_path)   if ($infodb_type eq "gdbm");

    # Any other (or empty) type: write the gzipped text form, ready for
    # later conversion to GDBM
    return open_infodb_write_handle_gdbm_txtgz($infodb_file_path);
}
47
48
# Close a write handle previously obtained from open_infodb_write_handle.
#
#   $infodb_type   - "sqlite", "gdbm", or anything else for the
#                    gzipped-text fallback
#   $infodb_handle - the handle to close
#
# Returns whatever the type-specific closer returns.
sub close_infodb_write_handle
{
    my ($infodb_type, $infodb_handle) = @_;

    return close_infodb_write_handle_sqlite($infodb_handle) if ($infodb_type eq "sqlite");
    return close_infodb_write_handle_gdbm($infodb_handle)   if ($infodb_type eq "gdbm");

    # Any other (or empty) type is treated as the gzipped text form
    return close_infodb_write_handle_gdbm_txtgz($infodb_handle);
}
65
66
# Name of the info database type used when none is specified:
# the gzipped text form.
sub get_default_infodb_type
{
    my $default_infodb_type = "gdbm-txtgz";
    return $default_infodb_type;
}
71
72
# Work out the path of the info database file for a collection.
#
#   $infodb_type           - "sqlite", "gdbm", or anything else for the
#                            gzipped-text fallback
#   $collection_name       - name of the collection
#   $infodb_directory_path - directory that holds the database file
#
# Returns whatever the type-specific path builder returns.
sub get_infodb_file_path
{
    my ($infodb_type, $collection_name, $infodb_directory_path) = @_;

    if ($infodb_type eq "sqlite") {
        return get_infodb_file_path_sqlite($collection_name, $infodb_directory_path);
    }

    if ($infodb_type eq "gdbm") {
        return get_infodb_file_path_gdbm($collection_name, $infodb_directory_path);
    }

    # Any other (or empty) type is treated as the gzipped text form
    return get_infodb_file_path_gdbm_txtgz($collection_name, $infodb_directory_path);
}
92
93
94
95
# Read an info database file into a hash.
#
#   $infodb_type      - "sqlite", "gdbm", or anything else for the
#                       gzipped-text fallback
#   $infodb_file_path - path of the database file to read
#   $infodb_map       - hash reference handed on to the reader
#
# Returns whatever the type-specific reader returns.
sub read_infodb_file
{
    my ($infodb_type, $infodb_file_path, $infodb_map) = @_;

    my %reader_for = (
        "sqlite" => \&read_infodb_file_sqlite,
        "gdbm"   => \&read_infodb_file_gdbm,
    );

    # Any other (or empty) type is treated as the gzipped text form
    my $reader = $reader_for{$infodb_type} || \&read_infodb_file_gdbm_txtgz;

    return $reader->($infodb_file_path, $infodb_map);
}
115
116
# Write one entry to an open info database handle.
#
#   $infodb_type   - "sqlite", "gdbm", or anything else for the
#                    gzipped-text fallback
#   $infodb_handle - handle obtained from open_infodb_write_handle
#   $infodb_key    - key of the entry
#   $infodb_map    - hash reference handed on to the writer
#
# Returns whatever the type-specific writer returns.
sub write_infodb_entry
{
    my ($infodb_type, $infodb_handle, $infodb_key, $infodb_map) = @_;

    my %writer_for = (
        "sqlite" => \&write_infodb_entry_sqlite,
        "gdbm"   => \&write_infodb_entry_gdbm,
    );

    # Any other (or empty) type is treated as the gzipped text form
    my $writer = $writer_for{$infodb_type} || \&write_infodb_entry_gdbm_txtgz;

    return $writer->($infodb_handle, $infodb_key, $infodb_map);
}
136
137
138
139# -----------------------------------------------------------------------------
140# GDBM TXT-GZ IMPLEMENTATION
141# -----------------------------------------------------------------------------
142
# Open a pipe to gzip that writes the compressed text form of the info
# database to $infodb_file_path.
#
# The database is kept in a GDBM-neutral form (a compressed text file) so
# that txt2db can be run on it later, i.e. by the runtime system the first
# time the collection is accessed. This makes it easier to distribute
# pre-built collections to different architectures.
#
# NB: even if two architectures are both little endian (e.g. Intel and ARM
# processors), GDBM does *not* guarantee that a database generated on one
# will work on the other.
#
# Returns the pipe handle, or undef if the pipe could not be opened.
sub open_infodb_write_handle_gdbm_txtgz
{
    my ($infodb_file_path) = @_;

    # Greenstone ships with gzip for Windows, on $PATH
    my $gzip_pipe_command = "| gzip - > \"$infodb_file_path\"";

    my $infodb_file_handle;
    open($infodb_file_handle, $gzip_pipe_command) or return undef;

    return $infodb_file_handle;
}
166
# Close the gzip pipe opened by open_infodb_write_handle_gdbm_txtgz.
# Returns the result of close().
sub close_infodb_write_handle_gdbm_txtgz
{
    my ($infodb_handle) = @_;

    return close($infodb_handle);
}
173
174
# Build the path of the gzipped-text info database: the directory-separator
# tail of the collection name with ".txt.gz" appended, inside
# $infodb_directory_path.
sub get_infodb_file_path_gdbm_txtgz
{
    my ($collection_name, $infodb_directory_path) = @_;

    my $collection_tail = &util::get_dirsep_tail($collection_name);

    return &util::filename_cat($infodb_directory_path, $collection_tail . ".txt.gz");
}
183
184
# Read a gzipped-text info database into a hash.
#
#   $infodb_file_path - path of the .txt.gz file to read
#   $infodb_map       - hash reference; receives one entry per record,
#                       mapping the record key to the raw text of the lines
#                       between the "[key]" line and the separator line
#
# The file itself is left untouched. Dies if the pipe from gzip cannot be
# opened; warns if gzip exits unsuccessfully. Returns the result of closing
# the pipe (true on success).
sub read_infodb_file_gdbm_txtgz
{
    my $infodb_file_path = shift(@_);
    my $infodb_map = shift(@_);

    # --stdout is essential: without it gzip decompresses the file in place
    # (removing the .gz) and writes nothing down the pipe
    my $cmd = "gzip --decompress --stdout \"$infodb_file_path\"";

    # Lexical handle rather than a shared bareword, so this reader cannot
    # interfere with any other pipe that is open at the same time
    open(my $pipe_in, '-|', $cmd)
        or die "Error: Couldn't open pipe from gzip: $!\n $cmd\n";

    my $infodb_key = "";
    my $infodb_value = "";
    while (defined(my $infodb_line = <$pipe_in>))
    {
        if ($infodb_line =~ /^\[([^\]]+)\]$/)
        {
            # "[key]" line: start of a new record
            $infodb_key = $1;
        }
        elsif ($infodb_line =~ /^-{70}$/)
        {
            # Separator line: the record is complete, so store it
            $infodb_map->{$infodb_key} = $infodb_value;
            $infodb_key = "";
            $infodb_value = "";
        }
        else
        {
            $infodb_value .= $infodb_line;
        }
    }

    # close() on a pipe reports the exit status of the child process
    my $closed_ok = close($pipe_in);
    if (!$closed_ok)
    {
        warn "Warning: gzip did not exit cleanly (status $?) while reading \"$infodb_file_path\"\n";
    }

    return $closed_ok;
}
218
219
# Write one record in the text form understood by txt2db:
#
#   [key]
#   <name>value        (one line per value of each metadata name)
#   ----...----        (a separator line of 70 hyphens)
#
#   $infodb_handle - handle to print to
#   $infodb_key    - key of the record
#   $infodb_map    - hash reference mapping each metadata name to an array
#                    reference of values; the values are not modified
#
# Returns the result of the final print.
sub write_infodb_entry_gdbm_txtgz
{
    my $infodb_handle = shift(@_);
    my $infodb_key = shift(@_);
    my $infodb_map = shift(@_);

    print $infodb_handle "[$infodb_key]\n";
    foreach my $infodb_value_key (keys(%$infodb_map))
    {
        foreach my $infodb_value (@{$infodb_map->{$infodb_value_key}})
        {
            # The loop variable aliases the caller's array element, so do
            # the escaping on a copy to leave the caller's data untouched
            my $output_value = $infodb_value;
            if ($output_value =~ /-{70,}/)
            {
                # if value contains 70 or more hyphens in a row we need to escape them
                # to prevent txt2db from treating them as a separator
                $output_value =~ s/-/&\#045;/g;
            }
            print $infodb_handle "<$infodb_value_key>" . $output_value . "\n";
        }
    }

    my $printed_ok = print {$infodb_handle} '-' x 70, "\n";
    return $printed_ok;
}
243
244
245
246# -----------------------------------------------------------------------------
247# GDBM IMPLEMENTATION
248# -----------------------------------------------------------------------------
249
# Open a pipe to the txt2db executable (looked up under
# $GSDLHOME/bin/$GSDLOS) that writes the database at $infodb_file_path.
#
# Returns the pipe handle, or undef if the executable does not exist or the
# pipe could not be opened.
sub open_infodb_write_handle_gdbm
{
    my ($infodb_file_path) = @_;

    my $txt2db_exe = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "txt2db" . &util::get_os_exe());
    return undef unless (-e $txt2db_exe);

    my $infodb_file_handle;
    open($infodb_file_handle, "| \"$txt2db_exe\" \"$infodb_file_path\"") or return undef;

    return $infodb_file_handle;
}
263
# Close the txt2db pipe opened by open_infodb_write_handle_gdbm.
# Returns the result of close().
sub close_infodb_write_handle_gdbm
{
    my ($infodb_handle) = @_;

    return close($infodb_handle);
}
270
271
# Build the path of the GDBM info database: the directory-separator tail of
# the collection name plus ".ldb" when util::is_little_endian() is true and
# ".bdb" otherwise, inside $infodb_directory_path.
sub get_infodb_file_path_gdbm
{
    my ($collection_name, $infodb_directory_path) = @_;

    my $infodb_file_extension;
    if (&util::is_little_endian()) {
        $infodb_file_extension = ".ldb";
    }
    else {
        $infodb_file_extension = ".bdb";
    }

    my $collection_tail = &util::get_dirsep_tail($collection_name);

    return &util::filename_cat($infodb_directory_path, $collection_tail . $infodb_file_extension);
}
281
282
283
284
# Read a GDBM info database into a hash by piping it through db2txt.
#
#   $infodb_file_path - path of the database file to read
#   $infodb_map       - hash reference; receives one entry per record,
#                       mapping the record key to the raw text of the lines
#                       between the "[key]" line and the separator line
#
# db2txt is located via the shell's search path. Dies if the pipe cannot be
# opened; warns if db2txt exits unsuccessfully (for example because it was
# not found). Returns the result of closing the pipe (true on success).
sub read_infodb_file_gdbm
{
    my $infodb_file_path = shift(@_);
    my $infodb_map = shift(@_);

    # Lexical handle rather than a shared bareword, so this reader cannot
    # interfere with any other pipe that is open at the same time
    open(my $pipe_in, '-|', "db2txt \"$infodb_file_path\"")
        or die "couldn't open pipe from db2txt\n";

    my $infodb_key = "";
    my $infodb_value = "";
    while (defined(my $infodb_line = <$pipe_in>))
    {
        if ($infodb_line =~ /^\[([^\]]+)\]$/)
        {
            # "[key]" line: start of a new record
            $infodb_key = $1;
        }
        elsif ($infodb_line =~ /^-{70}$/)
        {
            # Separator line: the record is complete, so store it
            $infodb_map->{$infodb_key} = $infodb_value;
            $infodb_key = "";
            $infodb_value = "";
        }
        else
        {
            $infodb_value .= $infodb_line;
        }
    }

    # close() on a pipe reports the exit status of the child process; without
    # this check a missing db2txt looks exactly like an empty database
    my $closed_ok = close($pipe_in);
    if (!$closed_ok)
    {
        warn "Warning: db2txt did not exit cleanly (status $?) while reading \"$infodb_file_path\"\n";
    }

    return $closed_ok;
}
314
315
# Write one record to a txt2db pipe. Once the handle has been set up the
# record format is the same as for the gzipped-text form, so this simply
# delegates to that writer and returns its result.
sub write_infodb_entry_gdbm
{
    my ($infodb_handle, $infodb_key, $infodb_map) = @_;

    return write_infodb_entry_gdbm_txtgz($infodb_handle, $infodb_key, $infodb_map);
}
321
322
323
324# -----------------------------------------------------------------------------
325# SQLITE IMPLEMENTATION
326# -----------------------------------------------------------------------------
327
# Open a pipe to the sqlite3 executable (looked up under
# $GSDLHOME/bin/$GSDLOS) for the database at $infodb_file_path, create the
# tables and the docOID index if they do not exist yet, and begin a
# transaction.
#
# Returns the pipe handle, or undef if the executable does not exist or the
# pipe could not be opened.
sub open_infodb_write_handle_sqlite
{
    my ($infodb_file_path) = @_;

    my $sqlite3_exe = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "sqlite3" . &util::get_os_exe());
    return undef unless (-e $sqlite3_exe);

    my $infodb_handle;
    open($infodb_handle, "| \"$sqlite3_exe\" \"$infodb_file_path\"") or return undef;

    my @setup_statements = (
        "CREATE TABLE IF NOT EXISTS data (key TEXT PRIMARY KEY, value TEXT);\n",
        "CREATE TABLE IF NOT EXISTS document_metadata (id INTEGER PRIMARY KEY, docOID TEXT, element TEXT, value TEXT);\n",

        # This index is crucial for efficiency when importing large amounts of data
        "CREATE INDEX IF NOT EXISTS dmd ON document_metadata(docOID);\n",

        # Very important for efficiency: otherwise each command would be
        # actioned one at a time
        "BEGIN TRANSACTION;\n",
    );
    foreach my $statement (@setup_statements)
    {
        print {$infodb_handle} $statement;
    }

    return $infodb_handle;
}
350
351
352
# Finish writing to an sqlite3 pipe: end the transaction begun when the
# handle was opened, create the element index, and close the handle.
# Returns the result of close().
sub close_infodb_write_handle_sqlite
{
    my ($infodb_handle) = @_;

    my @closing_statements = (
        # Close the transaction that was begun after opening the file
        "END TRANSACTION;\n",

        # This index is crucial for efficient queries on the database!
        "CREATE INDEX IF NOT EXISTS dme ON document_metadata(element);\n",
    );
    foreach my $statement (@closing_statements)
    {
        print {$infodb_handle} $statement;
    }

    return close($infodb_handle);
}
365
366
# Build the path of the SQLite info database: the directory-separator tail
# of the collection name with ".db" appended, inside $infodb_directory_path.
sub get_infodb_file_path_sqlite
{
    my ($collection_name, $infodb_directory_path) = @_;

    my $collection_tail = &util::get_dirsep_tail($collection_name);

    return &util::filename_cat($infodb_directory_path, $collection_tail . ".db");
}
376
377
378
# Placeholder for reading an SQLite info database into a hash.
#
#   $infodb_file_path - path of the database file (currently unused)
#   $infodb_map       - hash reference meant to receive the records
#
# !! TO IMPLEMENT: nothing is read and the map is left unchanged.
sub read_infodb_file_sqlite
{
    my ($infodb_file_path, $infodb_map) = @_;

    # Hand the map reference back untouched, which is what callers of this
    # stub receive today
    return $infodb_map;
}
386
387
# Write one record to an sqlite3 pipe as SQL statements.
#
#   $infodb_handle - handle to print to
#   $infodb_key    - key of the record
#   $infodb_map    - hash reference mapping each metadata name to an array
#                    reference of values
#
# The whole record goes into the "data" table as "<name>value" lines. If the
# key contains no "." and the record has doctype "doc", its metadata is also
# (re)written to the "document_metadata" table, minus the automatically
# added names listed below.
sub write_infodb_entry_sqlite
{
    my ($infodb_handle, $infodb_key, $infodb_map) = @_;

    my @metadata_names = keys(%$infodb_map);

    # Flatten the record into the text stored in the "data" table
    my $entry_text = "";
    foreach my $metadata_name (@metadata_names)
    {
        foreach my $metadata_value (@{$infodb_map->{$metadata_name}})
        {
            $entry_text .= "<$metadata_name>" . $metadata_value . "\n";
        }
    }

    my $quoted_key = &sqlite_safe($infodb_key);
    print $infodb_handle "INSERT OR REPLACE INTO data (key, value) VALUES ('" . $quoted_key . "', '" . &sqlite_safe($entry_text) . "');\n";

    # Document records also feed the "document_metadata" table (for use by
    # the dynamic classifiers)
    if ($infodb_key !~ /\./ && $entry_text =~ /\<doctype\>doc\n/)
    {
        print $infodb_handle "DELETE FROM document_metadata WHERE docOID='" . $quoted_key . "';\n";

        # Most of the automatically added document metadata is of no interest
        my %is_automatic = map { $_ => 1 } (
            "archivedir", "assocfilepath", "childtype", "contains", "docnum",
            "doctype", "Encoding", "FileSize", "hascover", "hastxt",
            "lastmodified", "metadataset", "thistype",
        );

        foreach my $metadata_name (@metadata_names)
        {
            next if ($is_automatic{$metadata_name});
            next if ($metadata_name =~ /^metadatafreq\-/ || $metadata_name =~ /^metadatalist\-/);

            foreach my $metadata_value (@{$infodb_map->{$metadata_name}})
            {
                print $infodb_handle "INSERT INTO document_metadata (docOID, element, value) VALUES ('" . $quoted_key . "', '" . &sqlite_safe($metadata_name) . "', '" . &sqlite_safe($metadata_value) . "');\n";
            }
        }
    }
}
439
440
# Make a value safe to embed in a single-quoted SQL string literal by
# doubling every single quote it contains. Returns the escaped copy.
sub sqlite_safe
{
    my ($value) = @_;

    (my $escaped_value = $value) =~ s/\'/\'\'/g;

    return $escaped_value;
}
450
451
4521;
Note: See TracBrowser for help on using the repository browser.