source: gsdl/trunk/perllib/dbutil.pm@ 17104

Last change on this file since 17104 was 17104, checked in by mdewsnip, 16 years ago

Arrrgghhh, someone uglied up my nice tidy code...

File size: 12.4 KB
Line 
1###########################################################################
2#
3# dbutil.pm -- utility functions for writing to different databases
4# Copyright (C) 2008 DL Consulting Ltd
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package dbutil;
27
28use strict;
29
30
# Opens a handle for writing an info database of the requested type.
#
#   $infodb_type      - "sqlite" or "gdbm"; any other value (including an
#                       empty one) selects the gzipped-text GDBM form
#   $infodb_file_path - path of the database file to be written
#
# Returns whatever the type-specific opener returns.
sub open_infodb_write_handle
{
    my ($infodb_type, $infodb_file_path) = @_;

    my %opener_for = (
        "sqlite" => \&open_infodb_write_handle_sqlite,
        "gdbm"   => \&open_infodb_write_handle_gdbm,
    );

    # Unrecognised types fall through to the text (gzipped) version, which
    # is ready for conversion to GDBM
    my $opener = $opener_for{$infodb_type} || \&open_infodb_write_handle_gdbm_txtgz;
    return $opener->($infodb_file_path);
}
49
50
# Closes a write handle for an info database of the given type.
#
#   $infodb_type   - "sqlite" or "gdbm"; any other value (including an
#                    empty one) selects the gzipped-text GDBM form
#   $infodb_handle - the handle to close
#
# Returns the result of the type-specific close routine.
sub close_infodb_write_handle
{
    my $infodb_type = shift(@_);
    my $infodb_handle = shift(@_);

    if ($infodb_type eq "sqlite")
    {
        return &close_infodb_write_handle_sqlite($infodb_handle);
    }
    elsif ($infodb_type eq "gdbm")
    {
        return &close_infodb_write_handle_gdbm($infodb_handle);
    }

    # Use text (gzipped) version ready for conversion to GDBM
    # if the infodb type is empty or not one of the values above.
    # The return is spelled out so this branch matches the two above
    # instead of relying on Perl's implicit last-expression value.
    return &close_infodb_write_handle_gdbm_txtgz($infodb_handle);
}
69
70
# Returns the name of the info database type used when none is specified.
sub get_default_infodb_type
{
    my $default_infodb_type = "gdbm-txtgz";
    return $default_infodb_type;
}
75
76
# Works out the path of the info database file for a collection.
#
#   $infodb_type           - "sqlite" or "gdbm"; any other value (including
#                            an empty one) selects the gzipped-text GDBM form
#   $collection_name       - name of the collection
#   $infodb_directory_path - directory that holds the database file
#
# Returns whatever the type-specific path builder returns.
sub get_infodb_file_path
{
    my ($infodb_type, $collection_name, $infodb_directory_path) = @_;

    my %path_builder_for = (
        "sqlite" => \&get_infodb_file_path_sqlite,
        "gdbm"   => \&get_infodb_file_path_gdbm,
    );

    # Unrecognised types fall through to the text (gzipped) version, which
    # is ready for conversion to GDBM
    my $path_builder = $path_builder_for{$infodb_type} || \&get_infodb_file_path_gdbm_txtgz;
    return $path_builder->($collection_name, $infodb_directory_path);
}
96
97
# Reads an info database file of the given type into a hash.
#
#   $infodb_type      - "sqlite" or "gdbm"; any other value (including an
#                       empty one) selects the gzipped-text GDBM form
#   $infodb_file_path - path of the database file to read
#   $infodb_map       - hash reference handed on to the type-specific reader
#
# Returns whatever the type-specific reader returns.
sub read_infodb_file
{
    my ($infodb_type, $infodb_file_path, $infodb_map) = @_;

    my %reader_for = (
        "sqlite" => \&read_infodb_file_sqlite,
        "gdbm"   => \&read_infodb_file_gdbm,
    );

    # Unrecognised types fall through to the text (gzipped) version, which
    # is ready for conversion to GDBM
    my $reader = $reader_for{$infodb_type} || \&read_infodb_file_gdbm_txtgz;
    return $reader->($infodb_file_path, $infodb_map);
}
117
118
# Writes one entry to an open info database handle of the given type.
#
#   $infodb_type   - "sqlite" or "gdbm"; any other value (including an
#                    empty one) selects the gzipped-text GDBM form
#   $infodb_handle - handle the entry is written to
#   $infodb_key    - key of the entry
#   $infodb_map    - hash reference holding the entry's values
#
# Returns whatever the type-specific writer returns.
sub write_infodb_entry
{
    my ($infodb_type, $infodb_handle, $infodb_key, $infodb_map) = @_;

    my %writer_for = (
        "sqlite" => \&write_infodb_entry_sqlite,
        "gdbm"   => \&write_infodb_entry_gdbm,
    );

    # Unrecognised types fall through to the text (gzipped) version, which
    # is ready for conversion to GDBM
    my $writer = $writer_for{$infodb_type} || \&write_infodb_entry_gdbm_txtgz;
    return $writer->($infodb_handle, $infodb_key, $infodb_map);
}
139
140
141
142# -----------------------------------------------------------------------------
143# GDBM TXT-GZ IMPLEMENTATION
144# -----------------------------------------------------------------------------
145
# Opens a pipe to gzip that writes compressed text to $infodb_file_path.
#
# The infodb is kept in a GDBM-neutral form: the data is saved as a
# compressed text file, ready for txt2db to be run on it later (i.e. by the
# runtime system, the first time the collection is ever accessed). This
# makes it easier to distribute pre-built collections to various
# architectures.
#
# NB: even if two architectures are both little endian (e.g. Intel and ARM
# processors) GDBM does *not* guarantee that a database generated on one
# will work on the other.
#
# Returns the pipe handle, or undef if the pipe could not be opened.
sub open_infodb_write_handle_gdbm_txtgz
{
    my ($infodb_file_path) = @_;

    # Greenstone ships with gzip for windows, on $PATH
    my $gzip_pipe_command = "| gzip - > \"$infodb_file_path\"";

    my $gzip_pipe_handle = undef;
    open($gzip_pipe_handle, $gzip_pipe_command) or return undef;

    return $gzip_pipe_handle;
}
169
170
# Closes the given write handle and returns the result of close().
sub close_infodb_write_handle_gdbm_txtgz
{
    my ($infodb_handle) = @_;

    return close($infodb_handle);
}
177
178
# Builds the path of the gzipped-text info database for a collection:
# the tail of the collection name plus ".txt.gz", placed in
# $infodb_directory_path.
sub get_infodb_file_path_gdbm_txtgz
{
    my ($collection_name, $infodb_directory_path) = @_;

    my $collection_tail = &util::get_dirsep_tail($collection_name);
    return &util::filename_cat($infodb_directory_path, $collection_tail . ".txt.gz");
}
187
188
# Reads a gzipped-text info database into a hash.
#
#   $infodb_file_path - path of the .txt.gz file to read
#   $infodb_map       - hash reference that receives one entry per record:
#                       the text inside the "[...]" header line is the key,
#                       and the lines up to the 70-hyphen separator line
#                       (newlines included) are the value
#
# Dies if the pipe from gzip cannot be opened.
# Returns the result of closing the pipe.
sub read_infodb_file_gdbm_txtgz
{
    my $infodb_file_path = shift(@_);
    my $infodb_map = shift(@_);

    # --stdout is essential: without it gzip decompresses the file in place
    # (replacing the .gz on disk) and sends nothing down the pipe, so no
    # records would ever be read
    my $cmd = "gzip --decompress --stdout \"$infodb_file_path\"";

    # A lexical handle keeps this reader from clobbering any other open pipe
    open (my $pipe_in, "$cmd |")
        || die "Error: Couldn't open pipe from gzip: $!\n $cmd\n";

    my $infodb_line = "";
    my $infodb_key = "";
    my $infodb_value = "";
    while (defined ($infodb_line = <$pipe_in>))
    {
        if ($infodb_line =~ /^\[([^\]]+)\]$/)
        {
            # Record header: remember the key until the separator arrives
            $infodb_key = $1;
        }
        elsif ($infodb_line =~ /^-{70}$/)
        {
            # Record separator: store what has been gathered and reset
            $infodb_map->{$infodb_key} = $infodb_value;
            $infodb_key = "";
            $infodb_value = "";
        }
        else
        {
            $infodb_value .= $infodb_line;
        }
    }

    close ($pipe_in);
}
222
223
# Writes one record in the text form consumed by txt2db.
#
#   $infodb_handle - handle the record is printed to
#   $infodb_key    - record key, written as a "[key]" header line
#   $infodb_map    - hash reference mapping each value key to an array
#                    reference of values; every value is written on its own
#                    "<value key>value" line
#
# The record is terminated by a line of 70 hyphens. The caller's data is
# never modified.
sub write_infodb_entry_gdbm_txtgz
{
    my $infodb_handle = shift(@_);
    my $infodb_key = shift(@_);
    my $infodb_map = shift(@_);

    print $infodb_handle "[$infodb_key]\n";
    foreach my $infodb_value_key (keys(%$infodb_map))
    {
        foreach my $infodb_value (@{$infodb_map->{$infodb_value_key}})
        {
            # The loop variable aliases the caller's array element, so the
            # escaping below is done on a copy
            my $output_value = $infodb_value;
            if ($output_value =~ /-{70,}/)
            {
                # if value contains 70 or more hyphens in a row we need to escape them
                # to prevent txt2db from treating them as a separator
                $output_value =~ s/-/&\#045;/g;
            }
            print $infodb_handle "<$infodb_value_key>" . $output_value . "\n";
        }
    }
    print $infodb_handle '-' x 70, "\n";
}
247
248
249
250# -----------------------------------------------------------------------------
251# GDBM IMPLEMENTATION
252# -----------------------------------------------------------------------------
253
# Opens a pipe to the txt2db executable found under
# $GSDLHOME/bin/$GSDLOS, passing it $infodb_file_path as its argument.
#
# Returns the pipe handle, or undef if the executable does not exist or
# the pipe could not be opened.
sub open_infodb_write_handle_gdbm
{
    my ($infodb_file_path) = @_;

    my $txt2db_exe_name = "txt2db" . &util::get_os_exe();
    my $txt2db_exe = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, $txt2db_exe_name);

    # Nothing to pipe into if the converter is missing
    return undef unless (-e $txt2db_exe);

    my $txt2db_pipe_handle = undef;
    open($txt2db_pipe_handle, "| \"$txt2db_exe\" \"$infodb_file_path\"") or return undef;

    return $txt2db_pipe_handle;
}
267
# Closes the given write handle and returns the result of close().
sub close_infodb_write_handle_gdbm
{
    my ($infodb_handle) = @_;

    return close($infodb_handle);
}
274
275
# Builds the path of the GDBM info database for a collection: the tail of
# the collection name plus ".ldb" when util::is_little_endian() is true,
# or ".bdb" otherwise, placed in $infodb_directory_path.
sub get_infodb_file_path_gdbm
{
    my ($collection_name, $infodb_directory_path) = @_;

    my $infodb_file_extension = ".bdb";
    if (&util::is_little_endian())
    {
        $infodb_file_extension = ".ldb";
    }

    my $collection_tail = &util::get_dirsep_tail($collection_name);
    return &util::filename_cat($infodb_directory_path, $collection_tail . $infodb_file_extension);
}
285
286
# Reads a GDBM info database into a hash by piping it through db2txt.
#
#   $infodb_file_path - path of the database file handed to db2txt
#   $infodb_map       - hash reference that receives one entry per record:
#                       the text inside the "[...]" header line is the key,
#                       and the lines up to the 70-hyphen separator line
#                       (newlines included) are the value
#
# Dies if the pipe from db2txt cannot be opened.
# Returns the result of closing the pipe.
sub read_infodb_file_gdbm
{
    my $infodb_file_path = shift(@_);
    my $infodb_map = shift(@_);

    # A lexical handle keeps this reader from clobbering any other open pipe
    open (my $pipe_in, "db2txt \"$infodb_file_path\" |") || die "couldn't open pipe from db2txt\n";
    my $infodb_line = "";
    my $infodb_key = "";
    my $infodb_value = "";
    while (defined ($infodb_line = <$pipe_in>))
    {
        if ($infodb_line =~ /^\[([^\]]+)\]$/)
        {
            # Record header: remember the key until the separator arrives
            $infodb_key = $1;
        }
        elsif ($infodb_line =~ /^-{70}$/)
        {
            # Record separator: store what has been gathered and reset
            $infodb_map->{$infodb_key} = $infodb_value;
            $infodb_key = "";
            $infodb_value = "";
        }
        else
        {
            $infodb_value .= $infodb_line;
        }
    }

    close ($pipe_in);
}
316
317
# Writes one record to a GDBM write handle.
#
# The handle is a pipe that takes the same text form as the gzipped-text
# variant, so the arguments (handle, key, map) are handed straight on to
# write_infodb_entry_gdbm_txtgz.
sub write_infodb_entry_gdbm
{
    my @entry_arguments = @_;

    return write_infodb_entry_gdbm_txtgz(@entry_arguments);
}
323
324
325
326# -----------------------------------------------------------------------------
327# SQLITE IMPLEMENTATION
328# -----------------------------------------------------------------------------
329
# Opens a pipe to the sqlite3 executable found under
# $GSDLHOME/bin/$GSDLOS for the database at $infodb_file_path, then sends
# the statements that create the tables and index and begin a transaction.
#
# Returns the pipe handle, or undef if the executable does not exist or
# the pipe could not be opened.
sub open_infodb_write_handle_sqlite
{
    my ($infodb_file_path) = @_;

    my $sqlite3_exe_name = "sqlite3" . &util::get_os_exe();
    my $sqlite3_exe = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, $sqlite3_exe_name);

    # Nothing to pipe into if the sqlite3 program is missing
    return undef unless (-e $sqlite3_exe);

    my $infodb_handle = undef;
    open($infodb_handle, "| \"$sqlite3_exe\" \"$infodb_file_path\"") or return undef;

    my @setup_statements = (
        "CREATE TABLE IF NOT EXISTS data (key TEXT PRIMARY KEY, value TEXT);\n",
        "CREATE TABLE IF NOT EXISTS document_metadata (id INTEGER PRIMARY KEY, docOID TEXT, element TEXT, value TEXT);\n",

        # This index is crucial for efficiency when importing large amounts of data
        "CREATE INDEX IF NOT EXISTS dmd ON document_metadata(docOID);\n",

        # Very important for efficiency: without a transaction each command
        # would be actioned one at a time
        "BEGIN TRANSACTION;\n",
    );
    foreach my $setup_statement (@setup_statements)
    {
        print $infodb_handle $setup_statement;
    }

    return $infodb_handle;
}
352
353
# Finishes a SQLite write session: ends the transaction, creates the
# index on document_metadata(element), then closes the handle.
#
# Returns the result of close().
sub close_infodb_write_handle_sqlite
{
    my ($infodb_handle) = @_;

    my @closing_statements = (
        # Ends the transaction begun after the file was opened
        "END TRANSACTION;\n",

        # This index is crucial for efficient queries on the database!
        "CREATE INDEX IF NOT EXISTS dme ON document_metadata(element);\n",
    );
    foreach my $closing_statement (@closing_statements)
    {
        print $infodb_handle $closing_statement;
    }

    return close($infodb_handle);
}
366
367
# Builds the path of the SQLite info database for a collection: the tail
# of the collection name plus ".db", placed in $infodb_directory_path.
sub get_infodb_file_path_sqlite
{
    my ($collection_name, $infodb_directory_path) = @_;

    my $collection_tail = &util::get_dirsep_tail($collection_name);
    return &util::filename_cat($infodb_directory_path, $collection_tail . ".db");
}
377
378
# Placeholder for reading a SQLite info database into $infodb_map.
#
# Nothing is read yet and the map is left untouched. As it stands, the
# value handed back is the map reference that was passed in.
sub read_infodb_file_sqlite
{
    my ($infodb_file_path, $infodb_map) = @_;

    # !! TO IMPLEMENT

    return $infodb_map;
}
386
387
# Writes one entry to a SQLite write handle as SQL statements.
#
#   $infodb_handle - handle the SQL is printed to
#   $infodb_key    - key of the entry
#   $infodb_map    - hash reference mapping each element name to an array
#                    reference of values
#
# The whole entry is always stored in the "data" table as a single text
# value made of "<element>value" lines. When the key contains no "." and
# the entry text contains "<doctype>doc", the entry's metadata is also
# stored row by row in the "document_metadata" table (for use by the
# dynamic classifiers), replacing any rows already held for that key.
sub write_infodb_entry_sqlite
{
    my ($infodb_handle, $infodb_key, $infodb_map) = @_;

    # Flatten the map into the text stored in the "data" table
    my $infodb_entry_value = "";
    foreach my $element (keys(%$infodb_map))
    {
        foreach my $element_value (@{$infodb_map->{$element}})
        {
            $infodb_entry_value .= "<$element>" . $element_value . "\n";
        }
    }

    my $safe_infodb_key = &sqlite_safe($infodb_key);
    my $safe_infodb_entry_value = &sqlite_safe($infodb_entry_value);
    print $infodb_handle "INSERT OR REPLACE INTO data (key, value) VALUES ('$safe_infodb_key', '$safe_infodb_entry_value');\n";

    # Only entries for documents contribute rows to "document_metadata"
    if ($infodb_key !~ /\./ && $infodb_entry_value =~ /\<doctype\>doc\n/)
    {
        print $infodb_handle "DELETE FROM document_metadata WHERE docOID='$safe_infodb_key';\n";

        # Most of the automatically added document metadata is of no interest
        my %is_automatic_element = map { $_ => 1 } (
            "archivedir",
            "assocfilepath",
            "childtype",
            "contains",
            "docnum",
            "doctype",
            "Encoding",
            "FileSize",
            "hascover",
            "hastxt",
            "lastmodified",
            "metadataset",
            "thistype",
        );

        foreach my $element (keys(%$infodb_map))
        {
            next if ($is_automatic_element{$element});
            next if ($element =~ /^metadatafreq\-/);
            next if ($element =~ /^metadatalist\-/);

            foreach my $element_value (@{$infodb_map->{$element}})
            {
                my $safe_element = &sqlite_safe($element);
                my $safe_element_value = &sqlite_safe($element_value);
                print $infodb_handle "INSERT INTO document_metadata (docOID, element, value) VALUES ('$safe_infodb_key', '$safe_element', '$safe_element_value');\n";
            }
        }
    }
}
439
440
# Returns a copy of the given value with every single quote doubled, so it
# can be placed inside a single-quoted SQL string literal.
sub sqlite_safe
{
    my ($value) = @_;

    # '' is how a literal single quote is written inside an SQL string
    $value =~ s/\'/\'\'/g;

    return $value;
}
450
451
4521;
Note: See TracBrowser for help on using the repository browser.