root/main/trunk/greenstone2/perllib/gssql.pm @ 32555

Revision 32555, 20.1 KB (checked in by ak19, 21 months ago)

1. In GreenstoneSQLPlugout, removeold is now parameterised (as are keepold, incremental, incremental_mode). 2. Deletion on incremental_build works. But there are more questions. Why are there 4 passes? What to do on reindexing and when to do it (should it happen during GS SQL plugout or plugin)?

Line 
1###########################################################################
2#
3# gssql.pm -- DBI for SQL related utility functions used by
4# GreenstoneSQLPlugout and hereafter by GreenstoneSQLPlugin too.
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package gssql;
28
29use strict;
30no strict 'refs';
31no strict 'subs';
32
33use DBI; # the central package for this module used by GreenstoneSQL Plugout and Plugin
34
35# Need params_map keys:
36# - collection_name
37# - db_encoding (db content encoding) - MySQL can set this at server, db, table levels. For MySQL
38# we set the enc during connect at server level. Not sure whether other DB's support it at the
39# same levels.
40
41# For connection to MySQL, need:
42#  - db_driver, db_client_user, db_client_pwd, db_host, (db_port not used at present)
43# So these will be parameterised, but in a hashmap, for just the connect method.
44
45# Parameterise (one or more methods may use them):
46# - build_mode (like removeold)
47# - db_name (which is the GS3 sitename)
48
49# TODO: add infrastructure for db_port, AutoCommit etc
50# For port, see https://stackoverflow.com/questions/2248665/perl-script-to-connect-to-mysql-server-port-3307
51
# Constructor.
# Takes a hashref of parameters and copies across only the ones this class
# itself needs ('collection_name' and 'db_encoding'); any other keys in the
# hashref are ignored. Also derives 'tablename_prefix' from the collection
# name, with every hyphen replaced by an underscore, because (My)SQL doesn't
# like hyphens in table names.
# Returns the new object, blessed into the class it was called on.
sub new
{
    my ($class, $params_map) = @_;

    # Work on a copy so the stored collection name keeps its hyphens.
    (my $prefix = $params_map->{'collection_name'}) =~ s/-/_/g;

    # A shallow copy of $params_map would also work (and would carry unknown
    # params along), but we stay explicit about what the class holds.
    my $self = bless({}, $class);
    $self->{'collection_name'}  = $params_map->{'collection_name'};
    $self->{'db_encoding'}      = $params_map->{'db_encoding'};
    $self->{'tablename_prefix'} = $prefix;

    return $self;
}
79
80
81#################################
82
83# Database access related functions
84# http://g2pc1.bu.edu/~qzpeng/manual/MySQL%20Commands.htm
85# https://www.guru99.com/insert-into.html
86
87# TODO Q: What on cancelling a build: delete table? But what if it was a rebuild and the rebuild is cancelled (not the original build)?
88# Do we create a copy of the orig database as backup, then start populating current db, and if cancelled, delete current db and RENAME backup table to current?
89# https://stackoverflow.com/questions/3280006/duplicating-a-mysql-table-indexes-and-data
90# BUT what if the table is HUGE? (Think of a collection with millions of docs.) Huge overhead in copying?
91# The alternative is we just quit on cancel, but then: cancel could leave the table in a partial committed state, with no way of rolling back.
92# Unless they do a full rebuild, which will recreate the table from scratch?
93# SOLUTION-> rollback transaction on error, see https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
94# But then should set AutoCommit to off on connection, and remember to commit every time
95
96#################
97# Database functions that use the perl DBI module (with the DBD driver module for mysql)
98#################
99
100################### BASIC DB OPERATIONS ##################
101
102# THE NEW DB FUNCTIONS
103# NOTE: FULLTEXT is a reserved keyword in (My)SQL. So we can't name a table or any of its columns "fulltext".
104# https://dev.mysql.com/doc/refman/5.5/en/keywords.html
105
106# TODO: Consider AutoCommit status (and Autocommit off allowing commit or rollback for GS coll build cancel) later
107
108# TODO: where should the defaults for these params be, here or in GS-SQLPlugin/Plugout?
# Connects to the database server (not to a particular database) and, on
# success, stores the resulting DBI handle in $self->{'db_handle'}.
#
# $params_map is a hashref that may contain:
#   db_driver      - DBI driver name, defaults to "mysql"
#   db_client_user - user to connect as, defaults to "root"
#   db_client_pwd  - password; may be undef, in which case the connect
#                    attempt is left to report the problem
#   db_host        - defaults to "127.0.0.1"
#   db_port        - optional; appended to the connection string when set
#
# The character encoding comes from $self->{'db_encoding'} (default "utf8")
# and is applied with "set NAMES" straight after connecting.
#
# Returns 1 on success, 0 if the connection could not be made (the error
# code is then available via DBI->err). Failing to set the encoding only
# produces a warning; the connection is still kept.
sub connect_to_db {
    my $self= shift (@_);
    my ($params_map) = @_;
    my $db_enc = $self->{'db_encoding'} || "utf8";

    # these are the params for connecting to MySQL
    my $db_driver = $params_map->{'db_driver'} || "mysql";
    my $db_user = $params_map->{'db_client_user'} || "root";
    my $db_pwd = $params_map->{'db_client_pwd'}; # even if undef, we'll see a sensible error message
                                                 # when connect fails
    my $db_host = $params_map->{'db_host'} || "127.0.0.1";
    my $db_port = $params_map->{'db_port'}; # optional, driver default is used when not given

    # localhost doesn't work for us, but 127.0.0.1 works
    # https://metacpan.org/pod/DBD::mysql
    # "The hostname, if not specified or specified as '' or 'localhost', will default to a MySQL server
    # running on the local machine using the default for the UNIX socket. To connect to a MySQL server
    # on the local machine via TCP, you must specify the loopback IP address (127.0.0.1) as the host."
    # Deliberately no database in the connection string: connecting at server
    # level lets the caller check afterwards whether the database exists.
    my $connect_str = "dbi:$db_driver:host=$db_host";
    $connect_str .= ";port=$db_port" if $db_port;

    my $dbh = DBI->connect($connect_str, $db_user, $db_pwd,
                           {
                               ShowErrorStatement => 1, # more informative as DBI will append failed SQL stmt to error message
                               PrintError => 1, # on by default, but being explicit
                               RaiseError => 0, # off by default, but being explicit
                               AutoCommit => 1  # on by default, but being explicit
                           });

    if(!$dbh) {
        # NOTE, despite handle dbh being undefined, error code will be in DBI->err
        return 0;
    }

    # Set the encoding at db server level.
    # https://dev.mysql.com/doc/refman/5.7/en/charset.html
    # Not sure if this command is mysql specific.
    # The encoding name is a configured value, so let the driver quote it
    # rather than pasting it between hand-written quote characters.
    my $stmt = "set NAMES " . $dbh->quote($db_enc);
    $dbh->do($stmt) || warn("Unable to set charset encoding at db server level to: " . $db_enc . "\n");

    # if we're here, then connection succeeded, store handle
    $self->{'db_handle'} = $dbh;
    return 1;
}
152
153# will attempt to load the specified db and the <coll>_metadata and <coll>_fulltxt for this
154# collection, or create any of these (db, tables) that don't yet exist. At the end
155# it will have loaded the requested database (in MySQL: "use <db>;")
# Switches to database $db_name ("use <db>") and makes sure this collection's
# metadata and fulltxt tables exist in it.
#
#  - Database loads fine: when $build_mode is "removeold" the collection's
#    tables are dropped and recreated empty; otherwise each table is only
#    created if table_exists() says it is missing.
#  - Database is unknown (error code 1049, possibly mysql only): the database
#    and both tables are created from scratch.
#  - Any other failure to load the database: nothing is created.
#
# Returns a true value on success. Returns 0 as soon as any create step
# fails, and the (false) result of the failed "use" for errors other than 1049.
sub load_db_and_tables {
    my ($self, $db_name, $build_mode) = @_;
    my $dbh = $self->{'db_handle'};

    # perl DBI switch database: https://www.perlmonks.org/?node_id=995434
    # do() returns undef on error.
    my $loaded = $dbh->do("use $db_name");

    if ($loaded) {
        print STDERR "@@@ DATABASE $db_name EXISTED\n";

        # Only removeold gets special treatment: throw away this collection's
        # existing tables so that fresh empty ones are created below.
        my $rebuilding = ($build_mode eq "removeold");
        $self->delete_collection_tables() if $rebuilding;

        # Reuse whichever tables are already present, create the rest.
        if (!$rebuilding && $self->table_exists($self->get_metadata_table_name())) {
            print STDERR "@@@ Meta table exists\n";
        }
        else {
            $self->create_metadata_table() || return 0;
        }

        if (!$rebuilding && $self->table_exists($self->get_fulltext_table_name())) {
            print STDERR "@@@ Fulltxt table exists\n";
        }
        else {
            $self->create_fulltext_table() || return 0;
        }

        return $loaded;
    }

    # The database failed to load for a reason other than "Unknown database":
    # hand back the failed result untouched.
    return $loaded if ($dbh->err != 1049);

    # The database doesn't exist yet: create it, switch to it, add the tables.
    $self->create_db($db_name) || return 0;

    print STDERR "@@@ CREATED DATABASE $db_name\n";

    $dbh->do("use $db_name") || return 0;
    $self->create_metadata_table() || return 0;
    $self->create_fulltext_table() || return 0;

    return 1;
}
211
212# GreenstoneSQLPlugin calls this method to load an existing db.
213# This will terminate if the db does not exist. Unlike load_db_and_tables() above, used by
214# GreenstoneSQLPlugout, this method will not attempt to create the requested db (nor its tables)
215# TODO: GS SQLPlugin is called before GS SQLPlugout and attempts to use_db() - called in plugin's
216# init() method. This will fail if the db does not exist. Ideally want the gssqlplugin only called
217# during buildcol.pl
# Switches the connection to database $db_name ("use <db>"). Unlike
# load_db_and_tables(), this never tries to create the database or any tables.
#
# Returns the true result of do() on success. On failure a warning carrying
# the database error is issued and a false value (undef) is returned, so
# that the caller can detect the failure and decide whether to terminate.
sub use_db {
    my $self= shift (@_);
    my ($db_name) = @_;
    my $dbh = $self->{'db_handle'};

    # perl DBI switch database: https://www.perlmonks.org/?node_id=995434
    # do() returns undef on error.
    my $loaded = $dbh->do("use $db_name");

    # Don't write this as "return do(...) || warn(...)": warn() returns a
    # true value, which would make a failed load look like a success.
    if(!$loaded) {
        warn("Unable to use database $db_name: " . ($dbh->errstr || "unknown error") . "\n");
    }

    return $loaded;
}
228
229# disconnect from db - https://metacpan.org/pod/DBI#disconnect
230# TODO: make sure to have committed or rolled back before disconnect
231# and that you've called finish() on statement handles if any fetch remnants remain
# Disconnects the stored database handle.
# Returns whatever disconnect() returned; when that is false, the handle's
# error string is emitted as a warning first.
sub disconnect_from_db {
    my ($self) = @_;
    my $dbh = $self->{'db_handle'};

    # No explicit finish() on statement handles here. DBI docs: "When all the
    # data has been fetched from a SELECT statement, the driver will
    # automatically call finish for you. So you should not call it explicitly
    # except when you know that you've not fetched all the data from a
    # statement handle and the handle won't be destroyed soon."

    my $rc = $dbh->disconnect;

    # Possibly PrintError already prints a warning and this duplicates it?
    unless ($rc) {
        warn $dbh->errstr;
    }

    return $rc; # the handle is of little use after disconnecting
}
247
# Creates a database.
# The name is taken from the first argument; when no argument is given it
# falls back to $self->{'db_name'}.
# Returns the result of do(), which is undef on failure. Also warns and
# returns undef, without touching the server, when no database name is
# available from either source.
sub create_db {
    my $self= shift (@_);
    my ($db_name) = @_;
    my $dbh = $self->{'db_handle'};

    # Callers pass the name in; the object field is only a fallback.
    $db_name = $self->{'db_name'} unless defined $db_name;

    if(!defined $db_name || $db_name eq "") {
        warn("Unable to create database: no database name specified\n");
        return undef; # always a single scalar, like the do() result below
    }

    # https://stackoverflow.com/questions/5025768/how-can-i-create-a-mysql-database-from-a-perl-script
    return $dbh->do("create database $db_name"); # do() will return undef on fail, https://metacpan.org/pod/DBI#do
}
256
257
# Creates this collection's metadata table in the database currently in use.
# Each row holds one metadata value: did, sid, metaname and metavalue, plus
# an auto-incremented integer id that serves as the primary key.
# Returns the result of do() (undef on failure).
sub create_metadata_table {
    my ($self) = @_;

    my @column_defs = (
        "id INT NOT NULL AUTO_INCREMENT",
        "did VARCHAR(63) NOT NULL",
        "sid VARCHAR(63) NOT NULL",
        "metaname VARCHAR(127) NOT NULL",
        "metavalue VARCHAR(1023) NOT NULL",
        "PRIMARY KEY(id)",
    );

    my $create_sql = "CREATE TABLE " . $self->get_metadata_table_name()
        . " (" . join(", ", @column_defs) . ");";

    return $self->{'db_handle'}->do($create_sql);
}
268
269# TODO: Investigate: https://dev.mysql.com/doc/search/?d=10&p=1&q=FULLTEXT
270# 12.9.1 Natural Language Full-Text Searches
271# to see whether we have to index the 'fulltxt' column of the 'fulltext' tables
272# or let user edit this file, or add it as another option
# Creates this collection's full text table in the database currently in use.
# Each row holds did, sid and the text itself in the LONGTEXT column
# 'fulltxt', plus an auto-incremented integer id that serves as the primary
# key.
# Returns the result of do() (undef on failure).
sub create_fulltext_table {
    my ($self) = @_;

    my @column_defs = (
        "id INT NOT NULL AUTO_INCREMENT",
        "did VARCHAR(63) NOT NULL",
        "sid VARCHAR(63) NOT NULL",
        "fulltxt LONGTEXT",
        "PRIMARY KEY(id)",
    );

    my $create_sql = "CREATE TABLE " . $self->get_fulltext_table_name()
        . " (" . join(", ", @column_defs) . ");";

    return $self->{'db_handle'}->do($create_sql);
}
284
285# "IF EXISTS is used to prevent an error from occurring if the database does not exist. ... DROP DATABASE returns the number of tables that were removed. The DROP DATABASE statement removes from the given database directory those files and directories that MySQL itself may create during normal operation.Jun 20, 2012"
286# MySQL 8.0 Reference Manual :: 13.1.22 DROP DATABASE Syntax
287# https://dev.mysql.com/doc/en/drop-database.html
# Drops this collection's metadata table and then its full text table.
# A table that can't be dropped only results in a warning; the second drop
# is attempted regardless of how the first one went.
sub delete_collection_tables {
    my ($self) = @_;
    my $dbh = $self->{'db_handle'};

    print STDERR "### Build mode is removeold, so deleting tables for current collection\n";

    my $meta_table = $self->get_metadata_table_name();
    my $text_table = $self->get_fulltext_table_name();

    # drop table <tablename>
    $dbh->do("drop table $meta_table") or warn("@@@ Couldn't delete $meta_table");

    return $dbh->do("drop table $text_table") || warn("@@@ Couldn't delete $text_table");
}
300
301# Don't call this: it will delete the meta and full text tables for ALL collections in $db_name (localsite by default)!
302# This method is just here for debugging (for testing creating a database when there is none)
# Debugging aid only: drops the whole database $db_name.
# Returns 1 if the drop succeeded, 0 otherwise.
sub _delete_database {
    my ($self, $db_name) = @_;

    # "drop database dbname"
    my $dropped = $self->{'db_handle'}->do("drop database $db_name");

    return $dropped ? 1 : 0;
}
313
314
315########################### DB STATEMENTS ###########################
316
317# USEFUL: https://metacpan.org/pod/DBI
318# "Many methods have an optional \%attr parameter which can be used to pass information to the driver implementing the method. Except where specifically documented, the \%attr parameter can only be used to pass driver specific hints. In general, you can ignore \%attr parameters or pass it as undef."
319
320
321# https://www.guru99.com/insert-into.html
322# and https://dev.mysql.com/doc/refman/8.0/en/example-auto-increment.html
323#     for inserting multiple rows at once
324# https://www.perlmonks.org/bare/?node_id=316183
325# https://metacpan.org/pod/DBI#do
326# https://www.quora.com/What-is-the-difference-between-prepare-and-do-statements-in-Perl-while-we-make-a-connection-to-the-database-for-executing-the-query
327# https://docstore.mik.ua/orelly/linux/dbi/ch05_05.htm
328
329# https://metacpan.org/pod/DBI#performance
330# 'The q{...} style quoting used in this example avoids clashing with quotes that may be used in the SQL statement. Use the double-quote like qq{...} operator if you want to interpolate variables into the string. See "Quote and Quote-like Operators" in perlop for more details.'
# Prepares the parameterised INSERT statement for the metadata table, with
# placeholders for did, sid, metaname and metavalue (in that order).
# Returns the statement handle, on which the caller can call execute() once
# per row. If the statement can't be prepared, a warning is issued and undef
# is returned.
sub prepare_insert_metadata_row_stmthandle {
    my $self = shift (@_);
    my $dbh = $self->{'db_handle'};

    my $tablename = $self->get_metadata_table_name();

    # using qq{} since we want the $tablename variable to be filled in
    my $sth = $dbh->prepare(qq{INSERT INTO $tablename (did, sid, metaname, metavalue) VALUES (?, ?, ?, ?)});

    # Check the handle separately: "prepare(...) || warn(...)" would store
    # warn()'s true return value in $sth and pass it off as a handle.
    if(!$sth) {
        warn("Could not prepare insert statement for metadata table\n");
        return undef; # always a single scalar, as callers store the result
    }

    print STDERR "@@@@ Prepared meta insert statement: ".$sth->{'Statement'}."\n";

    return $sth;
}
348
# Prepares the parameterised INSERT statement for the full text table, with
# placeholders for did, sid and fulltxt (in that order).
# Returns the statement handle, on which the caller can call execute() once
# per row. If the statement can't be prepared, a warning is issued and undef
# is returned.
sub prepare_insert_fulltxt_row_stmthandle {
    my $self = shift (@_);
    my $dbh = $self->{'db_handle'};

    my $tablename = $self->get_fulltext_table_name();

    # using qq{} since we want the $tablename variable to be filled in
    my $sth = $dbh->prepare(qq{INSERT INTO $tablename (did, sid, fulltxt) VALUES (?, ?, ?)});

    # Check the handle separately: "prepare(...) || warn(...)" would store
    # warn()'s true return value in $sth and pass it off as a handle.
    if(!$sth) {
        warn("Could not prepare insert statement for fulltxt table\n");
        return undef; # always a single scalar, as callers store the result
    }

    print STDERR "@@@@ Prepared fulltext insert statement: ".$sth->{'Statement'}."\n";

    return $sth;
}
366
367
368## The 2 select statements used by GreenstoneSQLPlugin
369
370# Returns the statement handle that prepared and executed
371# a "SELECT * FROM <COLL>_metadata WHERE did = $oid" SQL statement.
372# Caller can call fetchrow_array() on returned statement handle, $sth
373# Have to use prepare() and execute() instead of do() since do() does
374# not allow for fetching result set thereafter:
375# do(): "This method  is typically most useful for non-SELECT statements that either cannot be prepared in advance (due to a limitation of the driver) or do not need to be executed repeatedly. It should not be used for SELECT statements because it does not return a statement handle (so you can't fetch any data)." https://metacpan.org/pod/release/TIMB/DBI-1.634_50/DBI.pm#do
# Prepares and executes a SELECT for all rows of the metadata table whose
# did column equals $oid (bound through a placeholder).
# Returns the executed statement handle, so the caller can fetch the rows
# with e.g. fetchrow_array().
sub select_from_metatable_matching_docid {
    my ($self, $oid) = @_;

    my $query = "SELECT * FROM " . $self->get_metadata_table_name() . " WHERE did = ?";

    my $sth = $self->{'db_handle'}->prepare($query);
    $sth->execute($oid); # will print msg on fail

    return $sth;
}
388
389# Returns the statement handle that prepared and executed
390# a "SELECT * FROM <COLL>_fulltxt WHERE did = $oid" SQL statement.
391# Caller can call fetchrow_array() on returned statement handle, $sth
# Prepares and executes a SELECT for all rows of the full text table whose
# did column equals $oid (bound through a placeholder).
# Returns the executed statement handle, so the caller can fetch the rows
# with e.g. fetchrow_array().
sub select_from_texttable_matching_docid {
    my ($self, $oid) = @_;

    my $query = "SELECT * FROM " . $self->get_fulltext_table_name() . " WHERE did = ?";

    my $sth = $self->{'db_handle'}->prepare($query);
    $sth->execute($oid); # will print msg on fail

    return $sth;
}
404
405# delete all records in metatable with specified docid
406# https://www.tutorialspoint.com/mysql/mysql-delete-query.htm
407# DELETE FROM table_name [WHERE Clause]
408# see example under 'do' at https://metacpan.org/pod/release/TIMB/DBI-1.634_50/DBI.pm
# Deletes every row of the metadata table whose did column equals $oid
# (bound through a placeholder).
# Returns the result of do(): a true value on success, undef on failure.
# On failure the handle's error string is also emitted as a warning.
sub delete_recs_from_metatable_with_docid {
    my $self= shift (@_);
    my ($oid) = @_;

    my $dbh = $self->{'db_handle'};
    my $meta_table = $self->get_metadata_table_name();

    my $rows_deleted = $dbh->do(qq{DELETE FROM $meta_table WHERE did = ?}, undef, $oid);

    # Warn separately from the return: warn() itself returns a true value, so
    # "do(...) or warn ..." as the last statement reports failure as success.
    warn $dbh->errstr unless $rows_deleted;

    return $rows_deleted;
}
420
421# delete all records in texttable with specified docid
# Deletes every row of the full text table whose did column equals $oid
# (bound through a placeholder).
# Returns the result of do(): a true value on success, undef on failure.
# On failure the handle's error string is also emitted as a warning.
sub delete_recs_from_texttable_with_docid {
    my $self= shift (@_);
    my ($oid) = @_;

    my $dbh = $self->{'db_handle'};
    my $fulltxt_table = $self->get_fulltext_table_name();

    my $rows_deleted = $dbh->do(qq{DELETE FROM $fulltxt_table WHERE did = ?}, undef, $oid);

    # Warn separately from the return: warn() itself returns a true value, so
    # "do(...) or warn ..." as the last statement reports failure as success.
    warn $dbh->errstr unless $rows_deleted;

    return $rows_deleted;
}
432
433# Can call this after connection succeeded to get the database handle, dbh,
434# if any specific DB operation (SQL statement, create/delete)
435# needs to be executed that is not already provided as a method of this class.
# Accessor for the stored database handle ($self->{'db_handle'}), which is
# undef if nothing has been stored there yet.
sub get_db_handle {
    my ($self) = @_;
    my $dbh = $self->{'db_handle'};
    return $dbh;
}
440
441################ HELPER METHODS ##############
442
443# More basic helper methods
# Name of this collection's metadata table: the table name prefix followed
# by "_metadata".
sub get_metadata_table_name {
    my ($self) = @_;
    return $self->{'tablename_prefix'} . "_metadata";
}
449
450# FULLTEXT is a reserved keyword in (My)SQL. https://dev.mysql.com/doc/refman/5.5/en/keywords.html
451# So we can't name a table or any of its columns "fulltext". We use "fulltxt" instead.
# Name of this collection's full text table: the table name prefix followed
# by "_fulltxt" (not "_fulltext", as FULLTEXT is a reserved (My)SQL keyword).
sub get_fulltext_table_name {
    my ($self) = @_;
    return $self->{'tablename_prefix'} . "_fulltxt";
}
457
458
459# I can get my version of table_exists to work, but it's not so ideal
460# Interesting that MySQL has non-standard command to CREATE TABLE IF NOT EXISTS and DROP TABLE IF EXISTS,
461# see https://www.perlmonks.org/bare/?node=DBI%20Recipes
462#    The page further has a table_exists function that could work with proper comparison
463# TODO Q: Couldn't get the first solution at https://www.perlmonks.org/bare/?node_id=500050 to work though
# Checks whether a table called exactly $table_name is among the tables
# reported by the database handle's tables() method.
# Returns 1 if it is, 0 otherwise.
#
# The listed names may come back quoted and qualified (for instance
# `db`.`table`), so each one is reduced to its bare table name before an
# exact, case-sensitive string comparison. A pattern match is not used:
# it would also accept any table whose name merely contains $table_name
# (another collection's "lucene_demo_metadata" for "demo_metadata") and
# would treat characters in $table_name as regex syntax.
# NOTE(review): assumes table names themselves contain no '.' or quote
# characters - holds for names built by this class from collection names,
# confirm if other tables are ever looked up.
sub table_exists {
    my $self = shift (@_);
    my $dbh = $self->{'db_handle'};
    my ($table_name) = @_;

    foreach my $listed_name ($dbh->tables) {
        my $bare_name = $listed_name;
        $bare_name =~ s/\A.*\.//s; # drop any catalog/schema qualifier
        $bare_name =~ s/[`"]//g;   # drop identifier quoting
        return 1 if ($bare_name eq $table_name);
    }
    return 0;
}
476
4771;
Note: See TracBrowser for help on using the browser.