source: main/trunk/greenstone2/perllib/plugouts/MySQLPlugout.pm@ 32523

Last change on this file since 32523 was 32523, checked in by ak19, 6 years ago

UNDOING PART OF RECENT COMMIT: Calling superclass GreenstoneXMLPlugout's begin(), end(), close_group_output(), pre_saveas() and post_saveas() only if MySQLPlugout's process mode is not set to output ALL (meta and text) to sql db. But they should be called at all times even when process mode is set to output all, because assoc files must be written out and breadcrumbs must be written to doc.xml.

File size: 21.6 KB
Line 
1###########################################################################
2#
3# GreenstoneXMLPlugout.pm -- the plugout module for Greenstone Archives
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package MySQLPlugout;
27
28use strict;
29no strict 'refs';
30no strict 'subs';
31
32use util;
33use FileUtils;
34use GreenstoneXMLPlugout;
35use docprint;
36
37use IPC::Open2;
38use POSIX ":sys_wait_h"; # for waitpid, http://perldoc.perl.org/functions/waitpid.html
39
40# TODO: SIGTERM rollback and disconnect?
41
42
43# this plugout does not output xml to a file, but outputs rows into a mysql table
44sub BEGIN {
45 @MySQLPlugout::ISA = ('GreenstoneXMLPlugout');
46}
47
48
49# TODO: deal with -removeold and everything? Or type out instructions for user
50
51# TODO Q: what is "group" in GreenstoneXMLPlugout?
52# TODO Q: site_name only exists for GS3. What about GS2?
53
54my $process_mode_list =
55 [ { 'name' => "meta_only",
56 'desc' => "{MySQLPlugout.process_mode.meta_only}" },
57 { 'name' => "text_only",
58 'desc' => "{MySQLPlugout.process_mode.text_only}" },
59 { 'name' => "all",
60 'desc' => "{MySQLPlugout.process_mode.all}" } ];
61
62my $arguments = [
63 { 'name' => "process_mode",
64 'desc' => "{MySQLPlugout.process_mode}",
65 'type' => "enum",
66 'list' => $process_mode_list,
67 'deft' => "all",
68 'reqd' => "no",
69 'hiddengli' => "no"} ];
70
71my $options = { 'name' => "MySQLPlugout",
72 'desc' => "{MySQLPlugout.desc}",
73 'abstract' => "no",
74 'inherits' => "yes",
75 'args' => $arguments };
76
77sub new {
78 my ($class) = shift (@_);
79 my ($plugoutlist, $inputargs,$hashArgOptLists) = @_;
80 push(@$plugoutlist, $class);
81
82 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
83 push(@{$hashArgOptLists->{"OptList"}},$options);
84
85 my $self = new GreenstoneXMLPlugout($plugoutlist,$inputargs,$hashArgOptLists);
86
87 if ($self->{'info_only'}) {
88 # don't worry about any options etc
89 return bless $self, $class;
90 }
91 print STDERR "***** MySQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n";
92
93 return bless $self, $class;
94}
95
96# connect here and ensure all tables and databases exist
97sub begin {
98
99 my $self= shift (@_);
100
101 ########### TODO: these should be set from cmdline/GLI options to plugout #########
102 $self->{'db_driver'} = "mysql";
103 $self->{'site_name'} = "localsite";
104 $self->{'db_client_user'} = "root";
105 $self->{'db_client_pwd'} = "6reenstone3";
106 #$self->{'db_host'} = "127.0.0.1";
107 #$self->{'db_encoding'} = "utf8";
108 #TODO: proc_mode is also a saveas option
109
110 ############ LOAD NECESSARY OPTIONS ###########
111 print STDERR "########## COLLECTION: ". $ENV{'GSDLCOLLECTION'}."\n";
112 $self->{'collection_name'} = $ENV{'GSDLCOLLECTION'};
113
114 if(!$self->connect_to_db()) {
115 # This is fatal for the plugout, let's terminate here
116 # PrintError would already have displayed the warning message on connection fail
117 die("Could not connect to db. Can't proceed.\n");
118 }
119 if(!$self->load_db_and_tables()) {
120 # This is fatal for the plugout, let's terminate here
121 # PrintError would already have displayed the warning message on connection fail
122 die("Could not use db or prepare its tables. Can't proceed.\n");
123 }
124
125 # prepare the shared/common HANDLES to SQL insert statements that contain placeholders
126 # and which we will reuse repeatedly when actually executing the insert statements
127 my $proc_mode = $self->{'process_mode'};
128 if($proc_mode eq "all" || $proc_mode eq "meta_only" ) {
129 $self->{'metadata_prepared_insert_statement_handle'} = $self->prepare_insert_metadata_row_stmthandle();
130 }
131 if($proc_mode eq "all" || $proc_mode eq "text_only" ) {
132 $self->{'fulltxt_prepared_insert_statement_handle'} = $self->prepare_insert_fulltxt_row_stmthandle();
133 }
134
135 # if setting up to work with sql db failed, we'd have terminated and wouldn't come up to here:
136 # won't bother preparing GreenstoneXMLPlugout by calling superclass' begin()
137 # finally, can call begin on super - important as doc.xml is opened as a group etc
138
139 $self->GreenstoneXMLPlugout::begin(@_);
140}
141
142# disconnect from database here, see inexport.pm
143sub end
144{
145 my $self = shift(@_);
146
147 # do the superclass stuff first, as any sql db failures should not prevent superclass cleanup
148 $self->GreenstoneXMLPlugout::end(@_);
149
150 $self->disconnect_from_db() || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n"); # disconnect_from_db() will also issue a warning, but this may be clearer
151}
152
153
154# TODO: check arc-inf.db for whether each entry is to be deleted/indexed/reindexed/been indexed
155sub saveas {
156 my $self = shift (@_);
157 my ($doc_obj, $doc_dir) = @_;
158
159 my $proc_mode = $self->{'process_mode'};
160
161 # 1. pre save out and saving debug handle
162
163 # must call superclass (pre/post) saveas methods, as they handle assoc_files too
164 my ($docxml_outhandler, $output_file) = $self->GreenstoneXMLPlugout::pre_saveas(@_);
165
166 $self->{'debug_outhandle'} = $docxml_outhandler if ($self->{'debug'}); # STDOUT if debug
167
168 # TODO: also set debugging in begin()? Then stmts creating db and tables also sent to debug out and not executed
169
170 # TODO: remove unused old_unused_saveas from GreenstoneXMLPlugout
171
172
173 # 2. overriding saving behaviour to do what the superclass does PLUS saving to sql db
174
175 #NOTE: if proc_mode == all, then "breadcrumbs" go into both meta and txt elements of doc.xml:
176 # statements pointing viewer to the sql db for contents
177
178 # write the INVERSE into doc.xml as to what is written to the db
179 my $docxml_output_options = { 'output' => docprint::OUTPUT_NONE };
180 if($proc_mode eq "meta_only" ) { # since only meta to go into MySQL db, text will go into docxml
181 $docxml_output_options->{'output'} = docprint::OUTPUT_TEXT_ONLY;
182 } elsif($proc_mode eq "text_only" ) { # since only full text to go into MySQL db, meta will go into docxml
183 $docxml_output_options->{'output'} = docprint::OUTPUT_META_ONLY;
184 }
185
186 # now we've prepared to write out whatever is meant to go into docxml
187 # and can do actual the steps superclass GreenstoneXMLPlugout carries out to write out docxml
188 # So: write out the doc xml file for the current document
189 my $section_text = &docprint::get_section_xml($doc_obj, $docxml_output_options);
190 print $docxml_outhandler $section_text;
191
192
193 # We also write out whatever needs to go into the MySQL database
194 $self->write_meta_and_text($doc_obj);
195
196
197 # 3. post save out
198 #$self->GreenstoneXMLPlugout::post_saveas(@_);
199 $self->GreenstoneXMLPlugout::post_saveas($doc_obj, $doc_dir, $docxml_outhandler, $output_file);
200
201
202 # database connection is closed in end() method
203 # so we don't open and close over and over for each doc during a single build
204}
205
206
207# write meta and/or text PER DOC out to DB
208sub write_meta_and_text {
209 my $self = shift (@_);
210 my ($doc_obj) = @_;
211 my $root_section = $doc_obj->get_top_section();
212 my $doc_oid = $doc_obj->get_OID(); # we're processing a single doc at a time, so single OID
213
214 # load the prepared INSERT statement handles for both tables (can be undef for any table depending on whether meta_only or txt_only are set)
215 my $metadata_table_sth = $self->{'metadata_prepared_insert_statement_handle'};
216 my $fulltxt_table_sth = $self->{'fulltxt_prepared_insert_statement_handle'};
217
218 $self->recursive_write_meta_and_text($doc_obj, $root_section, $metadata_table_sth, $fulltxt_table_sth);
219}
220
221# Perl: Reading or Writing to Another Program
222# https://nnc3.com/mags/Perl3/cookbook/ch16_05.htm
223sub recursive_write_meta_and_text {
224 my $self = shift (@_);
225 my ($doc_obj, $doc_oid, $section, $metadata_table_sth, $fulltxt_table_sth) = @_;
226
227 # If section=ROOT, write "root" as section name into table
228 # doc->get_top_section() is the name of the doc root section, which is ""
229 my $section_name = ($section eq "") ? "root" : $section;
230
231 my $section_ptr = $doc_obj->_lookup_section ($section);
232 return "" unless defined $section_ptr;
233
234 my $debug_out = $self->{'debug_outhandle'};
235
236 #my $proc_mode = $self->{'process_mode'};
237 #if($proc_mode eq "all" || $proc_mode eq "meta_only" ) {
238 if($metadata_table_sth) { # meta insert statement handle will be undef if not writing meta
239
240 foreach my $data (@{$section_ptr->{'metadata'}}) {
241 my $meta_name = $data->[0];
242 my $escaped_meta_value = &escape_text($data->[1]);
243
244 # Write out the current section's meta to collection db's METADATA table
245
246 # for each set of values to write to meta table, execute the prepared statement, filling in the values
247
248 if($self->{'debug'}) {
249 # just print the statement we were going to execute
250
251 print $debug_out $metadata_table_sth->Statement . "($doc_oid, $section_name, $meta_name, $escaped_meta_value)\n";
252 }
253 else {
254
255 $metadata_table_sth->execute($doc_oid, $section_name, $meta_name, $escaped_meta_value);
256 #|| warn ("Unable to write metadata row to db:\n\tOID $doc_oid, section $section_name,\n\tmeta name: $meta_name, val: $escaped_meta_value");
257 # Execution failure will print out info anyway: since db connection sets PrintError
258 }
259 }
260 }
261
262 #if($proc_mode eq "all" || $proc_mode eq "text_only" ) {
263 if($fulltxt_table_sth) { # fulltxt insert statement handle will be undef if not writing fulltxt
264
265 if($self->{'debug'}) {
266 # just print the statement we were going to execute, minus the fulltxt value
267 my $txt_repr = $section_ptr->{'text'} ? "<TXT>" : "NULL";
268 print $debug_out $fulltxt_table_sth->Statement . "($doc_oid, $section_name, $txt_repr)\n";
269 } else {
270 my $section_text = &escape_text($section_ptr->{'text'});
271
272 # fulltxt column can be SQL NULL. undef value gets written out as NULL:
273 # https://stackoverflow.com/questions/12708633/which-one-represents-null-undef-or-empty-string
274
275 # Write out the current section's text to collection db's FULLTeXT table
276 $fulltxt_table_sth->execute($doc_oid, $section_name, $section_text);
277 #|| warn ("Unable to write fulltxt row to db for row:\n\tOID $doc_oid, section $section_name");
278 # Execution failure will print out info anyway: since db connection sets PrintError
279 }
280 }
281
282 # output all subsections: RECURSIVE CALL
283 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
284 &recursive_write_meta_and_text($doc_obj, $doc_oid, "$section.$subsection", $metadata_table_sth, $fulltxt_table_sth);
285 }
286}
287
288#################################
289
290# Database access related functions
291# http://g2pc1.bu.edu/~qzpeng/manual/MySQL%20Commands.htm
292# https://www.guru99.com/insert-into.html
293
294# TODO Q: What on cancelling a build: delete table? But what if it was a rebuild and the rebuild is cancelled (not the original build)?
295# Do we create a copy of the orig database as backup, then start populating current db, and if cancelled, delete current db and RENAME backup table to current?
296# https://stackoverflow.com/questions/3280006/duplicating-a-mysql-table-indexes-and-data
297# BUT what if the table is HUGE? (Think of a collection with millions of docs.) Huge overhead in copying?
298# The alternative is we just quit on cancel, but then: cancel could leave the table in a partial committed state, with no way of rolling back.
299# Unless they do a full rebuild, which will recreate the table from scratch?
300# SOLUTION-> rollback transaction on error, see https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
301# But then should set AutoCommit to off on connection, and remember to commit every time
302
303#################
304# Database functions that use the perl DBI module (with the DBD driver module for mysql)
305#################
306
307# THE NEW DB FUNCTIONS
308# NOTE: FULLTEXT is a reserved keyword in (My)SQL. So we can't name a table or any of its columns "fulltext".
309# https://dev.mysql.com/doc/refman/5.5/en/keywords.html
310
311# TODO: Consider AutoCommit status (and Autocommit off allowing commit or rollback for GS coll build cancel) later
312
313sub connect_to_db {
314 my $self= shift (@_);
315
316 my $db_driver = $self->{'db_driver'};
317 my $db_user = $self->{'db_client_user'} || "root";
318 my $db_pwd = $self->{'db_client_pwd'};
319 my $db_host = $self->{'db_host'} || "127.0.0.1";
320 my $db_enc = $self->{'db_encoding'} || "utf8";
321
322 #my $db_name = $self->{'site_name'};
323
324 # try connecting to the mysql db, if that fails it will die
325 # so don't bother preparing GreenstoneXMLPlugout by calling superclass' begin()
326
327 # localhost doesn't work for us, but 127.0.0.1 works
328 # https://metacpan.org/pod/DBD::mysql
329 # "The hostname, if not specified or specified as '' or 'localhost', will default to a MySQL server
330 # running on the local machine using the default for the UNIX socket. To connect to a MySQL server
331 # on the local machine via TCP, you must specify the loopback IP address (127.0.0.1) as the host."
332 #my $connect_str = "dbi:$db_driver:database=$db_name;host=$db_host";
333 my $connect_str = "dbi:$db_driver:host=$db_host"; # don't provide db, so we can check the db is there
334 my $dbh = DBI->connect("$connect_str", $db_user, $db_pwd,
335 {
336 ShowErrorStatement => 1, # more informative as DBI will append failed SQL stmt to error message
337 PrintError => 1, # on by default, but being explicit
338 RaiseError => 0, # off by default, but being explicit
339 AutoCommit => 1, # on by default, but being explicit
340 });
341
342 if(!$dbh) {
343 # NOTE, despite handle dbh being undefined, error code will be in DBI->err
344 return 0;
345 }
346
347 # set encoding https://metacpan.org/pod/DBD::mysql
348 # https://dev.mysql.com/doc/refman/5.7/en/charset.html
349 # https://dev.mysql.com/doc/refman/5.7/en/charset-conversion.html
350 # Setting the encoding at db server level.
351 # Not sure if this command is mysql specific:
352 my $stmt = "set NAMES '" . $db_enc . "'";
353 $dbh->do($stmt) || warn("Unable to set charset encoding at db server level to: " . $db_enc . "\n");
354
355 # if we're here, then connection succeeded, store handle
356 $self->{'db_handle'} = $dbh;
357 return 1;
358}
359
360sub load_db_and_tables {
361 my $self= shift (@_);
362 my $db_name = $self->{'site_name'}; # one database per GS3 site
363 my $dbh = $self->{'db_handle'};
364
365 # perl DBI switch database: https://www.perlmonks.org/?node_id=995434
366 # do() returns undef on error.
367 # connection succeeded, try to load our database. If that didn't work, attempt to create db
368 my $success = $dbh->do("use $db_name");
369
370 if(!$success && $dbh->err == 1049) { # "Unknown database" error has code 1049 (mysql only?) meaning db doesn't exist yet
371 # attempt to create the db and its tables
372 $self->create_db($db_name) || return 0;
373
374 # once more attempt to use db, now that it exists
375 $dbh->do("use $db_name") || return 0;
376 #$dbh->do("use localsite") or die "Error (code" . $dbh->err ."): " . $dbh->errstr . "\n";
377
378 # attempt to create tables in current db
379 $self->create_metadata_table() || return 0;
380 $self->create_fulltext_table() || return 0;
381
382 $success = 1;
383 }
384 elsif($success) { # database existed and loaded successfully, but
385 # before proceeding check that the current collection's tables exist
386
387 # attempt to create tables in current db
388 if(!$self->table_exists($self->{'collection_name'} . "metadata")) {
389 $self->create_metadata_table() || return 0;
390 }
391 if(!$self->table_exists($self->{'collection_name'} . "fulltxt")) {
392 $self->create_fulltext_table() || return 0;
393 }
394 }
395
396 return $success; # could still return 0, if database failed to load with an error code != 1049
397}
398
399# disconnect from db - https://metacpan.org/pod/DBI#disconnect
400# TODO: make sure to have committed or rolled back before disconnect
401# and that you've call finish() on statement handles if any fetch remnants remain
402sub disconnect_from_db {
403 my $self= shift (@_);
404 my $dbh = $self->{'db_handle'};
405
406 # make sure any active stmt handles are finished
407 # NO: "When all the data has been fetched from a SELECT statement, the driver will automatically call finish for you. So you should not call it explicitly except when you know that you've not fetched all the data from a statement handle and the handle won't be destroyed soon."
408
409 #$meta_sth = $self->{'metadata_prepared_insert_statement_handle'};
410 #$txt_sth = $self->{'fulltxt_prepared_insert_statement_handle'};
411 #$meta_sth->finish() if($meta_sth);
412 #$txt_sth->finish() if($txt_sth);
413
414 my $rc = $dbh->disconnect or warn $dbh->errstr; # The handle is of little use after disconnecting. Possibly PrintError already prints a warning and this duplicates it?
415 return $rc;
416}
417
418sub create_db {
419 my $self= shift (@_);
420 my $db_name = $self->{'site_name'};
421 my $dbh = $self->{'db_handle'};
422
423 # https://stackoverflow.com/questions/5025768/how-can-i-create-a-mysql-database-from-a-perl-script
424 return $dbh->do("create database $db_name"); # do() will return undef on fail, https://metacpan.org/pod/DBI#do
425}
426
427sub create_metadata_table {
428 my $self= shift (@_);
429 my $dbh = $self->{'db_handle'};
430
431 my $table_name = $self->{'collection_name'} . "metadata";
432
433 # If using an auto incremented primary key:
434 my $stmt = "CREATE TABLE $table_name (id INT NOT NULL AUTO_INCREMENT, did VARCHAR(63) NOT NULL, sid VARCHAR(63) NOT NULL, metaname VARCHAR(127) NOT NULL, metavalue VARCHAR(1023) NOT NULL, PRIMARY KEY(id));";
435 return $dbh->do($stmt);
436}
437
438# TODO: Investigate: https://dev.mysql.com/doc/search/?d=10&p=1&q=FULLTEXT
439# 12.9.1 Natural Language Full-Text Searches
440# to see whether we have to index the 'fulltxt' column of the 'fulltext' tables
441# or let user edit this file, or add it as another option
442sub create_fulltext_table {
443 my $self= shift (@_);
444 my $dbh = $self->{'db_handle'};
445
446 my $table_name = $self->{'collection_name'} . "fulltxt";
447
448 # If using an auto incremented primary key:
449 my $stmt = "CREATE TABLE $table_name (id INT NOT NULL AUTO_INCREMENT, did VARCHAR(63) NOT NULL, sid VARCHAR(63) NOT NULL, fulltxt LONGTEXT, PRIMARY KEY(id));";
450 return $dbh->do($stmt);
451
452}
453
454
455# USEFUL: https://metacpan.org/pod/DBI
456# "Many methods have an optional \%attr parameter which can be used to pass information to the driver implementing the method. Except where specifically documented, the \%attr parameter can only be used to pass driver specific hints. In general, you can ignore \%attr parameters or pass it as undef."
457
458
459# https://www.guru99.com/insert-into.html
460# and https://dev.mysql.com/doc/refman/8.0/en/example-auto-increment.html
461# for inserting multiple rows at once
462# https://www.perlmonks.org/bare/?node_id=316183
463# https://metacpan.org/pod/DBI#do
464# https://www.quora.com/What-is-the-difference-between-prepare-and-do-statements-in-Perl-while-we-make-a-connection-to-the-database-for-executing-the-query
465# https://docstore.mik.ua/orelly/linux/dbi/ch05_05.htm
466
467# https://metacpan.org/pod/DBI#performance
468# 'The q{...} style quoting used in this example avoids clashing with quotes that may be used in the SQL statement. Use the double-quote like qq{...} operator if you want to interpolate variables into the string. See "Quote and Quote-like Operators" in perlop for more details.'
469sub prepare_insert_metadata_row_stmthandle {
470 my $self = shift (@_);
471 my ($did, $sid, $metaname, $metavalue) = @_;
472 my $dbh = $self->{'db_handle'};
473
474 my $tablename = $self->{'colname'}."_metadata";
475
476 #my $stmt = "INSERT INTO $tablename (did, sid, metaname, metavalue) VALUES ('$did', '$sid', '$metaname', '$metavalue');"; # ?, ?, ?, ?
477
478 # using qq{} since we want $tablename placeholder to be filled in
479 # returns Statement Handle object!
480 my $sth = $dbh->prepare(qq{INSERT INTO $tablename (did, sid, metaname, metavalue) VALUES (?, ?, ?, ?)"}) || warn("Could not prepare insert statement for metadata table\n");
481
482 return $sth;
483}
484
485sub prepare_insert_fulltxt_row_stmthandle {
486 my $self = shift (@_);
487 my ($did, $sid, $fulltext) = @_;
488 my $dbh = $self->{'db_handle'};
489
490 my $tablename = $self->{'colname'}."_fulltxt";
491
492 #my $stmt = "INSERT INTO $tablename (did, sid, fulltxt) VALUES ('$did', '$sid', '$fulltext');"; ?, ?, ?
493
494 # using qq{} since we want $tablename placeholder to be filled in
495 # returns Statement Handle object!
496 my $sth = $dbh->prepare(qq{INSERT INTO $tablename (did, sid, fulltxt) VALUES (?, ?, ?)"}) || warn("Could not prepare insert statement for fulltxt table\n");
497
498 return $sth;
499}
500
501# I can get my version of table_exists to work, but it's not so ideal
502# Interesting that MySQL has non-standard command to CREATE TABLE IF NOT EXISTS and DROP TABLE IF EXISTS,
503# see https://www.perlmonks.org/bare/?node=DBI%20Recipes
504# The page further has a table_exists function that could work with proper comparison
505# Couldn't get the first solution at https://www.perlmonks.org/bare/?node_id=500050 to work though
506sub table_exists {
507 my ($dbh,$table_name) = @_;
508
509 my @table_list = $dbh->tables;
510 #my $tables_str = @table_list[0];
511 foreach my $table (@table_list) {
512 return 1 if ($table =~ m/$table_name/);
513 }
514 return 0;
515}
516
5171;
Note: See TracBrowser for help on using the repository browser.