root/main/trunk/greenstone2/perllib/plugouts/MySQLPlugout.pm @ 32523

Revision 32523, 21.6 KB (checked in by ak19, 21 months ago)

UNDOING PART OF RECENT COMMIT: Calling superclass GreenstoneXMLPlugout's begin(), end(), close_group_output(), pre_saveas() and post_saveas() only if MySQLPlugout's process mode is not set to output ALL (meta and text) to sql db. But they should be called at all times even when process mode is set to output all, because assoc files must be written out and breadcrumbs must be written to doc.xml.

Line 
1###########################################################################
2#
3# GreenstoneXMLPlugout.pm -- the plugout module for Greenstone Archives
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package MySQLPlugout;
27
28use strict;
29no strict 'refs';
30no strict 'subs';
31
32use util;
33use FileUtils;
34use GreenstoneXMLPlugout;
35use docprint;
36
37use IPC::Open2;
38use POSIX ":sys_wait_h"; # for waitpid, http://perldoc.perl.org/functions/waitpid.html
39
40# TODO: SIGTERM rollback and disconnect?
41
42
43# this plugout does not output xml to a file, but outputs rows into a mysql table
44sub BEGIN {
45    @MySQLPlugout::ISA = ('GreenstoneXMLPlugout');
46}
47
48
49# TODO: deal with -removeold and everything? Or type out instructions for user
50
51# TODO Q: what is "group" in GreenstoneXMLPlugout?
52# TODO Q: site_name only exists for GS3. What about GS2?
53
54my $process_mode_list =
55    [ { 'name' => "meta_only",
56        'desc' => "{MySQLPlugout.process_mode.meta_only}" },     
57      { 'name' => "text_only",
58        'desc' => "{MySQLPlugout.process_mode.text_only}" },
59      { 'name' => "all",
60        'desc' => "{MySQLPlugout.process_mode.all}" } ];
61
62my $arguments = [
63       { 'name' => "process_mode",
64     'desc' => "{MySQLPlugout.process_mode}",
65     'type' => "enum",
66     'list' => $process_mode_list,
67     'deft' => "all",
68     'reqd' => "no",
69     'hiddengli' => "no"} ];
70
71my $options = { 'name'     => "MySQLPlugout",
72        'desc'     => "{MySQLPlugout.desc}",
73        'abstract' => "no",
74        'inherits' => "yes",
75        'args'     => $arguments };
76
77sub new {
78    my ($class) = shift (@_);
79    my ($plugoutlist, $inputargs,$hashArgOptLists) = @_;
80    push(@$plugoutlist, $class);
81
82    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
83    push(@{$hashArgOptLists->{"OptList"}},$options);
84
85    my $self = new GreenstoneXMLPlugout($plugoutlist,$inputargs,$hashArgOptLists);
86   
87    if ($self->{'info_only'}) {
88        # don't worry about any options etc
89        return bless $self, $class;
90    }
91    print STDERR "***** MySQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n";
92   
93    return bless $self, $class;
94}
95
96# connect here and ensure all tables and databases exist
97sub begin {
98
99    my $self= shift (@_);
100
101    ########### TODO: these should be set from cmdline/GLI options to plugout #########
102    $self->{'db_driver'} = "mysql";
103    $self->{'site_name'} = "localsite";   
104    $self->{'db_client_user'} = "root";
105    $self->{'db_client_pwd'} = "6reenstone3";
106    #$self->{'db_host'} = "127.0.0.1";
107    #$self->{'db_encoding'} = "utf8";
108    #TODO: proc_mode is also a saveas option
109   
110    ############ LOAD NECESSARY OPTIONS ###########
111    print STDERR "########## COLLECTION: ". $ENV{'GSDLCOLLECTION'}."\n";
112    $self->{'collection_name'} = $ENV{'GSDLCOLLECTION'};
113   
114    if(!$self->connect_to_db()) {
115    # This is fatal for the plugout, let's terminate here
116    # PrintError would already have displayed the warning message on connection fail   
117    die("Could not connect to db. Can't proceed.\n");
118    }
119    if(!$self->load_db_and_tables()) {
120    # This is fatal for the plugout, let's terminate here
121    # PrintError would already have displayed the warning message on connection fail   
122    die("Could not use db or prepare its tables. Can't proceed.\n");
123    }
124
125    # prepare the shared/common HANDLES to SQL insert statements that contain placeholders
126    # and which we will reuse repeatedly when actually executing the insert statements
127    my $proc_mode = $self->{'process_mode'};
128    if($proc_mode eq "all" || $proc_mode eq "meta_only" ) {
129    $self->{'metadata_prepared_insert_statement_handle'} = $self->prepare_insert_metadata_row_stmthandle();
130    }
131    if($proc_mode eq "all" || $proc_mode eq "text_only" ) {
132    $self->{'fulltxt_prepared_insert_statement_handle'} = $self->prepare_insert_fulltxt_row_stmthandle();
133    }
134
135    # if setting up to work with sql db failed, we'd have terminated and wouldn't come up to here:
136    # won't bother preparing GreenstoneXMLPlugout by calling superclass' begin()
137    # finally, can call begin on super - important as doc.xml is opened as a group etc
138   
139    $self->GreenstoneXMLPlugout::begin(@_);
140}
141
142# disconnect from database here, see inexport.pm
143sub end
144{
145    my $self = shift(@_);
146
147    # do the superclass stuff first, as any sql db failures should not prevent superclass cleanup
148    $self->GreenstoneXMLPlugout::end(@_);   
149   
150    $self->disconnect_from_db() || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n"); # disconnect_from_db() will also issue a warning, but this may be clearer
151}
152 
153   
154# TODO: check arc-inf.db for whether each entry is to be deleted/indexed/reindexed/been indexed
155sub saveas {
156    my $self = shift (@_);
157    my ($doc_obj, $doc_dir) = @_;
158
159    my $proc_mode = $self->{'process_mode'};
160   
161    # 1. pre save out and saving debug handle
162
163    # must call superclass (pre/post) saveas methods, as they handle assoc_files too
164    my ($docxml_outhandler, $output_file) = $self->GreenstoneXMLPlugout::pre_saveas(@_);
165
166    $self->{'debug_outhandle'} = $docxml_outhandler if ($self->{'debug'}); # STDOUT if debug
167
168    # TODO: also set debugging in begin()? Then stmts creating db and tables also sent to debug out and not executed
169
170    # TODO: remove unused old_unused_saveas from GreenstoneXMLPlugout
171   
172   
173    # 2. overriding saving behaviour to do what the superclass does PLUS saving to sql db   
174
175    #NOTE: if proc_mode == all, then "breadcrumbs" go into both meta and txt elements of doc.xml:
176    # statements pointing viewer to the sql db for contents
177   
178    # write the INVERSE into doc.xml as to what is written to the db   
179    my $docxml_output_options = { 'output' => docprint::OUTPUT_NONE };
180    if($proc_mode eq "meta_only" ) { # since only meta to go into MySQL db, text will go into docxml
181    $docxml_output_options->{'output'} = docprint::OUTPUT_TEXT_ONLY;
182    } elsif($proc_mode eq "text_only" ) { # since only full text to go into MySQL db, meta will go into docxml
183    $docxml_output_options->{'output'} = docprint::OUTPUT_META_ONLY;
184    }
185   
186    # now we've prepared to write out whatever is meant to go into docxml
187    # and can do actual the steps superclass GreenstoneXMLPlugout carries out to write out docxml
188    # So: write out the doc xml file for the current document
189    my $section_text = &docprint::get_section_xml($doc_obj, $docxml_output_options);
190    print $docxml_outhandler $section_text;   
191   
192   
193    # We also write out whatever needs to go into the MySQL database
194    $self->write_meta_and_text($doc_obj);
195
196   
197    # 3. post save out
198    #$self->GreenstoneXMLPlugout::post_saveas(@_);
199    $self->GreenstoneXMLPlugout::post_saveas($doc_obj, $doc_dir, $docxml_outhandler, $output_file);
200   
201   
202    # database connection is closed in end() method
203    # so we don't open and close over and over for each doc during a single build
204}
205
206
207# write meta and/or text PER DOC out to DB
208sub write_meta_and_text {
209    my $self = shift (@_);
210    my ($doc_obj) = @_;
211    my $root_section = $doc_obj->get_top_section();
212    my $doc_oid = $doc_obj->get_OID(); # we're processing a single doc at a time, so single OID
213
214    # load the prepared INSERT statement handles for both tables (can be undef for any table depending on whether meta_only or txt_only are set)
215    my $metadata_table_sth = $self->{'metadata_prepared_insert_statement_handle'};
216    my $fulltxt_table_sth = $self->{'fulltxt_prepared_insert_statement_handle'};
217   
218    $self->recursive_write_meta_and_text($doc_obj, $root_section, $metadata_table_sth, $fulltxt_table_sth);
219}
220
221# Perl: Reading or Writing to Another Program
222# https://nnc3.com/mags/Perl3/cookbook/ch16_05.htm
223sub recursive_write_meta_and_text {
224    my $self = shift (@_);
225    my ($doc_obj, $doc_oid, $section, $metadata_table_sth, $fulltxt_table_sth) = @_;   
226
227    # If section=ROOT, write "root" as section name into table
228    # doc->get_top_section() is the name of the doc root section, which is ""
229    my $section_name = ($section eq "") ? "root" : $section;
230   
231    my $section_ptr = $doc_obj->_lookup_section ($section);
232    return "" unless defined $section_ptr;
233
234    my $debug_out = $self->{'debug_outhandle'};
235   
236    #my $proc_mode = $self->{'process_mode'};
237    #if($proc_mode eq "all" || $proc_mode eq "meta_only" ) {
238    if($metadata_table_sth) { # meta insert statement handle will be undef if not writing meta
239   
240    foreach my $data (@{$section_ptr->{'metadata'}}) {
241        my $meta_name = $data->[0];
242        my $escaped_meta_value = &escape_text($data->[1]);
243
244        # Write out the current section's meta to collection db's METADATA table       
245       
246        # for each set of values to write to meta table, execute the prepared statement, filling in the values
247
248        if($self->{'debug'}) {
249        # just print the statement we were going to execute
250
251        print $debug_out $metadata_table_sth->Statement . "($doc_oid, $section_name, $meta_name, $escaped_meta_value)\n";
252        }
253        else {
254       
255        $metadata_table_sth->execute($doc_oid, $section_name, $meta_name, $escaped_meta_value);
256        #|| warn ("Unable to write metadata row to db:\n\tOID $doc_oid, section $section_name,\n\tmeta name: $meta_name, val: $escaped_meta_value");
257        # Execution failure will print out info anyway: since db connection sets PrintError
258        }
259    }
260    }
261   
262    #if($proc_mode eq "all" || $proc_mode eq "text_only" ) {
263    if($fulltxt_table_sth) { # fulltxt insert statement handle will be undef if not writing fulltxt
264
265    if($self->{'debug'}) {
266        # just print the statement we were going to execute, minus the fulltxt value
267        my $txt_repr = $section_ptr->{'text'} ? "<TXT>" : "NULL";
268        print $debug_out $fulltxt_table_sth->Statement . "($doc_oid, $section_name, $txt_repr)\n";
269    } else {
270        my $section_text = &escape_text($section_ptr->{'text'});
271       
272        # fulltxt column can be SQL NULL. undef value gets written out as NULL:
273        # https://stackoverflow.com/questions/12708633/which-one-represents-null-undef-or-empty-string
274       
275        # Write out the current section's text to collection db's FULLTeXT table
276        $fulltxt_table_sth->execute($doc_oid, $section_name, $section_text);
277        #|| warn ("Unable to write fulltxt row to db for row:\n\tOID $doc_oid, section $section_name");
278        # Execution failure will print out info anyway: since db connection sets PrintError
279    }
280    }
281   
282    # output all subsections: RECURSIVE CALL
283    foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
284    &recursive_write_meta_and_text($doc_obj, $doc_oid, "$section.$subsection", $metadata_table_sth, $fulltxt_table_sth);
285    }
286}
287
288#################################
289
290# Database access related functions
291# http://g2pc1.bu.edu/~qzpeng/manual/MySQL%20Commands.htm
292# https://www.guru99.com/insert-into.html
293
294# TODO Q: What on cancelling a build: delete table? But what if it was a rebuild and the rebuild is cancelled (not the original build)?
295# Do we create a copy of the orig database as backup, then start populating current db, and if cancelled, delete current db and RENAME backup table to current?
296# https://stackoverflow.com/questions/3280006/duplicating-a-mysql-table-indexes-and-data
297# BUT what if the table is HUGE? (Think of a collection with millions of docs.) Huge overhead in copying?
298# The alternative is we just quit on cancel, but then: cancel could leave the table in a partial committed state, with no way of rolling back.
299# Unless they do a full rebuild, which will recreate the table from scratch?
300# SOLUTION-> rollback transaction on error, see https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
301# But then should set AutoCommit to off on connection, and remember to commit every time
302
303#################
304# Database functions that use the perl DBI module (with the DBD driver module for mysql)
305#################
306
307# THE NEW DB FUNCTIONS
308# NOTE: FULLTEXT is a reserved keyword in (My)SQL. So we can't name a table or any of its columns "fulltext".
309# https://dev.mysql.com/doc/refman/5.5/en/keywords.html
310
311# TODO: Consider AutoCommit status (and Autocommit off allowing commit or rollback for GS coll build cancel) later
312
313sub connect_to_db {
314    my $self= shift (@_);
315   
316    my $db_driver = $self->{'db_driver'};
317    my $db_user = $self->{'db_client_user'} || "root";
318    my $db_pwd = $self->{'db_client_pwd'};
319    my $db_host = $self->{'db_host'} || "127.0.0.1";
320    my $db_enc = $self->{'db_encoding'} || "utf8";
321   
322    #my $db_name = $self->{'site_name'};
323   
324    # try connecting to the mysql db, if that fails it will die
325    # so don't bother preparing GreenstoneXMLPlugout by calling superclass' begin()
326
327    # localhost doesn't work for us, but 127.0.0.1 works
328    # https://metacpan.org/pod/DBD::mysql
329    # "The hostname, if not specified or specified as '' or 'localhost', will default to a MySQL server
330    # running on the local machine using the default for the UNIX socket. To connect to a MySQL server
331    # on the local machine via TCP, you must specify the loopback IP address (127.0.0.1) as the host."
332    #my $connect_str = "dbi:$db_driver:database=$db_name;host=$db_host";
333    my $connect_str = "dbi:$db_driver:host=$db_host"; # don't provide db, so we can check the db is there
334    my $dbh = DBI->connect("$connect_str", $db_user, $db_pwd,
335               {
336                   ShowErrorStatement => 1, # more informative as DBI will append failed SQL stmt to error message
337                   PrintError => 1, # on by default, but being explicit
338                   RaiseError => 0, # off by default, but being explicit
339                   AutoCommit => 1, # on by default, but being explicit
340               });
341
342    if(!$dbh) {
343    # NOTE, despite handle dbh being undefined, error code will be in DBI->err
344    return 0;   
345    }
346
347    # set encoding https://metacpan.org/pod/DBD::mysql
348    # https://dev.mysql.com/doc/refman/5.7/en/charset.html
349    # https://dev.mysql.com/doc/refman/5.7/en/charset-conversion.html
350    # Setting the encoding at db server level.
351    # Not sure if this command is mysql specific:
352    my $stmt = "set NAMES '" . $db_enc . "'";
353    $dbh->do($stmt) || warn("Unable to set charset encoding at db server level to: " . $db_enc . "\n");
354   
355    # if we're here, then connection succeeded, store handle
356    $self->{'db_handle'} = $dbh;
357    return 1;
358}
359
360sub load_db_and_tables {
361    my $self= shift (@_);
362    my $db_name = $self->{'site_name'}; # one database per GS3 site
363    my $dbh = $self->{'db_handle'};
364   
365    # perl DBI switch database: https://www.perlmonks.org/?node_id=995434
366    # do() returns undef on error.
367    # connection succeeded, try to load our database. If that didn't work, attempt to create db
368    my $success = $dbh->do("use $db_name");
369   
370    if(!$success && $dbh->err == 1049) { # "Unknown database" error has code 1049 (mysql only?) meaning db doesn't exist yet
371    # attempt to create the db and its tables
372    $self->create_db($db_name) || return 0;
373
374    # once more attempt to use db, now that it exists
375    $dbh->do("use $db_name") || return 0;
376    #$dbh->do("use localsite") or die "Error (code" . $dbh->err ."): " . $dbh->errstr . "\n";
377
378    # attempt to create tables in current db
379    $self->create_metadata_table() || return 0;
380    $self->create_fulltext_table() || return 0;
381
382    $success = 1;
383    }
384    elsif($success) { # database existed and loaded successfully, but
385    # before proceeding check that the current collection's tables exist
386   
387    # attempt to create tables in current db
388    if(!$self->table_exists($self->{'collection_name'} . "metadata")) {
389        $self->create_metadata_table() || return 0;
390    }
391    if(!$self->table_exists($self->{'collection_name'} .  "fulltxt")) {
392        $self->create_fulltext_table() || return 0;
393    }
394    }
395   
396    return $success; # could still return 0, if database failed to load with an error code != 1049
397}
398
399# disconnect from db - https://metacpan.org/pod/DBI#disconnect
400# TODO: make sure to have committed or rolled back before disconnect
401# and that you've call finish() on statement handles if any fetch remnants remain
402sub disconnect_from_db {
403    my $self= shift (@_);   
404    my $dbh = $self->{'db_handle'};
405
406    # make sure any active stmt handles are finished
407    # NO: "When all the data has been fetched from a SELECT statement, the driver will automatically call finish for you. So you should not call it explicitly except when you know that you've not fetched all the data from a statement handle and the handle won't be destroyed soon."
408   
409    #$meta_sth = $self->{'metadata_prepared_insert_statement_handle'};
410    #$txt_sth = $self->{'fulltxt_prepared_insert_statement_handle'};
411    #$meta_sth->finish() if($meta_sth);
412    #$txt_sth->finish() if($txt_sth);
413   
414    my $rc = $dbh->disconnect or warn $dbh->errstr; # The handle is of little use after disconnecting. Possibly PrintError already prints a warning and this duplicates it?
415    return $rc;
416}
417
418sub create_db {
419    my $self= shift (@_);
420    my $db_name = $self->{'site_name'};
421    my $dbh = $self->{'db_handle'};
422   
423    # https://stackoverflow.com/questions/5025768/how-can-i-create-a-mysql-database-from-a-perl-script
424    return $dbh->do("create database $db_name"); # do() will return undef on fail, https://metacpan.org/pod/DBI#do
425}
426
427sub create_metadata_table {
428    my $self= shift (@_);
429    my $dbh = $self->{'db_handle'};
430   
431    my $table_name = $self->{'collection_name'} . "metadata";
432
433    # If using an auto incremented primary key:
434    my $stmt = "CREATE TABLE $table_name (id INT NOT NULL AUTO_INCREMENT, did VARCHAR(63) NOT NULL, sid VARCHAR(63) NOT NULL, metaname VARCHAR(127) NOT NULL, metavalue VARCHAR(1023) NOT NULL, PRIMARY KEY(id));";
435    return $dbh->do($stmt);
436}
437
438# TODO: Investigate: https://dev.mysql.com/doc/search/?d=10&p=1&q=FULLTEXT
439# 12.9.1 Natural Language Full-Text Searches
440# to see whether we have to index the 'fulltxt' column of the 'fulltext' tables
441# or let user edit this file, or add it as another option
442sub create_fulltext_table {
443    my $self= shift (@_);
444    my $dbh = $self->{'db_handle'};
445   
446    my $table_name = $self->{'collection_name'} . "fulltxt";
447
448    # If using an auto incremented primary key:
449    my $stmt = "CREATE TABLE $table_name (id INT NOT NULL AUTO_INCREMENT, did VARCHAR(63) NOT NULL, sid VARCHAR(63) NOT NULL, fulltxt LONGTEXT, PRIMARY KEY(id));";
450    return $dbh->do($stmt);
451
452}
453
454
455# USEFUL: https://metacpan.org/pod/DBI
456# "Many methods have an optional \%attr parameter which can be used to pass information to the driver implementing the method. Except where specifically documented, the \%attr parameter can only be used to pass driver specific hints. In general, you can ignore \%attr parameters or pass it as undef."
457
458
459# https://www.guru99.com/insert-into.html
460# and https://dev.mysql.com/doc/refman/8.0/en/example-auto-increment.html
461#     for inserting multiple rows at once
462# https://www.perlmonks.org/bare/?node_id=316183
463# https://metacpan.org/pod/DBI#do
464# https://www.quora.com/What-is-the-difference-between-prepare-and-do-statements-in-Perl-while-we-make-a-connection-to-the-database-for-executing-the-query
465# https://docstore.mik.ua/orelly/linux/dbi/ch05_05.htm
466
467# https://metacpan.org/pod/DBI#performance
468# 'The q{...} style quoting used in this example avoids clashing with quotes that may be used in the SQL statement. Use the double-quote like qq{...} operator if you want to interpolate variables into the string. See "Quote and Quote-like Operators" in perlop for more details.'
469sub prepare_insert_metadata_row_stmthandle {
470    my $self = shift (@_);   
471    my ($did, $sid, $metaname, $metavalue) = @_;
472    my $dbh = $self->{'db_handle'};
473   
474    my $tablename = $self->{'colname'}."_metadata";
475
476    #my $stmt = "INSERT INTO $tablename (did, sid, metaname, metavalue) VALUES ('$did', '$sid', '$metaname', '$metavalue');"; # ?, ?, ?, ?
477
478    # using qq{} since we want $tablename placeholder to be filled in
479    # returns Statement Handle object!
480    my $sth = $dbh->prepare(qq{INSERT INTO $tablename (did, sid, metaname, metavalue) VALUES (?, ?, ?, ?)"}) || warn("Could not prepare insert statement for metadata table\n");
481   
482    return $sth;
483}
484
485sub prepare_insert_fulltxt_row_stmthandle {
486    my $self = shift (@_);
487    my ($did, $sid, $fulltext) = @_;
488    my $dbh = $self->{'db_handle'};
489   
490    my $tablename = $self->{'colname'}."_fulltxt";
491
492    #my $stmt = "INSERT INTO $tablename (did, sid, fulltxt) VALUES ('$did', '$sid', '$fulltext');"; ?, ?, ?
493
494    # using qq{} since we want $tablename placeholder to be filled in
495    # returns Statement Handle object!
496    my $sth = $dbh->prepare(qq{INSERT INTO $tablename (did, sid, fulltxt) VALUES (?, ?, ?)"}) || warn("Could not prepare insert statement for fulltxt table\n");
497   
498    return $sth;
499}
500
501# I can get my version of table_exists to work, but it's not so ideal
502# Interesting that MySQL has non-standard command to CREATE TABLE IF NOT EXISTS and DROP TABLE IF EXISTS,
503# see https://www.perlmonks.org/bare/?node=DBI%20Recipes
504#    The page further has a table_exists function that could work with proper comparison
505# Couldn't get the first solution at https://www.perlmonks.org/bare/?node_id=500050 to work though
506sub table_exists {
507    my ($dbh,$table_name) = @_;
508
509    my @table_list = $dbh->tables;
510    #my $tables_str = @table_list[0];
511    foreach my $table (@table_list) {
512    return 1 if ($table =~ m/$table_name/);
513    }
514    return 0;
515}
516
5171;
Note: See TracBrowser for help on using the browser.