########################################################################### # # GreenstoneXMLPlugout.pm -- the plugout module for Greenstone Archives # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 2006 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### package MySQLPlugout; use strict; no strict 'refs'; no strict 'subs'; use util; use FileUtils; use GreenstoneXMLPlugout; use docprint; use DBI; # the central package for this plugout use IPC::Open2; use POSIX ":sys_wait_h"; # for waitpid, http://perldoc.perl.org/functions/waitpid.html # TODO: SIGTERM rollback and disconnect? # this plugout does not output xml to a file, but outputs rows into a mysql table sub BEGIN { @MySQLPlugout::ISA = ('GreenstoneXMLPlugout'); } # TODO: deal with -removeold and everything? Or type out instructions for user # TODO Q: what is "group" in GreenstoneXMLPlugout? # TODO Q: site_name only exists for GS3. What about GS2? my $process_mode_list = [ { 'name' => "meta_only", 'desc' => "{MySQLPlugout.process_mode.meta_only}" }, { 'name' => "text_only", 'desc' => "{MySQLPlugout.process_mode.text_only}" }, { 'name' => "all", 'desc' => "{MySQLPlugout.process_mode.all}" } ]; my $arguments = [ { 'name' => "process_mode", 'desc' => "{MySQLPlugout.process_mode}", 'type' => "enum", 'list' => $process_mode_list, 'deft' => "all", 'reqd' => "no", 'hiddengli' => "no"} ]; my $options = { 'name' => "MySQLPlugout", 'desc' => "{MySQLPlugout.desc}", 'abstract' => "no", 'inherits' => "yes", 'args' => $arguments }; sub new { my ($class) = shift (@_); my ($plugoutlist, $inputargs,$hashArgOptLists) = @_; push(@$plugoutlist, $class); push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); push(@{$hashArgOptLists->{"OptList"}},$options); my $self = new GreenstoneXMLPlugout($plugoutlist,$inputargs,$hashArgOptLists); if ($self->{'info_only'}) { # don't worry about any options etc return bless $self, $class; } print STDERR "***** MySQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n"; return bless $self, $class; } # connect here and ensure all tables and databases exist sub begin { my $self= shift (@_); ########### TODO: these should be set from cmdline/GLI options to plugout ######### $self->{'db_driver'} = "mysql"; $self->{'site_name'} = "localsite"; $self->{'db_client_user'} = "root"; $self->{'db_client_pwd'} = "6reenstone3"; $self->{'build_mode'} = "removeold"; #$self->{'db_host'} = "127.0.0.1"; #$self->{'db_encoding'} = "utf8"; #TODO: proc_mode is also a saveas option ############ LOAD NECESSARY OPTIONS ########### print STDERR "########## COLLECTION: ". $ENV{'GSDLCOLLECTION'}."\n"; $self->{'collection_name'} = $ENV{'GSDLCOLLECTION'}; print STDERR "***** MySQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n"; if(!$self->connect_to_db()) { # This is fatal for the plugout, let's terminate here # PrintError would already have displayed the warning message on connection fail die("Could not connect to db. Can't proceed.\n"); } my $db_name = $self->{'site_name'} || "localsite"; # one database per GS3 site my $build_mode = $self->{'build_mode'} || "removeold"; if(!$self->load_db_and_tables($db_name, $build_mode)) { # This is fatal for the plugout, let's terminate here # PrintError would already have displayed the warning message on connection fail die("Could not use db or prepare its tables. Can't proceed.\n"); } # prepare the shared/common HANDLES to SQL insert statements that contain placeholders # and which we will reuse repeatedly when actually executing the insert statements my $proc_mode = $self->{'process_mode'}; if($proc_mode eq "all" || $proc_mode eq "meta_only" ) { $self->{'metadata_prepared_insert_statement_handle'} = $self->prepare_insert_metadata_row_stmthandle(); } if($proc_mode eq "all" || $proc_mode eq "text_only" ) { $self->{'fulltxt_prepared_insert_statement_handle'} = $self->prepare_insert_fulltxt_row_stmthandle(); } print STDERR "#### Meta stmt: " . $self->{'metadata_prepared_insert_statement_handle'}->{'Statement'} . "\n"; print STDERR "#### Full stmt: " . $self->{'fulltxt_prepared_insert_statement_handle'}->{'Statement'} . "\n"; # if setting up to work with sql db failed, we'd have terminated and wouldn't come up to here: # won't bother preparing GreenstoneXMLPlugout by calling superclass' begin() # finally, can call begin on super - important as doc.xml is opened as a group etc $self->GreenstoneXMLPlugout::begin(@_); } # disconnect from database here, see inexport.pm sub end { my $self = shift(@_); # do the superclass stuff first, as any sql db failures should not prevent superclass cleanup $self->GreenstoneXMLPlugout::end(@_); $self->disconnect_from_db() || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n"); # disconnect_from_db() will also issue a warning, but this may be clearer } # TODO: check arc-inf.db for whether each entry is to be deleted/indexed/reindexed/been indexed sub saveas { my $self = shift (@_); my ($doc_obj, $doc_dir) = @_; print STDERR "\n\n@@@ In saveas\n\n"; my $proc_mode = $self->{'process_mode'}; # 1. pre save out and saving debug handle # must call superclass (pre/post) saveas methods, as they handle assoc_files too my ($docxml_outhandler, $output_file) = $self->GreenstoneXMLPlugout::pre_saveas(@_); $self->{'debug_outhandle'} = $docxml_outhandler if ($self->{'debug'}); # STDOUT if debug # TODO: also set debugging in begin()? Then stmts creating db and tables also sent to debug out and not executed # TODO: remove unused old_unused_saveas from GreenstoneXMLPlugout # 2. overriding saving behaviour to do what the superclass does PLUS saving to sql db #NOTE: if proc_mode == all, then "breadcrumbs" go into both meta and txt elements of doc.xml: # statements pointing viewer to the sql db for contents # write the INVERSE into doc.xml as to what is written to the db my $docxml_output_options = { 'output' => docprint::OUTPUT_NONE }; if($proc_mode eq "meta_only" ) { # since only meta to go into MySQL db, text will go into docxml $docxml_output_options->{'output'} = docprint::OUTPUT_TEXT_ONLY; } elsif($proc_mode eq "text_only" ) { # since only full text to go into MySQL db, meta will go into docxml $docxml_output_options->{'output'} = docprint::OUTPUT_META_ONLY; } # now we've prepared to write out whatever is meant to go into docxml # and can do actual the steps superclass GreenstoneXMLPlugout carries out to write out docxml # So: write out the doc xml file for the current document my $section_text = &docprint::get_section_xml($doc_obj, $docxml_output_options); print $docxml_outhandler $section_text; # We also write out whatever needs to go into the MySQL database $self->write_meta_and_text($doc_obj); # 3. post save out #$self->GreenstoneXMLPlugout::post_saveas(@_); $self->GreenstoneXMLPlugout::post_saveas($doc_obj, $doc_dir, $docxml_outhandler, $output_file); # database connection is closed in end() method # so we don't open and close over and over for each doc during a single build } # write meta and/or text PER DOC out to DB sub write_meta_and_text { my $self = shift (@_); my ($doc_obj) = @_; my $root_section = $doc_obj->get_top_section(); my $doc_oid = $doc_obj->get_OID(); # we're processing a single doc at a time, so single OID # load the prepared INSERT statement handles for both tables (can be undef for any table depending on whether meta_only or txt_only are set) my $metadata_table_sth = $self->{'metadata_prepared_insert_statement_handle'}; my $fulltxt_table_sth = $self->{'fulltxt_prepared_insert_statement_handle'}; $self->recursive_write_meta_and_text($doc_obj, $doc_oid, $root_section, $metadata_table_sth, $fulltxt_table_sth); } # Perl: Reading or Writing to Another Program # https://nnc3.com/mags/Perl3/cookbook/ch16_05.htm sub recursive_write_meta_and_text { my $self = shift (@_); my ($doc_obj, $doc_oid, $section, $metadata_table_sth, $fulltxt_table_sth) = @_; # If section=ROOT, write "root" as section name into table # doc->get_top_section() is the name of the doc root section, which is "" my $section_name = ($section eq "") ? "root" : $section; my $section_ptr = $doc_obj->_lookup_section ($section); return "" unless defined $section_ptr; my $debug_out = $self->{'debug_outhandle'}; print STDERR "#### Meta stmt: " . $metadata_table_sth->{'Statement'} . "\n"; print STDERR "#### Full stmt: " . $fulltxt_table_sth->{'Statement'} . "\n"; #my $proc_mode = $self->{'process_mode'}; #if($proc_mode eq "all" || $proc_mode eq "meta_only" ) { if($metadata_table_sth) { # meta insert statement handle will be undef if not writing meta foreach my $data (@{$section_ptr->{'metadata'}}) { my $meta_name = $data->[0]; my $escaped_meta_value = &docprint::escape_text($data->[1]); # Write out the current section's meta to collection db's METADATA table # for each set of values to write to meta table, execute the prepared statement, filling in the values if($self->{'debug'}) { # just print the statement we were going to execute print $debug_out $metadata_table_sth->{'Statement'} . "($doc_oid, $section_name, $meta_name, $escaped_meta_value)\n"; } else { $metadata_table_sth->execute($doc_oid, $section_name, $meta_name, $escaped_meta_value) || warn ("Unable to write metadata row to db:\n\tOID $doc_oid, section $section_name,\n\tmeta name: $meta_name, val: $escaped_meta_value"); # Execution failure will print out info anyway: since db connection sets PrintError } } } #if($proc_mode eq "all" || $proc_mode eq "text_only" ) { if($fulltxt_table_sth) { # fulltxt insert statement handle will be undef if not writing fulltxt if($self->{'debug'}) { # just print the statement we were going to execute, minus the fulltxt value my $txt_repr = $section_ptr->{'text'} ? "" : "NULL"; print $debug_out $fulltxt_table_sth->{'Statement'} . "($doc_oid, $section_name, $txt_repr)\n"; } else { my $section_text = &docprint::escape_text($section_ptr->{'text'}); # fulltxt column can be SQL NULL. undef value gets written out as NULL: # https://stackoverflow.com/questions/12708633/which-one-represents-null-undef-or-empty-string # Write out the current section's text to collection db's FULLTeXT table $fulltxt_table_sth->execute($doc_oid, $section_name, $section_text) || warn ("Unable to write fulltxt row to db for row:\n\tOID $doc_oid, section $section_name"); # Execution failure will print out info anyway: since db connection sets PrintError } } # output all subsections: RECURSIVE CALL foreach my $subsection (@{$section_ptr->{'subsection_order'}}) { &recursive_write_meta_and_text($doc_obj, $doc_oid, "$section.$subsection", $metadata_table_sth, $fulltxt_table_sth); } } ################################# # Database access related functions # http://g2pc1.bu.edu/~qzpeng/manual/MySQL%20Commands.htm # https://www.guru99.com/insert-into.html # TODO Q: What on cancelling a build: delete table? But what if it was a rebuild and the rebuild is cancelled (not the original build)? # Do we create a copy of the orig database as backup, then start populating current db, and if cancelled, delete current db and RENAME backup table to current? # https://stackoverflow.com/questions/3280006/duplicating-a-mysql-table-indexes-and-data # BUT what if the table is HUGE? (Think of a collection with millions of docs.) Huge overhead in copying? # The alternative is we just quit on cancel, but then: cancel could leave the table in a partial committed state, with no way of rolling back. # Unless they do a full rebuild, which will recreate the table from scratch? # SOLUTION-> rollback transaction on error, see https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/ # But then should set AutoCommit to off on connection, and remember to commit every time ################# # Database functions that use the perl DBI module (with the DBD driver module for mysql) ################# # THE NEW DB FUNCTIONS # NOTE: FULLTEXT is a reserved keyword in (My)SQL. So we can't name a table or any of its columns "fulltext". # https://dev.mysql.com/doc/refman/5.5/en/keywords.html # TODO: Consider AutoCommit status (and Autocommit off allowing commit or rollback for GS coll build cancel) later sub connect_to_db { my $self= shift (@_); my $db_driver = $self->{'db_driver'}; my $db_user = $self->{'db_client_user'} || "root"; my $db_pwd = $self->{'db_client_pwd'}; my $db_host = $self->{'db_host'} || "127.0.0.1"; my $db_enc = $self->{'db_encoding'} || "utf8"; #my $db_name = $self->{'site_name'}; # try connecting to the mysql db, if that fails it will die # so don't bother preparing GreenstoneXMLPlugout by calling superclass' begin() # localhost doesn't work for us, but 127.0.0.1 works # https://metacpan.org/pod/DBD::mysql # "The hostname, if not specified or specified as '' or 'localhost', will default to a MySQL server # running on the local machine using the default for the UNIX socket. To connect to a MySQL server # on the local machine via TCP, you must specify the loopback IP address (127.0.0.1) as the host." #my $connect_str = "dbi:$db_driver:database=$db_name;host=$db_host"; my $connect_str = "dbi:$db_driver:host=$db_host"; # don't provide db, so we can check the db is there my $dbh = DBI->connect("$connect_str", $db_user, $db_pwd, { ShowErrorStatement => 1, # more informative as DBI will append failed SQL stmt to error message PrintError => 1, # on by default, but being explicit RaiseError => 0, # off by default, but being explicit AutoCommit => 1 # on by default, but being explicit }); if(!$dbh) { # NOTE, despite handle dbh being undefined, error code will be in DBI->err return 0; } # set encoding https://metacpan.org/pod/DBD::mysql # https://dev.mysql.com/doc/refman/5.7/en/charset.html # https://dev.mysql.com/doc/refman/5.7/en/charset-conversion.html # Setting the encoding at db server level. # Not sure if this command is mysql specific: my $stmt = "set NAMES '" . $db_enc . "'"; $dbh->do($stmt) || warn("Unable to set charset encoding at db server level to: " . $db_enc . "\n"); # if we're here, then connection succeeded, store handle $self->{'db_handle'} = $dbh; return 1; } sub load_db_and_tables { my $self= shift (@_); my ($db_name, $build_mode) = @_; my $dbh = $self->{'db_handle'}; # perl DBI switch database: https://www.perlmonks.org/?node_id=995434 # do() returns undef on error. # connection succeeded, try to load our database. If that didn't work, attempt to create db my $success = $dbh->do("use $db_name"); if(!$success && $dbh->err == 1049) { # "Unknown database" error has code 1049 (mysql only?) meaning db doesn't exist yet # attempt to create the db and its tables $self->create_db($db_name) || return 0; print STDERR "@@@ CREATED DATABASE $db_name\n"; # once more attempt to use db, now that it exists $dbh->do("use $db_name") || return 0; #$dbh->do("use localsite") or die "Error (code" . $dbh->err ."): " . $dbh->errstr . "\n"; # attempt to create tables in current db $self->create_metadata_table() || return 0; $self->create_fulltext_table() || return 0; $success = 1; } elsif($success) { # database existed and loaded successfully, but # before proceeding check that the current collection's tables exist print STDERR "@@@ DATABASE $db_name EXISTED\n"; if($build_mode eq "removeold") { $self->delete_collection_tables(); } # use existing tables if any # attempt to create tables in current db if($build_mode eq "removeold" || !$self->table_exists($self->get_metadata_table_name())) { $self->create_metadata_table() || return 0; } else { print STDERR "@@@ Meta table exists\n"; } if($build_mode eq "removeold" || !$self->table_exists($self->get_fulltext_table_name())) { $self->create_fulltext_table() || return 0; } else { print STDERR "@@@ Fulltxt table exists\n"; } } return $success; # could still return 0, if database failed to load with an error code != 1049 } # disconnect from db - https://metacpan.org/pod/DBI#disconnect # TODO: make sure to have committed or rolled back before disconnect # and that you've call finish() on statement handles if any fetch remnants remain sub disconnect_from_db { my $self= shift (@_); my $dbh = $self->{'db_handle'}; # make sure any active stmt handles are finished # NO: "When all the data has been fetched from a SELECT statement, the driver will automatically call finish for you. So you should not call it explicitly except when you know that you've not fetched all the data from a statement handle and the handle won't be destroyed soon." #$meta_sth = $self->{'metadata_prepared_insert_statement_handle'}; #$txt_sth = $self->{'fulltxt_prepared_insert_statement_handle'}; #$meta_sth->finish() if($meta_sth); #$txt_sth->finish() if($txt_sth); my $rc = $dbh->disconnect or warn $dbh->errstr; # The handle is of little use after disconnecting. Possibly PrintError already prints a warning and this duplicates it? return $rc; } sub create_db { my $self= shift (@_); my $db_name = $self->{'site_name'}; my $dbh = $self->{'db_handle'}; # https://stackoverflow.com/questions/5025768/how-can-i-create-a-mysql-database-from-a-perl-script return $dbh->do("create database $db_name"); # do() will return undef on fail, https://metacpan.org/pod/DBI#do } sub create_metadata_table { my $self= shift (@_); my $dbh = $self->{'db_handle'}; my $table_name = $self->get_metadata_table_name(); # If using an auto incremented primary key: my $stmt = "CREATE TABLE $table_name (id INT NOT NULL AUTO_INCREMENT, did VARCHAR(63) NOT NULL, sid VARCHAR(63) NOT NULL, metaname VARCHAR(127) NOT NULL, metavalue VARCHAR(1023) NOT NULL, PRIMARY KEY(id));"; return $dbh->do($stmt); } # TODO: Investigate: https://dev.mysql.com/doc/search/?d=10&p=1&q=FULLTEXT # 12.9.1 Natural Language Full-Text Searches # to see whether we have to index the 'fulltxt' column of the 'fulltext' tables # or let user edit this file, or add it as another option sub create_fulltext_table { my $self= shift (@_); my $dbh = $self->{'db_handle'}; my $table_name = $self->get_fulltext_table_name(); # If using an auto incremented primary key: my $stmt = "CREATE TABLE $table_name (id INT NOT NULL AUTO_INCREMENT, did VARCHAR(63) NOT NULL, sid VARCHAR(63) NOT NULL, fulltxt LONGTEXT, PRIMARY KEY(id));"; return $dbh->do($stmt); } # USEFUL: https://metacpan.org/pod/DBI # "Many methods have an optional \%attr parameter which can be used to pass information to the driver implementing the method. Except where specifically documented, the \%attr parameter can only be used to pass driver specific hints. In general, you can ignore \%attr parameters or pass it as undef." # https://www.guru99.com/insert-into.html # and https://dev.mysql.com/doc/refman/8.0/en/example-auto-increment.html # for inserting multiple rows at once # https://www.perlmonks.org/bare/?node_id=316183 # https://metacpan.org/pod/DBI#do # https://www.quora.com/What-is-the-difference-between-prepare-and-do-statements-in-Perl-while-we-make-a-connection-to-the-database-for-executing-the-query # https://docstore.mik.ua/orelly/linux/dbi/ch05_05.htm # https://metacpan.org/pod/DBI#performance # 'The q{...} style quoting used in this example avoids clashing with quotes that may be used in the SQL statement. Use the double-quote like qq{...} operator if you want to interpolate variables into the string. See "Quote and Quote-like Operators" in perlop for more details.' sub prepare_insert_metadata_row_stmthandle { my $self = shift (@_); #my ($did, $sid, $metaname, $metavalue) = @_; my $dbh = $self->{'db_handle'}; my $tablename = $self->get_metadata_table_name(); #my $stmt = "INSERT INTO $tablename (did, sid, metaname, metavalue) VALUES ('$did', '$sid', '$metaname', '$metavalue');"; # ?, ?, ?, ? # using qq{} since we want $tablename placeholder to be filled in # returns Statement Handle object! my $sth = $dbh->prepare(qq{INSERT INTO $tablename (did, sid, metaname, metavalue) VALUES (?, ?, ?, ?)}) || warn("Could not prepare insert statement for metadata table\n"); print STDERR "@@@@ Prepared meta insert statement: ".$sth->{'Statement'}."\n"; return $sth; } sub prepare_insert_fulltxt_row_stmthandle { my $self = shift (@_); #my ($did, $sid, $fulltext) = @_; my $dbh = $self->{'db_handle'}; my $tablename = $self->get_fulltext_table_name(); #my $stmt = "INSERT INTO $tablename (did, sid, fulltxt) VALUES ('$did', '$sid', '$fulltext');"; ?, ?, ? # using qq{} since we want $tablename placeholder to be filled in # returns Statement Handle object! my $sth = $dbh->prepare(qq{INSERT INTO $tablename (did, sid, fulltxt) VALUES (?, ?, ?)}) || warn("Could not prepare insert statement for fulltxt table\n"); print STDERR "@@@@ Prepared fulltext insert statement: ".$sth->{'Statement'}."\n"; return $sth; } # "IF EXISTS is used to prevent an error from occurring if the database does not exist. ... DROP DATABASE returns the number of tables that were removed. The DROP DATABASE statement removes from the given database directory those files and directories that MySQL itself may create during normal operation.Jun 20, 2012" #MySQL 8.0 Reference Manual :: 13.1.22 DROP DATABASE Syntax # https://dev.mysql.com/doc/en/drop-database.html sub delete_collection_tables { my $self= shift (@_); my $dbh = $self->{'db_handle'}; print STDERR "### Build mode is removeold, so deleting tables for current collection\n"; # drop table my $table = $self->get_metadata_table_name(); $dbh->do("drop table $table") || warn("@@@ Couldn't delete $table"); $table = $self->get_fulltext_table_name(); $dbh->do("drop table $table") || warn("@@@ Couldn't delete $table"); } # Don't call this: it will delete the meta and full text tables for ALL collections in $db_name (localsite by default)! # this is just for debugging sub _delete_database { my $self= shift (@_); my ($db_name) = @_; my $dbh = $self->{'db_handle'}; # "drop database dbname" $dbh->do("drop database $db_name") || return 0; return 1; } # More basic helper methods sub get_metadata_table_name { my $self= shift (@_); my $table_name = $self->{'collection_name'} . "_metadata"; return $table_name; } # FULLTEXT is a reserved keyword in (My)SQL. https://dev.mysql.com/doc/refman/5.5/en/keywords.html # So we can't name a table or any of its columns "fulltext". We use "fulltxt" instead. sub get_fulltext_table_name { my $self= shift (@_); my $table_name = $self->{'collection_name'} . "_fulltxt"; return $table_name; } # I can get my version of table_exists to work, but it's not so ideal # Interesting that MySQL has non-standard command to CREATE TABLE IF NOT EXISTS and DROP TABLE IF EXISTS, # see https://www.perlmonks.org/bare/?node=DBI%20Recipes # The page further has a table_exists function that could work with proper comparison # Couldn't get the first solution at https://www.perlmonks.org/bare/?node_id=500050 to work though sub table_exists { my $self = shift (@_); my $dbh = $self->{'db_handle'}; my ($table_name) = @_; my @table_list = $dbh->tables; #my $tables_str = @table_list[0]; foreach my $table (@table_list) { return 1 if ($table =~ m/$table_name/); } return 0; } 1;