source: main/trunk/greenstone2/perllib/plugouts/MySQLPlugout.pm@ 32518

Last change on this file since 32518 was 32518, checked in by ak19, 6 years ago

Untested first attempt at MySQLPlugout that is meant to write metadata and/or fulltext to a mysql db rather than to doc.xml. We're switching over to using perl mysql libs, dbd (mysql) for drivers and dbi (mysql), for db access. However, I still want to commit this initial attempt at manually doing database operations in perl.

File size: 15.3 KB
RevLine 
[32518]1###########################################################################
2#
3# GreenstoneXMLPlugout.pm -- the plugout module for Greenstone Archives
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package MySQLPlugout;
27
28use strict;
29no strict 'refs';
30no strict 'subs';
31
32use util;
33use FileUtils;
34use BasePlugout;
35use docprint;
36
37use IPC::Open2;
38use POSIX ":sys_wait_h"; # for waitpid, http://perldoc.perl.org/functions/waitpid.html
39
40# this plugout does not output xml to a file, but outputs rows into a mysql table
41sub BEGIN {
42 @MySQLPlugout::ISA = ('GreenstoneXMLPlugout');
43}
44
45# TODO Q: what is "group" in GreenstoneXMLPlugout?
46
47my $process_mode_list =
48 [ { 'name' => "meta_only",
49 'desc' => "{MySQLPlugout.process_mode.meta_only}" },
50 { 'name' => "text_only",
51 'desc' => "{MySQLPlugout.process_mode.text_only}" },
52 { 'name' => "all",
53 'desc' => "{MySQLPlugout.process_mode.all}" } ];
54
55my $arguments = [
56 { 'name' => "process_mode",
57 'desc' => "{MySQLPlugout.process_mode}",
58 'type' => "enum",
59 'list' => $process_mode_list,
60 'deft' => "all",
61 'reqd' => "no",
62 'hiddengli' => "no"} ];
63
64my $options = { 'name' => "MySQLPlugout",
65 'desc' => "{MySQLPlugout.desc}",
66 'abstract' => "no",
67 'inherits' => "yes",
68 'args' => $arguments };
69
70sub new {
71 my ($class) = shift (@_);
72 my ($plugoutlist, $inputargs,$hashArgOptLists) = @_;
73 push(@$plugoutlist, $class);
74
75 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
76 push(@{$hashArgOptLists->{"OptList"}},$options);
77
78 my $self = new GreenstoneXMLPlugout($plugoutlist,$inputargs,$hashArgOptLists);
79
80 if ($self->{'info_only'}) {
81 # don't worry about any options etc
82 return bless $self, $class;
83 }
84 print STDERR "***** MySQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n";
85
86 return bless $self, $class;
87}
88
89# TODO: check arc-inf.db for whether each entry is to be deleted/indexed/reindexed/been indexed
90sub saveas {
91 my $self = shift (@_);
92 my ($doc_obj, $doc_dir) = @_;
93
94 # pre
95 my ($docxml_outhandler, $output_file) = $self->GreenstoneXMLPlugout::pre_saveas(@_);
96
97
98 print STDERR "########## COLLECTION: ". $ENV{'GSDLCOLLECTION'}."\n";
99 $self->{'collection_name'} = $ENV{'GSDLCOLLECTION'};
100 # set up DB and table or connect to DB and access the table here?
101
102 my $proc_mode = $self->{'process_mode'};
103 my $docxml_output_options = { 'output' => docprint::OUTPUT_NONE };
104 if($proc_mode eq "meta_only" ) { # since only meta to go into MySQL db, text will go into docxml
105 $docxml_output_options->{'output'} = docprint::OUTPUT_TEXT_ONLY;
106 } elsif($proc_mode eq "text_only" ) { # since only full text to go into MySQL db, meta will go into docxml
107 $docxml_output_options->{'output'} = docprint::OUTPUT_META_ONLY;
108 }
109
110 # now we've prepared to write out whatever is meant to go into docxml
111 # and can do actual the steps superclass GreenstoneXMLPlugout carries out to write out docxml
112 # So: write out the doc xml file for the current document
113 my $section_text = &docprint::get_section_xml($doc_obj, $docxml_output_options);
114 print $docxml_outhandler $section_text;
115
116 # We also write out whatever needs to go into the MySQL database
117 $self->write_meta_and_text($doc_obj);
118
119 # post
120 $self->GreenstoneXMLPlugout::post_saveas(@_);
121
122 # TODO: close database connection here? Or do something like with groups
123 # so we don't open and close over and over during a single build?
124}
125
126
127# write meta and/or text out to DB
128sub write_meta_and_text {
129 my $self = shift (@_);
130 my ($doc_obj) = @_;
131 my $root_section = $doc_obj->get_top_section();
132
133 # Do we don't want to open and close a connection per doc?
134 # Would we not rather want to open and close per collection rebuild?
135
136 #$self->create_db_connection();
137
138 $self->recursive_write_meta_and_text($doc_obj, $root_section);
139
140 #$self->close_db_connection();
141}
142# Perl: Reading or Writing to Another Program
143# https://nnc3.com/mags/Perl3/cookbook/ch16_05.htm
144sub recursive_write_meta_and_text {
145 my ($doc_obj, $section) = @_;
146
147 # TODO
148 my $db_outhandler = undef;
149 binmode($db_outhandler,":utf8");
150 # TODO if $self->debug is on
151
152 my $section_ptr = $doc_obj->_lookup_section ($section);
153 return "" unless defined $section_ptr;
154
155 my $proc_mode = $self->{'process_mode'};
156
157 if($proc_mode eq "all" || $proc_mode eq "meta_only" ) {
158 foreach my $data (@{$section_ptr->{'metadata'}}) {
159 my $escaped_value = &escape_text($data->[1]);
160 my $metaval = $data->[0];
161 # TODO: write out current section's text to collection db's META table
162 }
163 }
164 if($proc_mode eq "all" || $proc_mode eq "text_only" ) {
165 my $section_text = &escape_text($section_ptr->{'text'});
166 # TODO: write out current section's text to collection db's TEXT table
167
168 }
169
170 # output all subsections: RECURSIVE CALL
171 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
172 &recursive_write_meta_and_text($doc_obj, "$section.$subsection");
173 }
174}
175
176#################################
177
178# Database access related functions
179# http://g2pc1.bu.edu/~qzpeng/manual/MySQL%20Commands.htm
180# https://www.guru99.com/insert-into.html
181
182# TODO Q: What on cancelling a build: delete table? But what if it was a rebuild and the rebuild is cancelled (not the original build)?
183# Do we create a copy of the orig database as backup, then start populating current db, and if cancelled, delete current db and RENAME backup table to current?
184# https://stackoverflow.com/questions/3280006/duplicating-a-mysql-table-indexes-and-data
185# BUT what if the table is HUGE? (Think of a collection with millions of docs.) Huge overhead in copying?
186# The alternative is we just quit on cancel, but then: cancel could leave the table in a partial committed state, with no way of rolling back.
187# Unless they do a full rebuild, which will recreate the tab;e from scratch?
188
189
190# I'm using perl's open2 like in Z3950Download, as opposed to open3 like in WgetDownload
191# since I'm assuming each insert statement is atomic: it either does the insertion or fails
192# and we (may) get some message back. That means we should hopefully be able to terminate
193# as well if we get SIGTERM/SIGKILL: we're not blocking, but are responsive after every INSERT stmt.
194
195# Just runs the command without displaying it
196sub run_command_basic
197{
198 my ($self,$strCMD) = @_;
199
200
201 my $process_instr = $self->{'MYSQL_IN'};
202
203 print $process_instr "$strCMD\n";
204}
205
206sub run_command
207{
208 my ($self,$strCMD) = @_;
209
210 print STDERR "Running mysql command: $strCMD\n";
211 $self->run_command_basic($strCMD);
212
213}
214
215sub response_line_contains {
216 my ($self,$expected) = @_;
217
218 my $opening_line = <$out>;
219 if ($opening_line =~ m/$expected/i) {
220 return 1;
221 }
222 return 0;
223}
224
225sub response_lines_contain {
226 my ($self,$expected) = @_;
227
228 my $found_expected = 0;
229 while (my $line = <$out>) {
230 print STDERR "$line\n";
231 if($line =~ m/$expected/){
232 #return 1;
233 $found_expected = 1; #won't break out of loop: loop will consume all on child out at present
234 }
235 }
236 return $found_expected;
237}
238
239# based on Z3950Download.pm::start_yaz()
240# ./PATH/TO/mysql -u root -p
241# Returns:
242# - 1 if client already quit after pwd fail (so can't send quit message to stopped client)
243# - 0 if load_db failed (needs to be quit),
244# - 1 if load_db succeeded (mysql client still running)
245sub create_db_connection {
246 my $self = shift (@_);
247 print STDERR "Opening connection to MySQL db\n";
248
249 my $mysql_client = $self->{'client_path'};
250 my $client_user = $self->{'client_user'} || "root";
251
252
253 my $launch_cmd = "\"./$mysql_client\" -u $clientuser -p";
254 my $childpid = open2(*MYSQL_OUT, *MYSQL_IN, $launch_cmd)
255 or (print STDERR "Done\n" and die "can't open2 pipe to mysql client: $!");
256
257 $self->{'pid'} = $childpid;
258 $self->{'MYSQL_OUT'} = *MYSQL_OUT;
259 $self->{'MYSQL_IN'} = *MYSQL_IN;
260
261 # connect with pwd and load the database for this site
262 #my $conn_open = $self->open_connection();
263 #
264 #if (!$conn_open) {
265 # print STDERR "Cannot connect to mysql db with $launch_cmd\n";
266 # print STDERR "Done\n";
267 # return 0;
268 #}
269
270 # connect with pwd
271 my $conn_success = $self->send_pwd();
272 if(!$conn_success) {
273 return -1; # if pwd failed, then the program already exited by itself
274 # (so don't send quit command after process terminated)
275 }
276 else {
277 return $self->load_db();
278 }
279
280 return $conn_open; # 1 if client already quit after pwd fail, 0 if load_db failed (needs to be quit), 1 if load_db succeeded (mysql client still running)
281}
282
283
284# Copied from Z3950Download.pm::quit_yaz()
285sub close_db_connection {
286 my $self = shift (@_);
287
288 $self->run_command("quit");
289 close($self->{'MYSQL_IN'}); # close the input to yaz. It also flushes quit command to mysql client
290
291 # make sure nothing is being output by mysql client
292 # flush the mysql-client process' outputstream, else we'll be stuck in an infinite
293 # loop waiting for the process to quit.
294 my $output = $self->{'MYSQL_OUT'};
295 my $line;
296 while (defined ($line = <$output>)) {
297 if($line !~ m/\w/s) { # print anything other than plain whitespace in case it is important
298 print STDERR "***### $line";
299 }
300 }
301
302 close($self->{'MYSQL_OUT'});
303
304 # Is the following necessary? The PerlDoc on open2 (http://perldoc.perl.org/IPC/Open2.html)
305 # says that waitpid must be called to "reap the child process", or otherwise it will hang
306 # around like a zombie process in the background. Adding it here makes the code work as
307 # before, but it is certainly necessary to call waitpid on wget (see WgetDownload.pm).
308 # http://perldoc.perl.org/functions/waitpid.html
309 my $kidpid;
310 do {
311 $kidpid = waitpid($self->{'pid'}, WNOHANG);
312 } while $kidpid > 0; # waiting for pid to become -1
313}
314
315# should be called only once per site
316sub create_database {
317 my $self = shift (@_);
318 #my $sitename = shift(@_);
319 my $sitename = $self->{'site_name'};
320 my $cmd = "CREATE DATABASE $sitename;";
321 $self->run_command($cmd);
322}
323
324
325sub send_pwd {
326 my $self = shift (@_);
327
328 my $client_pwd = $self->{'client_pwd'};
329 my $out = $self->{'MYSQL_OUT'};
330
331 # if connected, it's prompting for pwd. Write the pwd to the mysql client process:
332 $self->run_command_basic($client_pwd);
333
334 #my $opening_line = <$out>;
335 #if ($opening_line =~ m/Access denied/i) {
336 #print STDERR "Password not recognised. Got: $opening_line\n";
337 #return 0;
338 #}
339
340 if($self->response_line_contains("Access denied")) {
341 print STDERR "Password not recognised. Got: $opening_line\n";
342 return 0;
343 }
344
345 return 1;
346}
347
348
349sub load_db {
350 my $self = shift (@_);
351
352 # attempt to load the db
353 # use the database
354 my $db_name = $self->{'site_name'}; # TODO Q: site_name only exists for GS3. What about GS2?
355 $self->run_command("use " . $db_name . ";");
356
357 my $db_found = 0;
358 while (my $line = <$out>) {
359 print STDERR "$line\n";
360 if($line =~ m/Database changed/){
361 # return 1; # TODO Q: consume all output of running command
362 $db_found = 1;
363 }
364 elsif($line =~ m/Unknown database/){
365 $db_found = -1;
366 }
367 }
368
369 if($db_found == 1) {
370 return $db_found;
371 }
372 elsif ($db_found == -1) { # a db for the current sitename didn't exist, create it
373 $self->create_database();
374
375 # attempt to load the newly created db
376 if($self->_load_db()) {
377 return 1;
378 }
379 #my $opening_line = <$out>;
380 #if ($opening_line !~ m/Query OK/) {
381 if(!$self->response_line_contains("Query OK")) {
382 print STDERR "Could not create db\n";
383 return 0; # couldn't even create the db
384 } else { # success creating db
385 # so let's create the metadata and fulltxt tables for the current coll while we're at it
386 if($self->create_meta_table()) {
387 return $self->create_fulltxt_table();
388 }
389 }
390 }
391 else { # unknown error trying to load db, bail
392 return 0;
393 }
394}
395
396
397sub create_meta_table {
398 my $self = shift (@_);
399 my $table_name = $self->{'collection_name'} . "metadata";
400 #my $cmd = "CREATE TABLE $table_name (id VARCHAR(255) NOT NULL UNIQUE, did VARCHAR(63) NOT NULL, sid VARCHAR(63) NOT NULL, metaname VARCHAR(127) NOT NULL, metavalue VARCHAR(1023) NOT NULL, PRIMARY KEY(id));";
401
402 # If using an auto incremented primary key:
403 my $cmd = "CREATE TABLE $table_name (id INT NOT NULL AUTO_INCREMENT, did VARCHAR(63) NOT NULL, sid VARCHAR(63) NOT NULL, metaname VARCHAR(127) NOT NULL, metavalue VARCHAR(1023) NOT NULL, PRIMARY KEY(id));";
404
405 $self->run_command($cmd);
406
407 if(!$self->response_lines_contain("Query OK")) {
408 print STDERR "Could not create metadata table\n";
409 return 0;
410 } else {
411 return 1;
412 }
413}
414
415sub create_fulltxt_table {
416 my $self = shift (@_);
417 my $table_name = $self->{'collection_name'} . "fulltxt";
418 #my $cmd = "CREATE TABLE $table_name (id VARCHAR(255) NOT NULL UNIQUE, did VARCHAR(63) NOT NULL, sid VARCHAR(63) NOT NULL, fulltxt LONGTEXT, PRIMARY KEY(id));";
419
420 # If using an auto incremented primary key:
421 my $cmd = "CREATE TABLE $table_name (id INT NOT NULL AUTO_INCREMENT, did VARCHAR(63) NOT NULL, sid VARCHAR(63) NOT NULL, fulltxt LONGTEXT, PRIMARY KEY(id));";
422
423 $self->run_command($cmd);
424
425 if(!$self->response_lines_contain("Query OK")) {
426 print STDERR "Could not create table\n";
427 return 0;
428 } else {
429 return 1;
430 }
431}
432
433# https://www.guru99.com/insert-into.html
434# and https://dev.mysql.com/doc/refman/8.0/en/example-auto-increment.html
435# for inserting multiple rows at once
436sub get_cmd_insert_new_row_into_meta_table {
437 my $self = shift (@_);
438 my ($did, $sid, $metaname, $metavalue) = @_;
439 my $tablename = $self->{'colname'}_"metadata";
440
441 my $cmd = "INSERT INTO $tablename (did, sid, metaname, metavalue) VALUES\n";
442 $cmd = "('$did', '$sid', '$metaname', '$metavalue');\n";
443 return $cmd;
444}
445
446sub get_cmd_insert_new_row_into_txt_table {
447 my $self = shift (@_);
448 my ($did, $sid, $fulltext) = @_;
449 my $tablename = $self->{'colname'}_"fulltxt";
450
451 my $cmd = "INSERT INTO $tablename (did, sid, fulltxt) VALUES\n";
452 $cmd = "('$did', '$sid', '$fulltext');\n";
453 return $cmd;
454}
455
456# TODO: later add edit and delete (and nothing for "been indexed" status?)
457
458##################
459
460## UNUSED
461sub X_open_connection()
462{
463 # connect with pwd
464 my $conn_success = $self->send_pwd();
465 if(!$conn_success) {
466 return -1; # if pwd failed, then the program already exists by itself (so don't send quit command after process terminated)
467 }
468
469 if($conn_success) {
470 return $self->load_db();
471 } else {
472 return $conn_sucess;
473 }
474}
475
476#################
477
4781;
Note: See TracBrowser for help on using the repository browser.