source: main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm@ 32592

Last change on this file since 32592 was 32592, checked in by ak19, 5 years ago

Renamed gssql.pm to gsmysql.pm. Not subclassing the old gssql into gsmysql yet, as there's the complex issue of sighandlers, the static singleton method _get_connection_instance(), the singleton variable _db_instance and its use in the sighandlers and DESTROY, and how all of this can be impacted when making them part of an inheritance chain. Not sure of the best way to structure inheritance around these things. Even if rollback_on_cancel ends up unnecessary, the singleton method _get_connection_instance and singleton object _db_instance still impact decisions around inheritance.

File size: 14.9 KB
RevLine 
[32518]1###########################################################################
2#
[32527]3# GreenstoneSQLPlugout.pm -- plugout module for writing all or some of the
4# Greenstone document format (metadata and/or fulltext) into a (My)SQL db.
[32526]5# The rest is then still written out by GreenstoneXMLPlugout as usual.
[32518]6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2006 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
[32527]28package GreenstoneSQLPlugout;
[32518]29
30use strict;
31no strict 'refs';
32no strict 'subs';
33
[32520]34use GreenstoneXMLPlugout;
[32518]35use docprint;
[32592]36use gsmysql;
[32518]37
[32524]38use DBI; # the central package for this plugout
39
[32518]40
[32583]41# + TODO: SIGTERM rollback and disconnect?
42# + TODO Q: what about verbosity for debugging, instead of current situation of printing out upon debug set at the expense of writing to db
[32591]43#+ TODO Q: introduced site param to plugins and plugouts. Did I do it right? And should they have hiddengli = "yes". No longer a param
44# !!!! Did I do the pass by ref in docprint's escape and unescape textref functions correctly, and how they're called here?
[32580]45# Any more optimisation I can do around this?
[32521]46
[32543]47# this plugout does not output the metadata and/or fulltxt xml to a file,
48# but outputs rows into a mysql table for metadata and/or a table for fulltxt
# Establish the inheritance chain at compile time:
# GreenstoneSQLPlugout is a subclass of GreenstoneXMLPlugout.
BEGIN {
    @GreenstoneSQLPlugout::ISA = qw(GreenstoneXMLPlugout);
}
52
[32580]53# + NOTTODO: die() statements need to be replaced with premature_termination
[32529]54# which should ensure the GreenstoneXMLPlugin (group)'s stuff is closed and cleaned up SOMEHOW
55# It's fine: the die() stmts all take place before setting up the super class' begin
[32520]56
[32580]57# + TODO Q: about build_mode: how to detect removeold. Now handled by
[32563]58# GreenstoneSQLPlugout::remove_all(), which is inherited from a base plugin.
[32580]59# + TODO: deal with -removeold and everything? Or type out instructions for user
[32520]60
[32580]61# + TODO Q: what is "group" in GreenstoneXMLPlugout?
[32518]62
# Permitted values for the "process_mode" option, in declaration order.
# Each 'desc' looks like a string-resource lookup key -- confirm against the
# resource bundle.
my $process_mode_list = [
    {
        'name' => 'meta_only',
        'desc' => '{GreenstoneSQLPlug.process_mode.meta_only}',
    },
    {
        'name' => 'text_only',
        'desc' => '{GreenstoneSQLPlug.process_mode.text_only}',
    },
    {
        'name' => 'all',
        'desc' => '{GreenstoneSQLPlug.process_mode.all}',
    },
];
[32518]70
# Permitted values for the "rollback_on_cancel" option.
# Both values share the option's own description key. The "false" entry
# previously used the misspelled key "rollbacl_on_cancel", which matches no
# other key in this file; it now matches the "true" entry.
my $rollback_on_cancel_list =
    [ { 'name' => "true",
        'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}" },
      { 'name' => "false",
        'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}" } ];
76
[32541]77# The following are the saveas.options:
# Option declarations for this plugout (the saveas.options).
# Appended to the shared "ArgList" by new().
my $arguments = [
    {
        'name'      => 'process_mode',
        'desc'      => '{GreenstoneSQLPlug.process_mode}',
        'type'      => 'enum',
        'list'      => $process_mode_list,
        'deft'      => 'all',
        'reqd'      => 'no',
        'hiddengli' => 'no',
    },
    {
        'name'      => 'rollback_on_cancel',
        'desc'      => '{GreenstoneSQLPlug.rollback_on_cancel}',
        'type'      => 'enum',
        'list'      => $rollback_on_cancel_list,
        'deft'      => 'false',    # better default than true
        'reqd'      => 'no',
        'hiddengli' => 'no',
    },
    {
        'name' => 'db_driver',
        'desc' => '{GreenstoneSQLPlug.db_driver}',
        'type' => 'string',
        'deft' => 'mysql',
        'reqd' => 'yes',
    },
    {
        'name' => 'db_client_user',
        'desc' => '{GreenstoneSQLPlug.db_client_user}',
        'type' => 'string',
        'deft' => 'root',
        'reqd' => 'yes',
    },
    {
        'name' => 'db_client_pwd',
        'desc' => '{GreenstoneSQLPlug.db_client_pwd}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',    # pwd not required: can create mysql accounts without pwd
    },
    {
        'name' => 'db_host',
        'desc' => '{GreenstoneSQLPlug.db_host}',
        'type' => 'string',
        'deft' => '127.0.0.1',    # localhost doesn't work for us, but 127.0.0.1 works. See gsmysql.pm
        'reqd' => 'yes',
    },
    {
        'name' => 'db_port',
        'desc' => '{GreenstoneSQLPlug.db_port}',
        'type' => 'string',    # NOTE: make this int? No default for port, since it's not a required connection param
        'reqd' => 'no',
    },
];
[32518]118
# Descriptor for this plugout, appended to the shared "OptList" by new().
# It bundles the plugout name, description key and the $arguments above.
my $options = {
    'name'     => 'GreenstoneSQLPlugout',
    'desc'     => '{GreenstoneSQLPlugout.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
124
[32583]125##### This entire class is called only during import.pl #####
126
127##### Overridden methods #####
128
# Constructor.
#
# Parameters:
#   $plugoutlist     - arrayref; this class's name is appended to it
#   $inputargs       - passed through unchanged to GreenstoneXMLPlugout's constructor
#   $hashArgOptLists - hashref; this plugout's $arguments are appended to its
#                      "ArgList" entry and $options to its "OptList" entry
#
# Returns the object built by GreenstoneXMLPlugout's constructor, re-blessed
# into $class.
sub new {
    my ($class) = shift (@_);
    my ($plugoutlist, $inputargs, $hashArgOptLists) = @_;
    push(@$plugoutlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call instead of indirect object syntax
    # ("new GreenstoneXMLPlugout(...)"), whose parsing depends on which
    # subs/packages perl already knows about at compile time.
    my $self = GreenstoneXMLPlugout->new($plugoutlist, $inputargs, $hashArgOptLists);

    # No extra set-up is needed here, whether or not 'info_only' is set:
    # the db connection is only made in begin().
    return bless $self, $class;
}
147
[32520]148# connect here and ensure all tables and databases exist
# Connects to the database and makes sure the database and the tables needed
# for the current process_mode exist, then hands over to the superclass' begin().
#
# Reads from $self: process_mode, verbosity, rollback_on_cancel, db_driver,
# db_client_user, db_client_pwd, db_host, db_port, site.
# Stores the connected gsmysql wrapper in $self->{'gs_sql'}.
#
# Dies if the connection cannot be made or the db/tables cannot be prepared.
# Both die() calls happen before the superclass' begin() is called, so no
# superclass state has been set up yet at that point.
sub begin {

    my $self = shift (@_);

    print STDERR "***** GreenstoneSQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n";

    my $db_params = {
        'collection_name' => $ENV{'GSDLCOLLECTION'},
        'verbosity' => $self->{'verbosity'} || 0
    };

    # Explicit class-method call rather than indirect object syntax ("new gsmysql(...)")
    my $gs_sql = gsmysql->new($db_params);

    # If autocommit is set, there's no rollback support.
    # An unset rollback_on_cancel falls back to the declared default "false"
    # (autocommit on) instead of silently turning autocommit off.
    my $rollback_on_cancel = defined $self->{'rollback_on_cancel'}
        ? $self->{'rollback_on_cancel'}
        : "false";
    my $autocommit = ($rollback_on_cancel eq "false") ? 1 : 0;

    # Try connecting to the db; die if that fails.
    my $connected = $gs_sql->connect_to_db({
        'db_driver' => $self->{'db_driver'},
        'db_client_user' => $self->{'db_client_user'},
        'db_client_pwd' => $self->{'db_client_pwd'},
        'db_host' => $self->{'db_host'},
        'db_port' => $self->{'db_port'}, # undef by default, can leave as is
        'autocommit' => $autocommit
    });
    if (!$connected) {
        # This is fatal for the plugout, let's terminate here.
        # Per the original author's note, PrintError will already have
        # displayed the warning message on connection failure.
        die("Could not connect to db. Can't proceed.\n");
    }

    # One database per GS3 site; for GS2 the db is called greenstone2
    my $db_name = $self->{'site'} || "greenstone2";
    my $proc_mode = $self->{'process_mode'};

    # Each step only runs if all previous steps succeeded
    my $success = $gs_sql->use_db($db_name);

    if ($success && $proc_mode ne "text_only") {
        $success = $gs_sql->ensure_meta_table_exists();
    }
    if ($success && $proc_mode ne "meta_only") {
        $success = $gs_sql->ensure_fulltxt_table_exists();
    }

    if (!$success) {
        # Fatal for the plugout. Per the original author's note, gsmysql's
        # destructor takes care of disconnecting from the db on die().
        die("Could not use db $db_name and/or prepare its tables. Can't proceed.\n");
    }

    # Store the DBI wrapper instance for saveas() and end()
    $self->{'gs_sql'} = $gs_sql;

    # Only now that the sql side is ready, call begin() on the superclass
    # (important as doc.xml is opened as a group etc).
    $self->SUPER::begin(@_);
}
229
230# disconnect from database here, see inexport.pm
# Finishes off the plugout: runs the superclass' end() and then tells the
# gsmysql wrapper stored by begin() that we're finished with it.
# See inexport.pm for where this is called.
sub end
{
    my $self = shift(@_);

    # Do the superclass stuff first, as any sql db failures should not prevent superclass cleanup
    $self->SUPER::end(@_);

    # Important to call finished(): per the original author's notes,
    # it will disconnect from db if this is the last gsmysql instance,
    # and it will commit to db before disconnecting if rollback_on_cancel is turned on.
    # Guarded so that end() does not die with "Can't call method on an undefined
    # value" if begin() never stored a handle or end() is called a second time.
    if (defined $self->{'gs_sql'}) {
        $self->{'gs_sql'}->finished();
    }
    delete $self->{'gs_sql'}; # key gs_sql no longer exists, not just the value being undef
}
[32533]244
[32542]245# Produce files called docsql.xml instead of doc.xml
# Name of the per-document XML file this plugout produces:
# always "docsql.xml" (rather than doc.xml), whatever document is passed in.
sub get_doc_xml_filename {
    my ($self, $doc_obj) = @_;

    return "docsql.xml";
}
[32542]252
253# overriding to store doc OID as attribute of top level element: <Archive docoid="oid">
# Writes the XML prologue of a docsql.xml file to $outhandle:
# the XML declaration, the Archive DOCTYPE, and the opening <Archive> tag,
# which carries the document's OID in its docoid attribute.
#
# Parameters:
#   $outhandle - filehandle to print to
#   $doc_oid   - document OID, written verbatim into the docoid attribute
sub output_xml_header {
    my ($self, $outhandle, $doc_oid) = @_;

    my @header_lines = (
        '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n",
        "<!DOCTYPE Archive SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n",
        "<Archive docoid=\"$doc_oid\">\n",
    );

    # One print per header line
    foreach my $header_line (@header_lines) {
        print $outhandle $header_line;
    }
}
262
[32580]263# + X TODO: check arc-inf.db for whether each entry is to be deleted/indexed/reindexed/been indexed?
[32543]264# That's only for indexing, not for this step which only generates the content in archives dir
# Saves a single document: writes docsql.xml through the superclass'
# pre/post saveas machinery, and writes meta and/or text to the SQL db.
#
# What goes into docsql.xml is the INVERSE of what goes into the db:
#   process_mode "meta_only" -> only the text is written to docsql.xml
#   process_mode "text_only" -> only the meta is written to docsql.xml
#   any other mode ("all")   -> docprint::OUTPUT_NONE is requested
#
# Parameters:
#   $doc_obj - the document object to save
#   $doc_dir - passed on to the superclass' pre_saveas/post_saveas
#
# The db connection is not closed here but in end(), so that it isn't
# opened and closed over and over for each doc during a single build.
sub saveas {
    my $self = shift (@_);
    my ($doc_obj, $doc_dir) = @_;

    # Step 1: pre save out. Must go through the superclass' pre/post saveas
    # methods, as (per the original author's note) they handle assoc_files too.
    my ($xml_out, $output_file) = $self->SUPER::pre_saveas(@_);

    # In debug mode, remember the handle (STDOUT according to the original comment)
    if ($self->{'debug'}) {
        $self->{'debug_outhandle'} = $xml_out;
    }

    # Step 2: decide what goes into docsql.xml for this process mode
    my $mode = $self->{'process_mode'};
    my $xml_content = docprint::OUTPUT_NONE;
    if ($mode eq "meta_only") {
        # only meta goes into the db, so the text goes into docsql.xml
        $xml_content = docprint::OUTPUT_TEXT_ONLY;
    }
    elsif ($mode eq "text_only") {
        # only full text goes into the db, so the meta goes into docsql.xml
        $xml_content = docprint::OUTPUT_META_ONLY;
    }
    my $docxml_output_options = { 'output' => $xml_content };

    # Write out the doc xml file, "docsql.xml", for the current document
    my $section_xml = &docprint::get_section_xml($doc_obj, $docxml_output_options);
    print $xml_out $section_xml;

    # Write out whatever needs to go into the SQL database
    $self->write_meta_and_text($doc_obj);

    # Step 3: post save out
    $self->SUPER::post_saveas($doc_obj, $doc_dir, $xml_out, $output_file);
}
313
[32583]314##### New methods, not inherited #####
[32518]315
[32520]316# write meta and/or text PER DOC out to DB
# Writes the meta and/or text of ONE document to the db, by starting the
# recursive walk at the document's top section. The same OID is used for
# every section of the document.
sub write_meta_and_text {
    my ($self, $doc_obj) = @_;

    $self->recursive_write_meta_and_text(
        $doc_obj,
        $doc_obj->get_OID(),
        $doc_obj->get_top_section()
    );
}
[32518]325
326# Perl: Reading or Writing to Another Program
327# https://nnc3.com/mags/Perl3/cookbook/ch16_05.htm
# Writes the meta and/or text of one section to the db, then recurses into
# each of its subsections in 'subsection_order'.
#
# Parameters:
#   $doc_obj - document object; sections are looked up via _lookup_section()
#   $doc_oid - OID of the document, the same for all its sections
#   $section - name of the section to write; "" is the top section, which is
#              written to the db under the section name "root"
#
# Which tables are written depends on $self->{'process_mode'}:
#   "all" or "meta_only" -> one metadata row per (name, value) pair
#   "all" or "text_only" -> one fulltxt row for the section's text
# $self->{'debug'} is passed to both insert methods; per the original
# author's notes, when debugging they print the SQL instead of executing it.
#
# Returns "" if the section cannot be found.
sub recursive_write_meta_and_text {
    my ($self, $doc_obj, $doc_oid, $section) = @_;

    my $section_ptr = $doc_obj->_lookup_section ($section);
    if (!defined $section_ptr) {
        return "";
    }

    # The top section's name is "", which goes into the tables as "root"
    my $section_name = $section;
    $section_name = "root" if $section eq "";

    my $gs_sql = $self->{'gs_sql'};
    my $mode = $self->{'process_mode'};
    my $write_meta = ($mode eq "all" || $mode eq "meta_only");
    my $write_text = ($mode eq "all" || $mode eq "text_only");

    if ($write_meta) {
        # Values are stored as they are, without html/xml escaping:
        # the db is treated like a text file
        foreach my $pair (@{$section_ptr->{'metadata'}}) {
            my ($meta_name, $meta_value) = @{$pair}[0, 1];
            $gs_sql->insert_row_into_metadata_table($doc_oid, $section_name, $meta_name, $meta_value, $self->{'debug'});
        }
    }

    if ($write_text) {
        # As with meta, no escaping. The text is handed over by reference
        # to a local copy; it may be undef.
        my $text = $section_ptr->{'text'};
        $gs_sql->insert_row_into_fulltxt_table($doc_oid, $section_name, \$text, $self->{'debug'});
    }

    # RECURSIVE CALL for each subsection, named "<section>.<subsection>"
    foreach my $child (@{$section_ptr->{'subsection_order'}}) {
        $self->recursive_write_meta_and_text($doc_obj, $doc_oid, "$section.$child");
    }
}
385
386
3871;
Note: See TracBrowser for help on using the repository browser.