source: main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm@ 32578

Last change on this file since 32578 was 32578, checked in by ak19, 5 years ago

Optimising. The gssql class internally has only one shared connection to the db, making the connection only the first time and disconnecting only when the last gssql is finished(). For keywords: this is implemented using the singleton coding (anti-) pattern. Now each perl process (import or buildcol) will connect to the SQL DB only once, not twice during import where it used to be once for GS SQL plugout and once for GSSQL plugin.

File size: 13.2 KB
RevLine 
[32518]1###########################################################################
2#
# GreenstoneSQLPlugout.pm -- plugout module for writing all or some of the
# Greenstone document format (metadata and/or fulltext) into a (My)SQL db.
# The rest is then still written out by GreenstoneXMLPlugout as usual.
[32518]6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2006 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
[32527]28package GreenstoneSQLPlugout;
[32518]29
30use strict;
31no strict 'refs';
32no strict 'subs';
33
[32520]34use GreenstoneXMLPlugout;
[32518]35use docprint;
[32529]36use gssql;
[32518]37
[32524]38use DBI; # the central package for this plugout
39
[32518]40
[32521]41# TODO: SIGTERM rollback and disconnect?
[32543]42# TODO Q: what about verbosity for debugging, instead of current situation of printing out upon debug set at the expense of writing to db
[32541]43# TODO Q: introduced site_name param to plugins and plugouts. Did I do it right? And should they have hiddengli = "yes"
[32521]44
[32543]45# this plugout does not output the metadata and/or fulltxt xml to a file,
46# but outputs rows into a mysql table for metadata and/or a table for fulltxt
# Make GreenstoneXMLPlugout the parent class of this plugout. This runs at
# compile time, so the inheritance chain is set up before any method call
# on this package is made.
BEGIN {
    @GreenstoneSQLPlugout::ISA = qw(GreenstoneXMLPlugout);
}
50
[32529]51# NOTTODO: die() statements need to be replaced with premature_termination
52# which should ensure the GreenstoneXMLPlugin (group)'s stuff is closed and cleaned up SOMEHOW
53# It's fine: the die() stmts all take place before setting up the super class' begin
[32520]54
[32563]55# TODO Q: about build_mode: how to detect removeold. Now handled by
56# GreenstoneSQLPlugout::remove_all(), which is inherited from a base plugin.
[32520]57# TODO: deal with -removeold and everything? Or type out instructions for user
58
[32518]59# TODO Q: what is "group" in GreenstoneXMLPlugout?
60
# Values accepted by the process_mode option. The brace-wrapped 'desc'
# strings look like message keys that get resolved to display text
# elsewhere -- NOTE(review): confirm against the resource lookup code.
# They use the "GreenstoneSQLPlug." prefix, not "GreenstoneSQLPlugout.".
my $process_mode_list = [
    { 'name' => "meta_only", 'desc' => "{GreenstoneSQLPlug.process_mode.meta_only}" },
    { 'name' => "text_only", 'desc' => "{GreenstoneSQLPlug.process_mode.text_only}" },
    { 'name' => "all",       'desc' => "{GreenstoneSQLPlug.process_mode.all}" },
];

# The saveas.options understood by this plugout: which parts of a document
# go to the database, plus the database connection settings.
my $arguments = [
    {
        'name'      => "process_mode",
        'desc'      => "{GreenstoneSQLPlug.process_mode}",
        'type'      => "enum",
        'list'      => $process_mode_list,
        'deft'      => "all",
        'reqd'      => "no",
        'hiddengli' => "no",
    },
    {
        'name' => "db_driver",
        'desc' => "{GreenstoneSQLPlug.db_driver}",
        'type' => "string",
        'deft' => "mysql",
        'reqd' => "yes",
    },
    {
        'name' => "db_client_user",
        'desc' => "{GreenstoneSQLPlug.db_client_user}",
        'type' => "string",
        'deft' => "root",
        'reqd' => "yes",
    },
    {
        # Open question carried over from the original author: does the
        # password really need to be marked as required?
        'name' => "db_client_pwd",
        'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
        'type' => "string",
        'deft' => "",
        'reqd' => "yes",
    },
    {
        'name' => "db_host",
        'desc' => "{GreenstoneSQLPlug.db_host}",
        'type' => "string",
        'deft' => "127.0.0.1",
        'reqd' => "yes",
    },
];

# Option block describing this plugout; new() appends it to the caller's
# "OptList" and appends $arguments to the caller's "ArgList".
my $options = {
    'name'     => "GreenstoneSQLPlugout",
    'desc'     => "{GreenstoneSQLPlugout.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
105
# Constructor.
#
# Parameters:
#   $plugoutlist     - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the parent constructor
#   $hashArgOptLists - hash ref; this plugout's arguments are appended to
#                      its "ArgList" and its option block to its "OptList"
#
# Returns the object built by GreenstoneXMLPlugout's constructor, re-blessed
# into $class. No database work happens here: connecting is left to begin().
sub new {
    my $class = shift;
    my ($plugoutlist, $inputargs, $hashArgOptLists) = @_;

    push @{$plugoutlist}, $class;

    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = GreenstoneXMLPlugout->new($plugoutlist, $inputargs, $hashArgOptLists);

    # An 'info_only' object needs no further option handling, and at present
    # neither does a regular one, so both cases end the same way.
    return bless $self, $class;
}
124
[32520]125# connect here and ensure all tables and databases exist
# Connect to the SQL database and make sure the database and the tables
# required by the configured process_mode exist, then hand over to the
# superclass' begin().
#
# Any arguments are forwarded unchanged to SUPER::begin(), whose return
# value is returned.
#
# Dies if the connection cannot be made, or if the database cannot be
# selected or its tables prepared. Both failures happen before the
# superclass' begin() has run, so there is no superclass state to clean up.
sub begin {
    my $self = shift;

    print STDERR "***** GreenstoneSQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n";

    my $gs_sql = gssql->new({
        'collection_name' => $ENV{'GSDLCOLLECTION'},
        'verbosity'       => 1,
    });

    # The connection settings come straight from the saveas.options.
    my %connection_settings =
        map { $_ => $self->{$_} } qw(db_driver db_client_user db_client_pwd db_host);

    # Fatal for the plugout. According to the original notes, PrintError
    # will already have reported the reason for the failed connection.
    $gs_sql->connect_to_db(\%connection_settings)
        or die("Could not connect to db. Can't proceed.\n");

    # One database per GS3 site; for GS2 the database is called greenstone2.
    my $db_name   = $self->{'site_name'} || "greenstone2";
    my $proc_mode = $self->{'process_mode'};

    # Each step is attempted only if everything before it succeeded.
    my $db_ready = $gs_sql->use_db($db_name);
    $db_ready = $gs_sql->ensure_meta_table_exists()
        if $db_ready && $proc_mode ne "text_only";
    $db_ready = $gs_sql->ensure_fulltxt_table_exists()
        if $db_ready && $proc_mode ne "meta_only";

    unless ($db_ready) {
        # Also fatal: drop the connection again before giving up.
        $gs_sql->force_disconnect_from_db();
        die("Could not use db $db_name and/or prepare its tables. Can't proceed.\n");
    }

    # Keep the DBI wrapper for saveas() and end().
    $self->{'gs_sql'} = $gs_sql;

    # Only now that the database is usable is the superclass prepared
    # (the original notes say it opens doc.xml as a group, among other things).
    return $self->SUPER::begin(@_);
}
197
198# disconnect from database here, see inexport.pm
# Finish the plugout: run the superclass' end() and then release this
# plugout's use of the database connection.
#
# Any arguments are forwarded unchanged to SUPER::end(). Returns nothing.
#
# The superclass cleanup runs first so that a database problem cannot
# prevent it. The 'gs_sql' key is removed from $self (not merely set to
# undef) before finished() is called on the handle.
#
# If no handle is stored -- begin() never got as far as storing one, or
# end() has already run -- the database step is skipped rather than dying
# on a method call on an undefined value.
sub end
{
    my $self = shift(@_);

    $self->SUPER::end(@_);

    my $gs_sql = delete $self->{'gs_sql'};

    # Per the original notes, finished() disconnects from the db only when
    # this was the last gssql instance in use.
    $gs_sql->finished() if defined $gs_sql;

    return;
}
[32533]209
[32542]210# Produce files called docsql.xml instead of doc.xml
# Name of the per-document XML file written by this plugout: "docsql.xml"
# rather than doc.xml. The document object is accepted but not consulted;
# the name is the same for every document.
sub get_doc_xml_filename {
    my ($self, $doc_obj) = @_;

    return 'docsql.xml';
}
[32542]217
218# overriding to store doc OID as attribute of top level element: <Archive docoid="oid">
# Write the XML prologue and the opening <Archive> element, storing the
# document OID as the docoid attribute of that element:
#   <Archive docoid="oid">
#
# Parameters:
#   $outhandle - filehandle to print to
#   $doc_oid   - the document's OID; undef is written as an empty attribute
#
# The OID is escaped for use inside a double-quoted XML attribute, so an
# OID containing &, <, > or " cannot produce malformed XML. An XML parser
# reading the file back gets the original OID.
sub output_xml_header {
    my $self = shift (@_);
    my ($outhandle, $doc_oid) = @_;

    my $attr_value = defined $doc_oid ? $doc_oid : "";
    $attr_value =~ s/&/&amp;/g;    # must come first: the others introduce '&'
    $attr_value =~ s/</&lt;/g;
    $attr_value =~ s/>/&gt;/g;
    $attr_value =~ s/"/&quot;/g;

    print {$outhandle} '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
    print {$outhandle} "<!DOCTYPE Archive SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
    print {$outhandle} "<Archive docoid=\"$attr_value\">\n";
}
227
[32543]228# TODO: check arc-inf.db for whether each entry is to be deleted/indexed/reindexed/been indexed?
229# That's only for indexing, not for this step which only generates the content in archives dir
# Save a single document: write its docsql.xml file and write its metadata
# and/or full text to the SQL database.
#
# Parameters:
#   $doc_obj - the document to save
#   $doc_dir - forwarded to the superclass' pre_saveas()/post_saveas()
#
# Returns whatever SUPER::post_saveas() returns.
#
# docsql.xml receives the INVERSE of what goes to the database:
#   process_mode meta_only -> text only in docsql.xml
#   process_mode text_only -> metadata only in docsql.xml
#   anything else ("all")  -> docprint::OUTPUT_NONE
# The original notes say that in the "all" case docsql.xml carries only
# "breadcrumbs" pointing the reader to the SQL database.
#
# The database connection is not closed here; that is left to end(), so it
# is not opened and closed for every document of a build.
sub saveas {
    my $self = shift;
    my ($doc_obj, $doc_dir) = @_;

    my $proc_mode = $self->{'process_mode'};

    # Step 1: superclass pre-save. The superclass pre/post saveas methods
    # must be called as, per the original notes, they handle assoc_files too.
    my ($docxml_outhandler, $output_file) = $self->SUPER::pre_saveas(@_);

    # When debugging, remember the handle so the SQL statements can be
    # printed to it (the original notes say this is STDOUT in debug mode).
    if ($self->{'debug'}) {
        $self->{'debug_outhandle'} = $docxml_outhandler;
    }

    # Step 2: write docsql.xml for the current document.
    my $docxml_content = docprint::OUTPUT_NONE;
    if ($proc_mode eq "meta_only") {
        # metadata goes to the db, so the text goes into docsql.xml
        $docxml_content = docprint::OUTPUT_TEXT_ONLY;
    }
    elsif ($proc_mode eq "text_only") {
        # full text goes to the db, so the metadata goes into docsql.xml
        $docxml_content = docprint::OUTPUT_META_ONLY;
    }

    my $section_text = &docprint::get_section_xml($doc_obj, { 'output' => $docxml_content });
    print {$docxml_outhandler} $section_text;

    # ... and write whatever is destined for the database.
    $self->write_meta_and_text($doc_obj);

    # Step 3: superclass post-save.
    return $self->SUPER::post_saveas($doc_obj, $doc_dir, $docxml_outhandler, $output_file);
}
281
282
[32520]283# write meta and/or text PER DOC out to DB
# Write the metadata and/or text of ONE document to the database, starting
# at the document's top section and descending through all its subsections.
#
# The OID is looked up once here and passed down, as the same OID applies
# to every section of the document.
sub write_meta_and_text {
    my ($self, $doc_obj) = @_;

    my $oid         = $doc_obj->get_OID();
    my $top_section = $doc_obj->get_top_section();

    return $self->recursive_write_meta_and_text($doc_obj, $oid, $top_section);
}
[32518]292
293# Perl: Reading or Writing to Another Program
294# https://nnc3.com/mags/Perl3/cookbook/ch16_05.htm
# Write one section's metadata and/or text to the database, then do the
# same for each of its subsections.
#
# Parameters:
#   $doc_obj - the document being written
#   $doc_oid - the document's OID, the same for every section
#   $section - the section to write; "" denotes the document root
#
# Returns "" if the section cannot be found in the document.
#
# What gets written depends on process_mode: "all" and "meta_only" write
# the section's metadata rows, "all" and "text_only" write its text row.
# The root section is stored under the name "root"; subsections are stored
# under the name they are looked up by, "$section.$subsection".
sub recursive_write_meta_and_text {
    my ($self, $doc_obj, $doc_oid, $section) = @_;

    my $section_ptr = $doc_obj->_lookup_section($section);
    return "" unless defined $section_ptr;

    # doc->get_top_section() gives "" for the root; use a readable name.
    my $section_name = ($section eq "") ? "root" : $section;

    my $gs_sql    = $self->{'gs_sql'};
    my $debug_out = $self->{'debug_outhandle'};
    my $proc_mode = $self->{'process_mode'};

    my $wants_meta = ($proc_mode eq "all" || $proc_mode eq "meta_only");
    my $wants_text = ($proc_mode eq "all" || $proc_mode eq "text_only");

    if ($wants_meta) {
        # One row per (name, value) pair of this section's metadata.
        for my $meta_pair (@{ $section_ptr->{'metadata'} }) {
            my $meta_name = $meta_pair->[0];

            # The value is stored escaped. Open question from the original
            # author: is that needed, given it must be unescaped again when
            # read back from the db (unlike when read back from doc.xml)?
            my $escaped_meta_value = &docprint::escape_text($meta_pair->[1]);

            # Per the original notes: uses a prepared insert statement, or,
            # when debugging, prints the statement without executing it.
            $gs_sql->insert_row_into_metadata_table(
                $doc_oid, $section_name, $meta_name, $escaped_meta_value,
                $self->{'debug'}, $debug_out
            );
        }
    }

    if ($wants_text) {
        my $section_textref = &docprint::escape_textref(\$section_ptr->{'text'});

        # Per the original notes: the fulltxt column may be SQL NULL, and an
        # undef value is written out as NULL. As for metadata, in debug mode
        # the statement is printed instead of executed.
        $gs_sql->insert_row_into_fulltxt_table(
            $doc_oid, $section_name, $section_textref,
            $self->{'debug'}, $debug_out
        );
    }

    # Recurse into every subsection, in document order.
    foreach my $child (@{ $section_ptr->{'subsection_order'} }) {
        $self->recursive_write_meta_and_text($doc_obj, $doc_oid, "$section.$child");
    }
}
348
349
3501;
Note: See TracBrowser for help on using the repository browser.