source: main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm@ 32580

Last change on this file since 32580 was 32580, checked in by ak19, 5 years ago
  1. Support for port param when connecting to SQL DB. 2. GS SQL Plugout now also uses the verbosity member variable when instantiating the gssql object. 3. Since the DBI object has PrintError set to 1 on connection (and ShowErrorStatement set to 1 for more verbosity), which means an informative message is always printed on error or warning, there's no need for me to write warning statements everywhere when a db statement/call fails. Removed these redundant warnings. 4. Don't want GS XML Plugout's debug outhandle passed to the two gssql::insert methods, as we don't want them to write debug information to the debug handle. That should only be for the XML stuff (whether groups on or not), and the debug outhandle can moreover be set to the XSLT writer, which makes it even less sensible for gssql to output info and error debug statements into there. gssql now sticks to STDERR for debug information.
File size: 13.3 KB
Line 
1###########################################################################
2#
3# GreenstoneSQLPlugout.pm -- plugout module for writing all or some of the
4# Greenstone document format (metadata and/or fulltext) into a (My)SQL db.
5# The rest is then still written out by GreenstoneXMLPlugout as usual.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2006 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugout;
29
30use strict;
31no strict 'refs';
32no strict 'subs';
33
34use GreenstoneXMLPlugout;
35use docprint;
36use gssql;
37
38use DBI; # the central package for this plugout
39
40
41# TODO: SIGTERM rollback and disconnect?
42# TODO Q: what about verbosity for debugging, instead of current situation of printing out upon debug set at the expense of writing to db
43# TODO Q: introduced site_name param to plugins and plugouts. Did I do it right? And should they have hiddengli = "yes"
44# Did I do the pass by ref in docprint's escape and unescape textref functions correctly, and how they're called here?
45# Any more optimisation I can do around this?
46
# Make GreenstoneSQLPlugout a subclass of GreenstoneXMLPlugout.
# Instead of writing the metadata and/or fulltxt XML of a document to a file,
# this plugout writes rows into a MySQL table for metadata and/or a MySQL
# table for fulltxt.
BEGIN {
    @GreenstoneSQLPlugout::ISA = qw(GreenstoneXMLPlugout);
}
52
53# + NOTTODO: die() statements need to be replaced with premature_termination
54# which should ensure the GreenstoneXMLPlugin (group)'s stuff is closed and cleaned up SOMEHOW
55# It's fine: the die() stmts all take place before setting up the super class' begin
56
57# + TODO Q: about build_mode: how to detect removeold. Now handled by
58# GreenstoneSQLPlugout::remove_all(), which is inherited from a base plugin.
59# + TODO: deal with -removeold and everything? Or type out instructions for user
60
61# + TODO Q: what is "group" in GreenstoneXMLPlugout?
62
# Values accepted by the process_mode option. Each entry names a mode and the
# resource-bundle key holding its description.
my $process_mode_list = [
    {
        'name' => 'meta_only',
        'desc' => '{GreenstoneSQLPlug.process_mode.meta_only}',
    },
    {
        'name' => 'text_only',
        'desc' => '{GreenstoneSQLPlug.process_mode.text_only}',
    },
    {
        'name' => 'all',
        'desc' => '{GreenstoneSQLPlug.process_mode.all}',
    },
];

# The saveas.options understood by this plugout: what to send to the SQL
# database (process_mode) and how to reach that database (db_* options).
my $arguments = [
    {
        'name'      => 'process_mode',
        'desc'      => '{GreenstoneSQLPlug.process_mode}',
        'type'      => 'enum',
        'list'      => $process_mode_list,
        'deft'      => 'all',
        'reqd'      => 'no',
        'hiddengli' => 'no',
    },
    {
        'name' => 'db_driver',
        'desc' => '{GreenstoneSQLPlug.db_driver}',
        'type' => 'string',
        'deft' => 'mysql',
        'reqd' => 'yes',
    },
    {
        'name' => 'db_client_user',
        'desc' => '{GreenstoneSQLPlug.db_client_user}',
        'type' => 'string',
        'deft' => 'root',
        'reqd' => 'yes',
    },
    {
        'name' => 'db_client_pwd',
        'desc' => '{GreenstoneSQLPlug.db_client_pwd}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'yes',    # pwd required?
    },
    {
        'name' => 'db_host',
        'desc' => '{GreenstoneSQLPlug.db_host}',
        'type' => 'string',
        'deft' => '127.0.0.1',
        'reqd' => 'yes',
    },
];

# Plugout description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => 'GreenstoneSQLPlugout',
    'desc'     => '{GreenstoneSQLPlugout.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
107
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $plugoutlist     - array ref of plugout class names; $class is appended
#   $inputargs       - passed through unchanged to GreenstoneXMLPlugout's
#                      constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugout's $arguments and $options
#
# Returns the object built by GreenstoneXMLPlugout, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($plugoutlist, $inputargs, $hashArgOptLists) = @_;
    push(@$plugoutlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->new(...) rather than the indirect "new Class(...)" form:
    # this package defines its own new(), and the arrow form cannot be
    # mis-parsed as a plain function call.
    my $self = GreenstoneXMLPlugout->new($plugoutlist, $inputargs, $hashArgOptLists);

    # This constructor does no option post-processing of its own, so the
    # object is re-blessed the same way whether or not 'info_only' is set.
    return bless $self, $class;
}
126
# Connect to the SQL database and make sure the database and the tables
# needed for the configured process_mode exist, then hand over to the
# superclass' begin().
#
# Reads from $self: process_mode, verbosity, db_driver, db_client_user,
# db_client_pwd, db_host and site_name. On success the gssql wrapper is stored
# in $self->{'gs_sql'}.
#
# Dies if the connection cannot be made, or if the database cannot be selected
# or its tables cannot be prepared. Both die() calls happen before the
# superclass' begin() runs, so nothing of GreenstoneXMLPlugout has been set up
# yet at that point.
#
# Any arguments are passed on to SUPER::begin(), whose result is returned.
sub begin {
    my $self = shift (@_);

    print STDERR "***** GreenstoneSQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n";

    # Explicit gssql->new(...) rather than the indirect "new gssql(...)" form,
    # which is ambiguous in a package that defines its own new().
    my $gs_sql = gssql->new({
        'collection_name' => $ENV{'GSDLCOLLECTION'},
        'verbosity'       => $self->{'verbosity'} || 0
    });

    # Try connecting to the db first; failing to connect is fatal for the plugout.
    my $connected = $gs_sql->connect_to_db({
        'db_driver'      => $self->{'db_driver'},
        'db_client_user' => $self->{'db_client_user'},
        'db_client_pwd'  => $self->{'db_client_pwd'},
        'db_host'        => $self->{'db_host'}
    });
    if (!$connected) {
        # Per the original notes, DBI's PrintError has already reported the cause.
        die("Could not connect to db. Can't proceed.\n");
    }

    # One database per GS3 site; for GS2 the db is called greenstone2.
    my $db_name   = $self->{'site_name'} || "greenstone2";
    my $proc_mode = $self->{'process_mode'};

    # Each step only runs if all previous steps succeeded.
    my $success = $gs_sql->use_db($db_name);
    if ($proc_mode ne "text_only") {
        $success &&= $gs_sql->ensure_meta_table_exists();
    }
    if ($proc_mode ne "meta_only") {
        $success &&= $gs_sql->ensure_fulltxt_table_exists();
    }

    if (!$success) {
        # Fatal for the plugout: drop the connection again before terminating.
        $gs_sql->force_disconnect_from_db();
        die("Could not use db $db_name and/or prepare its tables. Can't proceed.\n");
    }

    # Keep the DBI wrapper instance for saveas() and end().
    $self->{'gs_sql'} = $gs_sql;

    # Only now prepare GreenstoneXMLPlugout - important as doc.xml is opened
    # as a group etc.
    $self->SUPER::begin(@_);
}
199
# Finish the plugout: run the superclass' end() and then release this
# plugout's database handle (see inexport.pm for where end() is called).
#
# Any arguments are passed on to SUPER::end(). Returns the value removed from
# $self->{'gs_sql'} (undef if there was none).
sub end
{
    my $self = shift(@_);

    # Superclass cleanup goes first, so sql db failures cannot prevent it.
    $self->SUPER::end(@_);

    # begin() may never have stored a handle, and end() may be called more
    # than once; only call finished() when there is a handle to finish.
    my $gs_sql = $self->{'gs_sql'};
    if (defined $gs_sql) {
        $gs_sql->finished(); # will disconnect from db if last instance
    }

    # Remove the key itself, not just set its value to undef.
    delete $self->{'gs_sql'};
}
211
# Name of the per-document XML file written by this plugout: always
# "docsql.xml" rather than doc.xml. The $doc_obj argument is accepted but
# not consulted.
sub get_doc_xml_filename {
    my ($self, $doc_obj) = @_;

    my $filename = "docsql.xml";
    return $filename;
}
219
# Override that records the document's OID as an attribute of the top level
# element: <Archive docoid="oid">.
#
# Parameters:
#   $outhandle - file handle the header is printed to
#   $doc_oid   - document OID, written verbatim into the docoid attribute
#
# Prints the XML declaration, the Archive DOCTYPE and the opening Archive tag,
# one print per line, and returns the result of the final print.
sub output_xml_header {
    my ($self, $outhandle, $doc_oid) = @_;

    print {$outhandle} qq{<?xml version="1.0" encoding="utf-8" standalone="no"?>\n};
    print {$outhandle} qq{<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">\n};
    print {$outhandle} qq{<Archive docoid="$doc_oid">\n};
}
229
# Save one document: write its docsql.xml file via the superclass machinery
# and write the remaining content to the SQL database.
#
# Parameters:
#   $doc_obj - the document to save
#   $doc_dir - passed on to the superclass' pre/post saveas methods
#
# Returns whatever SUPER::post_saveas() returns.
#
# Note: arc-inf.db (which entries are to be deleted/indexed/reindexed) is not
# checked here. That is only relevant for indexing, not for this step, which
# only generates the content in the archives dir.
sub saveas {
    my $self = shift (@_);
    my ($doc_obj, $doc_dir) = @_;

    my $proc_mode = $self->{'process_mode'};

    # 1. Pre save out. The superclass (pre/post) saveas methods must be called,
    # as they handle assoc_files too.
    my ($docxml_outhandler, $output_file) = $self->SUPER::pre_saveas(@_);

    # Remember the handle as the debug handle when debugging (STDOUT if debug).
    if ($self->{'debug'}) {
        $self->{'debug_outhandle'} = $docxml_outhandler;
    }

    # 2. Write docsql.xml with the INVERSE of what goes into the SQL db:
    #   meta_only -> only text is written to docsql.xml
    #   text_only -> only meta is written to docsql.xml
    #   otherwise -> neither. Per the original notes, get_section_xml then
    #                leaves "breadcrumbs" pointing to the sql db in both the
    #                meta and txt elements.
    my %docxml_output_options = ('output' => docprint::OUTPUT_NONE);
    if ($proc_mode eq "meta_only") {
        $docxml_output_options{'output'} = docprint::OUTPUT_TEXT_ONLY;
    }
    elsif ($proc_mode eq "text_only") {
        $docxml_output_options{'output'} = docprint::OUTPUT_META_ONLY;
    }

    # Write out the doc xml file, "docsql.xml", for the current document.
    my $section_text = &docprint::get_section_xml($doc_obj, \%docxml_output_options);
    print $docxml_outhandler $section_text;

    # Write out whatever needs to go into the MySQL database.
    $self->write_meta_and_text($doc_obj);

    # 3. Post save out. The database connection stays open until end(), so it
    # is not opened and closed again for every doc of a single build.
    $self->SUPER::post_saveas($doc_obj, $doc_dir, $docxml_outhandler, $output_file);
}
283
284
# Write the meta and/or text of ONE document out to the db, starting at the
# document's top section and descending through all of its subsections.
# The same OID is used for every section, as a single doc is processed.
sub write_meta_and_text {
    my ($self, $doc_obj) = @_;

    return $self->recursive_write_meta_and_text(
        $doc_obj,
        $doc_obj->get_OID(),
        $doc_obj->get_top_section()
    );
}
294
# Write the metadata and/or text of one section of a document to the SQL
# database, then do the same for each of its subsections.
#
# Parameters:
#   $doc_obj - the document being saved
#   $doc_oid - the document's OID, stored with every row written
#   $section - section identifier; "" denotes the document's root section
#
# Reads $self->{'gs_sql'}, $self->{'process_mode'} and $self->{'debug'}.
# Returns "" if the section cannot be found in the document.
#
# (Reference kept from the original header, "Perl: Reading or Writing to
# Another Program": https://nnc3.com/mags/Perl3/cookbook/ch16_05.htm - it does
# not describe the current implementation, which goes through gssql.)
sub recursive_write_meta_and_text {
    my $self = shift (@_);
    my ($doc_obj, $doc_oid, $section) = @_;

    my $section_ptr = $doc_obj->_lookup_section ($section);
    return "" unless defined $section_ptr;

    # doc->get_top_section() is the name of the doc root section, which is "";
    # the tables store that section under the name "root" instead.
    my $section_name = ($section eq "") ? "root" : $section;

    my $gs_sql    = $self->{'gs_sql'};
    my $proc_mode = $self->{'process_mode'};

    if ($proc_mode eq "all" || $proc_mode eq "meta_only") {
        # One row in the metadata table per (name, value) pair of this section.
        foreach my $data (@{$section_ptr->{'metadata'}}) {
            my $meta_name = $data->[0];
            # TODO: does it need to be stored escaped, as it requires unescaping
            # when read back in from db (unlike for reading back in from doc.xml)
            my $escaped_meta_value = &docprint::escape_text($data->[1]);

            # Per the original notes, gssql executes a prepared insert
            # statement here, or only prints the statement if debug is set.
            $gs_sql->insert_row_into_metadata_table($doc_oid, $section_name, $meta_name, $escaped_meta_value, $self->{'debug'});
        }
    }

    if ($proc_mode eq "all" || $proc_mode eq "text_only") {
        my $section_textref = &docprint::escape_textref(\$section_ptr->{'text'});

        # fulltxt column can be SQL NULL. undef value gets written out as NULL:
        # https://stackoverflow.com/questions/12708633/which-one-represents-null-undef-or-empty-string
        # Per the original notes, this does the SQL insertion, or only prints
        # the SQL insert statement if debug is set.
        $gs_sql->insert_row_into_fulltxt_table($doc_oid, $section_name, $section_textref, $self->{'debug'});
    }

    # Output all subsections: RECURSIVE CALL
    foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
        $self->recursive_write_meta_and_text($doc_obj, $doc_oid, "$section.$subsection");
    }
}
349
350
3511;
Note: See TracBrowser for help on using the repository browser.