root/main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm @ 32563

Revision 32563, 15.3 KB (checked in by ak19, 9 months ago)

1. Overhaul of GreenstoneSQLPlugs to handle removeold and incremental delete correctly. The code now also automatically handles 'non-incremental delete' (see mention in ArchivesInfPlugin). The new version no longer lazily loads the SQL db connection in the GS SQL Plugin, because the connection now needs to be active from the start of the plugin in order to run SQL delete statements on remove_old. So the db connection code for the GS SQL plugin has moved back into its init() method. Lots of changes to gssql.pm (with some flow-on effects to the GS SQL Plugout), as the conditions under which database tables must exist or need to be created have changed. 2. Undoing most of the changes of changeset 32555, since incremental delete and removeold are now done differently, and in the correct way, when using the GreenstoneSQLPlugs.

Line 
1###########################################################################
2#
3# GreenstoneSQLPlugout.pm -- plugout module for writing all or some of the
4# Greenstone document format (metadata and/or fulltext) into a (My)SQL db.
5# The rest is then still written out by GreenstoneXMLPlugout as usual.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2006 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugout;
29
30use strict;
31no strict 'refs';
32no strict 'subs';
33
34use GreenstoneXMLPlugout;
35use docprint;
36use gssql;
37
38use DBI; # the central package for this plugout
39
40
41# TODO: SIGTERM rollback and disconnect?
42# TODO Q: what about verbosity for debugging, instead of current situation of printing out upon debug set at the expense of writing to db
43# TODO Q: introduced site_name param to plugins and plugouts. Did I do it right? And should they have hiddengli = "yes"
44
45# this plugout does not output the metadata and/or fulltxt xml to a file,
46# but outputs rows into a mysql table for metadata and/or a table for fulltxt
# Compile-time setup: declare GreenstoneXMLPlugout as this package's parent
# class so that method lookup (including SUPER:: calls) falls back to it.
BEGIN {
    @GreenstoneSQLPlugout::ISA = qw(GreenstoneXMLPlugout);
}
50
51# NOTTODO: die() statements need to be replaced with premature_termination
52# which should ensure the GreenstoneXMLPlugin (group)'s stuff is closed and cleaned up SOMEHOW
53# It's fine: the die() stmts all take place before setting up the super class' begin
54
55# TODO Q: about build_mode: how to detect removeold. Now handled by
56#   GreenstoneSQLPlugout::remove_all(), which is inherited from a base plugin.
57# TODO: deal with -removeold and everything? Or type out instructions for user
58
59# TODO Q: what is "group" in GreenstoneXMLPlugout?
60
# Permitted values of the process_mode option. Each entry pairs the value's
# name with a description placeholder of the form
# {GreenstoneSQLPlug.process_mode.<name>} (presumably resolved elsewhere
# against Greenstone's string resources -- confirm).
my $process_mode_list = [
    map { +{ 'name' => $_, 'desc' => "{GreenstoneSQLPlug.process_mode.$_}" } }
        qw(meta_only text_only all)
];

# The following are the saveas.options:
#   process_mode   - enum choosing what is written to the SQL db
#   db_driver, db_client_user, db_client_pwd, db_host
#                  - string options describing the database connection,
#                    all marked as required
my $arguments = [
    {
        'name'      => 'process_mode',
        'desc'      => '{GreenstoneSQLPlug.process_mode}',
        'type'      => 'enum',
        'list'      => $process_mode_list,
        'deft'      => 'all',
        'reqd'      => 'no',
        'hiddengli' => 'no',
    },

    # The connection options differ only in their name and default value.
    (
        map {
            my ($option_name, $default_value) = @{$_};
            +{
                'name' => $option_name,
                'desc' => "{GreenstoneSQLPlug.$option_name}",
                'type' => 'string',
                'deft' => $default_value,
                'reqd' => 'yes',
            };
        } (
            [ 'db_driver',      'mysql' ],
            [ 'db_client_user', 'root' ],
            [ 'db_client_pwd',  '' ],          # pwd required?
            [ 'db_host',        '127.0.0.1' ],
        )
    ),
];

# Description of this plugout itself; 'args' refers to the option list above.
my $options = {
    'name'     => 'GreenstoneSQLPlugout',
    'desc'     => '{GreenstoneSQLPlugout.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
105
# Constructor.
#
# Parameters (after the class name):
#   $plugoutlist     - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructor
#   $hashArgOptLists - hash ref; this plugout's argument descriptions are
#                      appended to its "ArgList" entry and its option
#                      description to its "OptList" entry
#
# Returns the object built by GreenstoneXMLPlugout's constructor, re-blessed
# into $class.
sub new {
    my ($class) = shift (@_);
    my ($plugoutlist, $inputargs, $hashArgOptLists) = @_;
    push(@$plugoutlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->new() rather than the indirect-object form
    # "new Class(...)": this package defines its own new(), which makes the
    # indirect form depend on the parser's heuristics.
    my $self = GreenstoneXMLPlugout->new($plugoutlist, $inputargs, $hashArgOptLists);

    # No option handling is done here, so the result is the same whether or
    # not $self->{'info_only'} is set. The database connection is set up in
    # begin(), not at construction time.
    return bless $self, $class;
}
124
# Connect to the SQL database and prepare everything saveas() relies on.
#
# Steps, in order:
#   1. connect to the db server using the db_driver, db_client_user,
#      db_client_pwd and db_host options;
#   2. select the database (named after site_name, or "greenstone2" when no
#      site_name is set) and ensure the tables needed for the current
#      process_mode exist;
#   3. prepare the reusable INSERT statement handles for the tables being
#      written and verify that each prepare produced a handle object;
#   4. store the gssql instance in $self->{'gs_sql'} and call the
#      superclass' begin() with the remaining arguments.
#
# Dies if connecting, selecting the db/tables or preparing a statement
# fails. Every die() happens before SUPER::begin() is called, so the
# superclass has nothing to clean up at that point.
# Returns whatever SUPER::begin() returns.
sub begin {
    my $self = shift (@_);

    my $proc_mode = $self->{'process_mode'};
    print STDERR "***** GreenstoneSQLPlugout process mode = \"", $proc_mode, "\"\n";

    # Explicit Class->new(): the indirect-object form "new gssql(...)" is
    # ambiguous inside a package that defines its own new().
    my $gs_sql = gssql->new({
        'collection_name' => $ENV{'GSDLCOLLECTION'},
        'verbosity'       => 1
    });

    # Try connecting to the db. A failure is fatal for the plugout, so don't
    # bother preparing GreenstoneXMLPlugout by calling the superclass' begin().
    my $connected = $gs_sql->connect_to_db({
        'db_driver'      => $self->{'db_driver'},
        'db_client_user' => $self->{'db_client_user'},
        'db_client_pwd'  => $self->{'db_client_pwd'},
        'db_host'        => $self->{'db_host'}
    });
    if (!$connected) {
        # PrintError would already have displayed the warning message on connection fail
        die("Could not connect to db. Can't proceed.\n");
    }

    # One database per GS3 site; for GS2 the db is called greenstone2.
    my $db_name = $self->{'site_name'} || "greenstone2";

    my $success = $gs_sql->use_db($db_name);
    if ($success && $proc_mode ne "text_only") {
        $success = $gs_sql->ensure_meta_table_exists();
    }
    if ($success && $proc_mode ne "meta_only") {
        $success = $gs_sql->ensure_fulltxt_table_exists();
    }
    if (!$success) {
        # Fatal for the plugout: terminate here after disconnecting again.
        # PrintError would already have displayed the warning message on load fail
        $gs_sql->disconnect_from_db()
            || warn("Unable to disconnect from database.\n");
        die("Could not use db $db_name and/or prepare its tables. Can't proceed.\n");
    }

    # Prepare the shared HANDLES to SQL insert statements that contain
    # placeholders; they are reused for every row that gets inserted.
    my @prepared_handle_keys;
    if ($proc_mode eq "all" || $proc_mode eq "meta_only") {
        $self->{'metadata_prepared_insert_statement_handle'} = $gs_sql->prepare_insert_metadata_row_stmthandle();
        push(@prepared_handle_keys, 'metadata_prepared_insert_statement_handle');
    }
    if ($proc_mode eq "all" || $proc_mode eq "text_only") {
        $self->{'fulltxt_prepared_insert_statement_handle'} = $gs_sql->prepare_insert_fulltxt_row_stmthandle();
        push(@prepared_handle_keys, 'fulltxt_prepared_insert_statement_handle');
    }

    # A handle is later used as an object (->execute(), ->{'Statement'}), so
    # anything that is not a reference means the prepare failed. Fail loudly
    # now: a false value would otherwise make the per-document writing code
    # silently skip that table.
    # NOTE(review): assumes the gssql prepare_* methods return a statement
    # handle object on success -- confirm against gssql.pm.
    foreach my $handle_key (@prepared_handle_keys) {
        next if ref($self->{$handle_key});

        $gs_sql->disconnect_from_db()
            || warn("Unable to disconnect from database.\n");
        die("Could not prepare SQL insert statement ($handle_key). Can't proceed.\n");
    }

    # store the DBI wrapper instance
    $self->{'gs_sql'} = $gs_sql;

    # Setting up the sql db succeeded (otherwise we'd have terminated above),
    # so finally call begin on super - important as doc.xml is opened as a
    # group etc.
    $self->SUPER::begin(@_);
}
210
# Finish the build: run the superclass' end() and then disconnect from the
# database (see inexport.pm).
#
# The superclass cleanup runs first so that no sql db failure can prevent
# it. If no gssql instance was stored by begin(), there is nothing to
# disconnect and the method returns quietly instead of dying on an
# undefined value.
sub end
{
    my $self = shift(@_);

    # do the superclass stuff first, as any sql db failures should not prevent superclass cleanup
    $self->SUPER::end(@_);

    my $gs_sql = $self->{'gs_sql'};
    return unless defined $gs_sql;

    # Name the database the same way begin() does: site_name may be unset,
    # in which case the db in use is "greenstone2".
    my $db_name = $self->{'site_name'} || "greenstone2";

    # disconnect_from_db() will also issue a warning, but this may be clearer
    $gs_sql->disconnect_from_db() || warn("Unable to disconnect from database " . $db_name . "\n");
}
221
# Name of the per-document XML file written by this plugout: always
# "docsql.xml" (rather than "doc.xml"). The document object argument is
# accepted but not consulted.
sub get_doc_xml_filename {
    my ($self, $doc_obj) = @_;

    return "docsql.xml";
}
229
# Override that stores the doc OID as an attribute of the top level element.
# Writes three lines to $outhandle: the XML declaration, the Archive DOCTYPE
# and the opening <Archive docoid="oid"> tag. The OID is interpolated as
# given, without any escaping.
sub output_xml_header {
    my ($self, $outhandle, $doc_oid) = @_;

    print {$outhandle} qq{<?xml version="1.0" encoding="utf-8" standalone="no"?>\n};
    print {$outhandle} qq{<!DOCTYPE Archive SYSTEM "http://greenstone.org/dtd/Archive/1.0/Archive.dtd">\n};
    print {$outhandle} qq{<Archive docoid="$doc_oid">\n};
}
239 
240# TODO: check arc-inf.db for whether each entry is to be deleted/indexed/reindexed/been indexed?
241# That's only for indexing, not for this step which only generates the content in archives dir
sub saveas {
    # Save one document: write its docsql.xml file through the superclass'
    # pre/post saveas machinery and write the rest to the SQL database.
    #
    # Arguments: the document object and the document directory; both are
    # handed on to the superclass' pre_saveas()/post_saveas().
    # Returns whatever SUPER::post_saveas() returns.
    my $self = shift (@_);
    my ($doc_obj, $doc_dir) = @_;

    my $process_mode = $self->{'process_mode'};

    # 1. Pre save out. The superclass' (pre/post) saveas methods must be
    # called, as they handle assoc_files too.
    my ($xml_handle, $output_file) = $self->SUPER::pre_saveas(@_);

    # When debugging, the SQL statements get printed to this same handle
    # instead of being executed (STDOUT if debug).
    if ($self->{'debug'}) {
        $self->{'debug_outhandle'} = $xml_handle;
    }

    # 2. The doc xml file gets the INVERSE of what is written to the SQL db:
    #      meta_only -> only the text goes into the doc xml file
    #      text_only -> only the meta goes into the doc xml file
    #      all       -> neither; only "breadcrumbs" pointing the viewer to
    #                   the sql db go into both meta and txt elements
    my $docxml_output_options = { 'output' => docprint::OUTPUT_NONE };
    $docxml_output_options->{'output'} = docprint::OUTPUT_TEXT_ONLY if $process_mode eq "meta_only";
    $docxml_output_options->{'output'} = docprint::OUTPUT_META_ONLY if $process_mode eq "text_only";

    # Write out the doc xml file, "docsql.xml", for the current document ...
    my $docxml_body = &docprint::get_section_xml($doc_obj, $docxml_output_options);
    print {$xml_handle} $docxml_body;

    # ... and whatever needs to go into the MySQL database.
    $self->write_meta_and_text($doc_obj);

    # 3. Post save out. The database connection stays open: it is closed in
    # end(), so we don't open and close it over and over for each doc during
    # a single build.
    $self->SUPER::post_saveas($doc_obj, $doc_dir, $xml_handle, $output_file);
}
293
294
# Write the meta and/or text of ONE document out to the DB, starting at the
# document's top section and descending through all of its subsections.
sub write_meta_and_text {
    my ($self, $doc_obj) = @_;

    # A single doc is processed per call, so the same OID is used throughout.
    my $oid         = $doc_obj->get_OID();
    my $top_section = $doc_obj->get_top_section();

    # The prepared INSERT statement handles for the two tables. Either can be
    # undef, depending on whether meta_only or text_only is set.
    my $meta_sth = $self->{'metadata_prepared_insert_statement_handle'};
    my $text_sth = $self->{'fulltxt_prepared_insert_statement_handle'};

    return $self->recursive_write_meta_and_text($doc_obj, $oid, $top_section, $meta_sth, $text_sth);
}
308
309# Perl: Reading or Writing to Another Program
310# https://nnc3.com/mags/Perl3/cookbook/ch16_05.htm
sub recursive_write_meta_and_text {
    # Write the metadata and/or full text of one section to the database,
    # then do the same for each of its subsections.
    #
    # Arguments:
    #   $doc_obj            - the document being saved
    #   $doc_oid            - the document's OID, identical for all sections
    #   $section            - identifier of the section to write ("" = top)
    #   $metadata_table_sth - prepared metadata INSERT handle, or a false
    #                         value when metadata is not being written
    #   $fulltxt_table_sth  - prepared fulltxt INSERT handle, or a false
    #                         value when full text is not being written
    #
    # In debug mode the statements are printed to the debug handle rather
    # than executed. Returns "" when the section cannot be looked up.
    my ($self, $doc_obj, $doc_oid, $section, $metadata_table_sth, $fulltxt_table_sth) = @_;

    # doc->get_top_section() is the name of the doc root section, which is "".
    # It is written into the tables under the section name "root".
    my $section_name = $section;
    $section_name = "root" if $section eq "";

    my $section_ptr = $doc_obj->_lookup_section ($section);
    if (!defined $section_ptr) {
        return "";
    }

    my $debugging = $self->{'debug'};
    my $debug_out = $self->{'debug_outhandle'};

    # Metadata: one row per (name, value) pair of the current section.
    if ($metadata_table_sth) {
        foreach my $meta_pair (@{$section_ptr->{'metadata'}}) {
            my $meta_name = $meta_pair->[0];
            # TODO: does it need to be stored escaped, as it requires unescaping
            # when read back in from db (unlike for reading back in from doc.xml)
            my $escaped_meta_value = &docprint::escape_text($meta_pair->[1]);

            if ($debugging) {
                # just print the statement we were going to execute
                print {$debug_out} $metadata_table_sth->{'Statement'} . "($doc_oid, $section_name, $meta_name, $escaped_meta_value)\n";
                next;
            }

            # Fill the placeholders of the prepared statement and run it.
            # Execution failure will print out info anyway: since db connection sets PrintError
            $metadata_table_sth->execute($doc_oid, $section_name, $meta_name, $escaped_meta_value)
                || warn ("Unable to write metadata row to db:\n\tOID $doc_oid, section $section_name,\n\tmeta name: $meta_name, val: $escaped_meta_value");
        }
    }

    # Full text: one row for the current section.
    if ($fulltxt_table_sth) {
        if (!$debugging) {
            # fulltxt column can be SQL NULL. undef value gets written out as NULL:
            # https://stackoverflow.com/questions/12708633/which-one-represents-null-undef-or-empty-string
            my $section_text = &docprint::escape_text($section_ptr->{'text'});

            # Execution failure will print out info anyway: since db connection sets PrintError
            $fulltxt_table_sth->execute($doc_oid, $section_name, $section_text)
                || warn ("Unable to write fulltxt row to db for row:\n\tOID $doc_oid, section $section_name");
        }
        else {
            # just print the statement we were going to execute, minus the fulltxt value
            my $txt_repr = "NULL";
            $txt_repr = "<TXT>" if $section_ptr->{'text'};
            print {$debug_out} $fulltxt_table_sth->{'Statement'} . "($doc_oid, $section_name, $txt_repr)\n";
        }
    }

    # RECURSIVE CALL for every subsection, in the section's recorded order
    foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
        $self->recursive_write_meta_and_text($doc_obj, $doc_oid, "$section.$subsection", $metadata_table_sth, $fulltxt_table_sth);
    }
}
379
380
3811;
Note: See TracBrowser for help on using the browser.