root/main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm @ 32537

Revision 32537, 13.6 KB (checked in by ak19, 9 months ago)

First commit to do with reading back in from the SQL DB. This commit introduces the new GreenstoneSQLPlugin for this purpose, which should ideally only be used during buildcol (but its init(), deinit() and read() methods are also called on import.pl). The new plugin works with GreenstoneSQLPlugout which wrote meta and txt to the SQL DB. Lots of TODOs and questions still here, some debug statements too. Also have to run some decisions by Dr Bainbridge. There are many hardcoded values which still have to be parameterised (not always completely sure how) and still have to test the 2 cases of sending just meta and just fulltxt to db. Next commit will tidy some things up.

Line 
1###########################################################################
2#
3# GreenstoneSQLPlugout.pm -- plugout module for writing all or some the
4# Greenstone document format (metadata and/or fulltext) into a (My)SQL db.
5# The rest is then still written out by GreenstoneXMLPlugout as usual.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2006 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugout;
29
30use strict;
31no strict 'refs';
32no strict 'subs';
33
34use GreenstoneXMLPlugout;
35use docprint;
36use gssql;
37
38use DBI; # the central package for this plugout
39
40
41# TODO: SIGTERM rollback and disconnect?
42# TODO Q: what about verbosity for debugging
43
44# this plugout does not output xml to a file, but outputs rows into a mysql table
45sub BEGIN {
46    @GreenstoneSQLPlugout::ISA = ('GreenstoneXMLPlugout');
47}
48
49# NOTTODO: die() statements need to be replaced with premature_termination
50# which should ensure the GreenstoneXMLPlugin (group)'s stuff is closed and cleaned up SOMEHOW
51# It's fine: the die() stmts all take place before setting up the super class' begin
52
53# TODO: deal with -removeold and everything? Or type out instructions for user
54
55# TODO Q: what is "group" in GreenstoneXMLPlugout?
56# TODO Q: site_name only exists for GS3. What about GS2?
57
58my $process_mode_list =
59    [ { 'name' => "meta_only",
60        'desc' => "{GreenstoneSQLPlug.process_mode.meta_only}" },     
61      { 'name' => "text_only",
62        'desc' => "{GreenstoneSQLPlug.process_mode.text_only}" },
63      { 'name' => "all",
64        'desc' => "{GreenstoneSQLPlug.process_mode.all}" } ];
65
66my $arguments = [
67       { 'name' => "process_mode",
68     'desc' => "{GreenstoneSQLPlug.process_mode}",
69     'type' => "enum",
70     'list' => $process_mode_list,
71     'deft' => "all",
72     'reqd' => "no",
73     'hiddengli' => "no"} ];
74
75my $options = { 'name'     => "GreenstoneSQLPlugout",
76        'desc'     => "{GreenstoneSQLPlugout.desc}",
77        'abstract' => "no",
78        'inherits' => "yes",
79        'args'     => $arguments };
80
81sub new {
82    my ($class) = shift (@_);
83    my ($plugoutlist, $inputargs,$hashArgOptLists) = @_;
84    push(@$plugoutlist, $class);
85
86    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
87    push(@{$hashArgOptLists->{"OptList"}},$options);
88
89    my $self = new GreenstoneXMLPlugout($plugoutlist,$inputargs,$hashArgOptLists);
90   
91    if ($self->{'info_only'}) {
92        # don't worry about any options etc
93        return bless $self, $class;
94    }
95    print STDERR "***** GreenstoneSQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n";
96   
97    return bless $self, $class;
98}
99
100# connect here and ensure all tables and databases exist
101sub begin {
102
103    my $self= shift (@_);
104
105    ########### TODO: these should be set from cmdline/GLI options to plugout #########
106    $self->{'db_driver'} = "mysql";
107    $self->{'site_name'} = "localsite";   
108    $self->{'db_client_user'} = "root";
109    $self->{'db_client_pwd'} = "6reenstone3";
110    $self->{'build_mode'} = "removeold";
111    #$self->{'db_host'} = "127.0.0.1";
112    #$self->{'db_encoding'} = "utf8";
113    #TODO: proc_mode is also a saveas option   
114   
115    ############ LOAD NECESSARY OPTIONS ###########
116    print STDERR "########## COLLECTION: ". $ENV{'GSDLCOLLECTION'}."\n";
117    #$self->{'collection_name'} = $ENV{'GSDLCOLLECTION'};
118    print STDERR "***** GreenstoneSQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n";
119
120    my $db_params = {
121    'collection_name' => $ENV{'GSDLCOLLECTION'},   
122    'db_encoding' => $self->{'db_encoding'}
123    #'db_name' => $self->{'site_name'},
124    #'build_mode' => $self->{'build_mode'},
125    };
126
127    my $gs_sql = new gssql($db_params);
128   
129    # try connecting to the mysql db, if that fails it will die
130    # so don't bother preparing GreenstoneXMLPlugout by calling superclass' begin()
131    if(!$gs_sql->connect_to_db({
132    'db_driver' => $self->{'db_driver'},
133    'db_client_user' => $self->{'db_client_user'},
134    'db_client_pwd' => $self->{'db_client_pwd'},
135    'db_host' => $self->{'db_host'}
136                   })
137    )
138    {
139    # This is fatal for the plugout, let's terminate here
140    # PrintError would already have displayed the warning message on connection fail   
141    die("Could not connect to db. Can't proceed.\n");
142    }
143   
144    my $db_name = $self->{'site_name'} || "localsite"; # one database per GS3 site
145    my $build_mode = $self->{'build_mode'} || "removeold";
146    if(!$gs_sql->load_db_and_tables($db_name, $build_mode)) {
147   
148    # This is fatal for the plugout, let's terminate here after disconnecting again
149    # PrintError would already have displayed the warning message on load fail
150    $gs_sql->disconnect_from_db()
151        || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n");
152    die("Could not use db $db_name and/or prepare its tables. Can't proceed.\n");
153    }
154
155    # prepare the shared/common HANDLES to SQL insert statements that contain placeholders
156    # and which we will reuse repeatedly when actually executing the insert statements
157    my $proc_mode = $self->{'process_mode'};
158    if($proc_mode eq "all" || $proc_mode eq "meta_only" ) {
159    $self->{'metadata_prepared_insert_statement_handle'} = $gs_sql->prepare_insert_metadata_row_stmthandle();
160    }
161    if($proc_mode eq "all" || $proc_mode eq "text_only" ) {
162    $self->{'fulltxt_prepared_insert_statement_handle'} = $gs_sql->prepare_insert_fulltxt_row_stmthandle();
163    }
164
165    # store the DBI wrapper instance
166    $self->{'gs_sql'} = $gs_sql;
167   
168    print STDERR "#### Meta stmt: " . $self->{'metadata_prepared_insert_statement_handle'}->{'Statement'} . "\n";
169    print STDERR "#### Full stmt: " . $self->{'fulltxt_prepared_insert_statement_handle'}->{'Statement'} . "\n";
170   
171    # if setting up to work with sql db failed, we'd have terminated and wouldn't come up to here:
172    # won't bother preparing GreenstoneXMLPlugout by calling superclass' begin()
173    # finally, can call begin on super - important as doc.xml is opened as a group etc
174   
175    $self->SUPER::begin(@_);
176}
177
178# disconnect from database here, see inexport.pm
179sub end
180{
181    my $self = shift(@_);
182
183    # do the superclass stuff first, as any sql db failures should not prevent superclass cleanup
184    $self->SUPER::end(@_);   
185   
186    $self->{'gs_sql'}->disconnect_from_db() || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n"); # disconnect_from_db() will also issue a warning, but this may be clearer
187}
188
189# produce files called docsql-<OID>.xml instead of doc.xml
190sub get_doc_xml_filename {
191    my $self = shift (@_);
192    my ($doc_obj) = @_;
193   
194    my $doc_id = $doc_obj->get_OID();
195    my $filename = $doc_id ? "docsql-".$doc_id.".xml" : "docsql.xml";
196    return $filename;
197}
198   
199# TODO: check arc-inf.db for whether each entry is to be deleted/indexed/reindexed/been indexed
200sub saveas {
201    my $self = shift (@_);
202    my ($doc_obj, $doc_dir) = @_;
203
204#    print STDERR "\n\n@@@ In saveas\n\n";
205   
206    my $proc_mode = $self->{'process_mode'};
207   
208    # 1. pre save out and saving debug handle
209
210    # must call superclass (pre/post) saveas methods, as they handle assoc_files too
211    my ($docxml_outhandler, $output_file) = $self->SUPER::pre_saveas(@_);
212
213    $self->{'debug_outhandle'} = $docxml_outhandler if ($self->{'debug'}); # STDOUT if debug
214
215    # TODO: also set debugging in begin()? Then stmts creating db and tables also sent to debug out and not executed
216
217    # TODO: remove unused old_unused_saveas from GreenstoneXMLPlugout
218   
219   
220    # 2. overriding saving behaviour to do what the superclass does PLUS saving to sql db   
221
222    #NOTE: if proc_mode == all, then "breadcrumbs" go into both meta and txt elements of doc.xml:
223    # statements pointing viewer to the sql db for contents
224   
225    # write the INVERSE into doc.xml as to what is written to the db   
226    my $docxml_output_options = { 'output' => docprint::OUTPUT_NONE };
227    if($proc_mode eq "meta_only" ) { # since only meta to go into MySQL db, text will go into docxml
228    $docxml_output_options->{'output'} = docprint::OUTPUT_TEXT_ONLY;
229    } elsif($proc_mode eq "text_only" ) { # since only full text to go into MySQL db, meta will go into docxml
230    $docxml_output_options->{'output'} = docprint::OUTPUT_META_ONLY;
231    }
232   
233    # now we've prepared to write out whatever is meant to go into docxml
234    # and can do actual the steps superclass GreenstoneXMLPlugout carries out to write out docxml
235    # So: write out the doc xml file for the current document
236    my $section_text = &docprint::get_section_xml($doc_obj, $docxml_output_options);
237    print $docxml_outhandler $section_text;   
238   
239   
240    # We also write out whatever needs to go into the MySQL database
241    $self->write_meta_and_text($doc_obj);
242
243   
244    # 3. post save out
245    #$self->SUPER::post_saveas(@_);
246    $self->SUPER::post_saveas($doc_obj, $doc_dir, $docxml_outhandler, $output_file);
247   
248   
249    # database connection is closed in end() method
250    # so we don't open and close over and over for each doc during a single build
251}
252
253
254# write meta and/or text PER DOC out to DB
255sub write_meta_and_text {
256    my $self = shift (@_);
257    my ($doc_obj) = @_;
258    my $doc_oid = $doc_obj->get_OID(); # this method processes a single doc at a time, so it uses the same OID throughout
259    my $root_section = $doc_obj->get_top_section();
260
261    # load the prepared INSERT statement handles for both tables (can be undef for any table depending on whether meta_only or txt_only are set)
262    my $metadata_table_sth = $self->{'metadata_prepared_insert_statement_handle'};
263    my $fulltxt_table_sth = $self->{'fulltxt_prepared_insert_statement_handle'};
264   
265    $self->recursive_write_meta_and_text($doc_obj, $doc_oid, $root_section, $metadata_table_sth, $fulltxt_table_sth);
266}
267
268# Perl: Reading or Writing to Another Program
269# https://nnc3.com/mags/Perl3/cookbook/ch16_05.htm
270sub recursive_write_meta_and_text {
271    my $self = shift (@_);
272    my ($doc_obj, $doc_oid, $section, $metadata_table_sth, $fulltxt_table_sth) = @_;   
273
274    # If section=ROOT, write "root" as section name into table
275    # doc->get_top_section() is the name of the doc root section, which is ""
276    my $section_name = ($section eq "") ? "root" : $section;
277   
278    my $section_ptr = $doc_obj->_lookup_section ($section);
279    return "" unless defined $section_ptr;
280
281    my $debug_out = $self->{'debug_outhandle'};
282#    print STDERR "#### Meta stmt: " . $metadata_table_sth->{'Statement'} . "\n";
283#    print STDERR "#### Full stmt: " . $fulltxt_table_sth->{'Statement'} . "\n";
284   
285    #my $proc_mode = $self->{'process_mode'};
286    #if($proc_mode eq "all" || $proc_mode eq "meta_only" ) {
287    if($metadata_table_sth) { # meta insert statement handle will be undef if not writing meta
288   
289    foreach my $data (@{$section_ptr->{'metadata'}}) {
290        my $meta_name = $data->[0];
291        # TODO: does it need to be stored escaped, as it requires unescaping when read back in
292        # from db (unlike for reading back in from doc.xml)
293        my $escaped_meta_value = &docprint::escape_text($data->[1]);
294
295        # Write out the current section's meta to collection db's METADATA table       
296       
297        # for each set of values to write to meta table, execute the prepared statement, filling in the values
298
299        if($self->{'debug'}) {
300        # just print the statement we were going to execute
301
302        print $debug_out $metadata_table_sth->{'Statement'} . "($doc_oid, $section_name, $meta_name, $escaped_meta_value)\n";
303        }
304        else {
305       
306        $metadata_table_sth->execute($doc_oid, $section_name, $meta_name, $escaped_meta_value)
307            || warn ("Unable to write metadata row to db:\n\tOID $doc_oid, section $section_name,\n\tmeta name: $meta_name, val: $escaped_meta_value");
308        # Execution failure will print out info anyway: since db connection sets PrintError
309        }
310    }
311    }
312   
313    #if($proc_mode eq "all" || $proc_mode eq "text_only" ) {
314    if($fulltxt_table_sth) { # fulltxt insert statement handle will be undef if not writing fulltxt
315
316    if($self->{'debug'}) {
317        # just print the statement we were going to execute, minus the fulltxt value
318        my $txt_repr = $section_ptr->{'text'} ? "<TXT>" : "NULL";
319        print $debug_out $fulltxt_table_sth->{'Statement'} . "($doc_oid, $section_name, $txt_repr)\n";
320    } else {
321        my $section_text = &docprint::escape_text($section_ptr->{'text'});
322       
323        # fulltxt column can be SQL NULL. undef value gets written out as NULL:
324        # https://stackoverflow.com/questions/12708633/which-one-represents-null-undef-or-empty-string
325       
326        # Write out the current section's text to collection db's FULLTeXT table
327        $fulltxt_table_sth->execute($doc_oid, $section_name, $section_text)
328        || warn ("Unable to write fulltxt row to db for row:\n\tOID $doc_oid, section $section_name");
329        # Execution failure will print out info anyway: since db connection sets PrintError
330    }
331    }
332   
333    # output all subsections: RECURSIVE CALL
334    foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
335    $self->recursive_write_meta_and_text($doc_obj, $doc_oid, "$section.$subsection", $metadata_table_sth, $fulltxt_table_sth);
336    }
337}
338
339
3401;
Note: See TracBrowser for help on using the browser.