root/main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm @ 32544

Revision 32544, 17.6 KB (checked in by ak19, 9 months ago)

1. GreenstoneSQLPlugin: now sub read() calls the new lazy_get_gssql() method to only connect the first time and only if buildcol (when the processor var is a buildproc), so we don't connect in init() anymore as that got called during import.pl as well as buildcol, whereas we only want to do connect/disconnect from DB stuff in this plugIN during buildcol.pl. 2. GSSQLPlugin and gssql now can delete entries from the SQL database. This can't be tested to work properly until GS SQL PlugOUT doesn't always assume removeold. At present it always does removeold: clearing the db even when incremental-import.pl is called. When there are no NEW docs in inc-import, the tables are cleared and nothing new is added, so can't test incremental behaviour during buildcol.pl/GS SQL PlugIN then either.

Line 
1###########################################################################
2#
3# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml
4# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
5# the docsql .xml files.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugin;
29
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use DBI;
35use docprint; # for new unescape_text() subroutine
36use GreenstoneXMLPlugin;
37use gssql;
38
39
40# TODO:
41# - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge.
42# + Ask about docsql naming convention adopted to identify OID. Better way?
43# collection names -> table names: it seems hyphens not allowed. Changed to underscores.
44# + Startup parameters (except removeold/build_mode)
45# - how do we detect we're to do removeold during plugout in import.pl phase
46# - incremental building: where do we need to add code to delete rows from our sql table after
47# incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta?
48# - "Courier" demo documents in lucene-sql collection: character (degree symbol) not preserved in title. Is this because we encode in utf8 when putting into db and reading back in?
49# - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that
50# back in from the sql db while the remainder is to be read back in from the docsql .xml files.
51# - Ask if I can assume that all SQL dbs (not just MySQL) will preserve the order of inserted nodes
52# (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order
53
54# TODO: deal with incremental vs removeold. If docs removed from import folder, then import step
55# won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is
56
57# TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking, basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work.
58# Discuss the plugin/plugout parameters.
59
60# TODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt?
61
62# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or fulltext
63# is still written out to doc.xml (docsql .xml), that will be processed as usual,
64# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
65# is written out by GreenstoneSQLPlugout into the SQL db).
66
67
BEGIN {
    # Set up inheritance at compile time: this plugin is a GreenstoneXMLPlugin
    # that additionally pulls metadata and/or full text out of the SQL db.
    @GreenstoneSQLPlugin::ISA = qw(GreenstoneXMLPlugin);
}
71
72# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin
73# So we won't have a process exp conflict here.
74# The structure of docsql.xml files is identical to doc.xml and the contents are similar except:
75#   - since metadata and/or fulltxt are stored in mysql db instead, just XML comments indicating
76#   this are left inside docsql.xml within the <Description> (for meta) and/or <Content> (for txt)
77#   - the root element Archive now has a docoid attribute: <Archive docoid="OID">
sub get_default_process_exp {
    my ($self) = @_;

    # Case-insensitive match on file names ending in docsql.xml or
    # docsql-<digits>.xml (regex modelled on the same method in
    # GreenstoneXMLPlugin).
    # The looser form q^(?i)docsql(-.+)?\.xml$^ is no longer needed, since the
    # OID is no longer embedded in the docsql .xml filename.
    my $docsql_filename_exp = q^(?i)docsql(-\d+)?\.xml$^;

    return $docsql_filename_exp;
}
84
# Values accepted by the process_mode option: which parts of a document
# (metadata, full text, or both) this plugin reads back from the SQL database.
my $process_mode_list = [
    {
        'name' => "meta_only",
        'desc' => "{GreenstoneSQLPlug.process_mode.meta_only}",
    },
    {
        'name' => "text_only",
        'desc' => "{GreenstoneSQLPlug.process_mode.text_only}",
    },
    {
        'name' => "all",
        'desc' => "{GreenstoneSQLPlug.process_mode.all}",
    },
];

# Plugin-specific options. The db_* settings are the ones handed to gssql when
# lazy_get_gssql() opens the database connection.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "process_mode",
        'desc' => "{GreenstoneSQLPlug.process_mode}",
        'type' => "enum",
        'list' => $process_mode_list,
        'deft' => "all",
        'reqd' => "no",
    },
    {
        'name' => "db_driver",
        'desc' => "{GreenstoneSQLPlug.db_driver}",
        'type' => "string",
        'deft' => "mysql",
        'reqd' => "yes",
    },
    {
        'name' => "db_client_user",
        'desc' => "{GreenstoneSQLPlug.db_client_user}",
        'type' => "string",
        'deft' => "root",
        'reqd' => "yes",
    },
    {
        # pwd required?
        'name' => "db_client_pwd",
        'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
        'type' => "string",
        'deft' => "",
        'reqd' => "yes",
    },
    {
        'name' => "db_host",
        'desc' => "{GreenstoneSQLPlug.db_host}",
        'type' => "string",
        'deft' => "127.0.0.1",
        'reqd' => "yes",
    },
    {
        'name' => "db_encoding",
        'desc' => "{GreenstoneSQLPlug.db_encoding}",
        'type' => "string",
        'deft' => "utf8",
        'reqd' => "yes",
    },
];

# Plugin description record registered with the option-parsing machinery in new().
my $options = {
    'name'     => "GreenstoneSQLPlugin",
    'desc'     => "{GreenstoneSQLPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
137
138
139# TODO: For on cancel, add a SIGTERM handler or so to call end()
140# or to explicitly call gs_sql->close_connection if $gs_sql def
141
# Constructor.
#   $pluginlist      - array ref of plugin class names; this class is appended
#   $inputargs       - passed through untouched to the superclass constructor
#   $hashArgOptLists - hash ref whose "ArgList" / "OptList" arrays collect the
#                      argument and option descriptions of every class in the
#                      inheritance chain
# Returns the new plugin object, blessed into $class.
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # Register this class and its options before the superclass gets to see them.
    push(@{$pluginlist}, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # Let GreenstoneXMLPlugin build the object, then re-bless it into our class.
    my $self = bless(
        GreenstoneXMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists),
        $class
    );

    # If running pluginfo, we don't need to go further.
    return $self if $self->{'info_only'};

    # do anything else that needs to be done here when not pluginfo
    #$self->{'delete_docids'} = (); # list of doc oids to delete during deinit()

    return $self;
}
165
# XML start-tag handler.
#   $expat   - the parser object (only passed on to the superclass)
#   $element - name of the element that has just been opened
# Records the element name in $self->{'element'}. For the root <Archive>
# element the docoid attribute is stored in $self->{'doc_oid'}; every other
# element is delegated to the superclass handler.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    $self->{'element'} = $element;

    # let superclass GreenstoneXMLPlugin continue to process <Section> and
    # <Metadata> elements
    return $self->SUPER::xml_start_tag(@_) unless $element eq "Archive";

    # docsql.xml files contain an OID attribute on the Archive element.
    # The element's attributes are in %_ as per ReadXMLFile::xml_start_tag()
    # (while $_ contains the tag). Copy the hash first instead of indexing %_
    # directly: $_{'docoid'} reads confusingly next to $_ (the tag), and
    # %_{'docoid'} draws a warning. See OAIPlugin.pm for the same approach.
    my %attributes = %_;
    $self->{'doc_oid'} = $attributes{'docoid'};

    my $outhandle = $self->{'outhandle'};
    print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n"
        if $self->{'verbosity'} > 2;
}
190
191# TODO Q: Why are there 3 passes when we're only indexing at doc and section level (2 passes)?
192
193# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
194# the doc_obj in memory is processed (indexed) and then made undef.
195# So we have to work with doc_obj before superclass close_document() is finished.
# Called when the current docsql.xml document has been fully parsed.
# Tops up the in-memory doc_obj with whatever was stored in the SQL database
# for this document (metadata and/or full text, depending on process_mode),
# then hands over to the superclass close_document(), which finishes
# processing the doc_obj.
#
# Returns nothing (and does NOT call the superclass) when no database
# connection is available; otherwise returns whatever the superclass returns.
sub close_document {
    my $self = shift(@_);

    # sub read() makes the db connection, setting $self->{'gs_sql'} once: the
    # first time read() is called on the GS SQLPlugin instance during
    # buildcol.pl. We deliberately don't call lazy_get_gssql() here, as
    # close_document() may also run during (incr-)import.pl, when we must not
    # touch the database.
    # TODO: return statement skips "dummy" pass. Should we skip it or not?
    # If we don't return, gs_sql is not set for dummy pass...
    my $gs_sql = $self->{'gs_sql'};
    return unless $gs_sql;

    my $outhandle = $self->{'outhandle'};
    my $doc_obj   = $self->{'doc_obj'};
    my $verbose   = ($self->{'verbosity'} > 2);

    # we stored current doc's OID during sub xml_start_tag()
    my $oid = $self->{'doc_oid'};
    print $outhandle "++++ OID of document (meta|text) to be read in from DB: $oid\n"
        if $verbose;

    # For now, we have access to doc_obj (until just before
    # super::close_document() terminates).
    # No need to call $self->{'doc_obj'}->set_OID($oid): the OID is stored as
    # 'Identifier' metadata, either in the SQL db or in the docsql.xml, and so
    # is read into the doc_obj along with the other metadata.

    my $proc_mode = $self->{'process_mode'};

    if ($proc_mode eq "all" || $proc_mode eq "meta_only") {
        # read in meta for the document (select from the <col>_metadata table)
        my $meta_sth = $gs_sql->select_from_metatable_matching_docid($oid);
        print $outhandle "### SQL select stmt: ".$meta_sth->{'Statement'}."\n"
            if $verbose;
        print $outhandle "----------SQL DB contains meta-----------\n"
            if $verbose;

        # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
        while (my ($primary_key, $did, $sid, $metaname, $metaval) = $meta_sth->fetchrow_array()) {

            # get rid of the artificial "root" introduced in section id when
            # saving to sql db; an empty id then means the top section
            $sid =~ s@^root@@;
            $sid = $doc_obj->get_top_section() unless $sid;
            print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
                if $verbose;

            # TODO: we accessed the db in utf8 mode, so, we can call
            # doc_obj->add_utf8_meta directly:
            $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval));
        }
        print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
            if $verbose;
    }

    if ($proc_mode eq "all" || $proc_mode eq "text_only") {
        # read in fulltxt for the document (select from the <col>_fulltxt table)
        my $text_sth = $gs_sql->select_from_texttable_matching_docid($oid);
        print $outhandle "### stmt: ".$text_sth->{'Statement'}."\n" if $verbose;
        print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
            if $verbose;

        while (my ($primary_key, $did, $sid, $text) = $text_sth->fetchrow_array()) {

            # map the artificial "root" section id (introduced when saving to
            # the sql db) back onto the document's top section
            $sid = $doc_obj->get_top_section() if ($sid eq "root");
            print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
                if $verbose;

            # TODO - pass by ref?
            # TODO: we accessed the db in utf8 mode, so, we can call
            # doc_obj->add_utf8_text directly:
            $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text));
        }
        print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
            if $verbose;
    }

    # don't forget to clean up on close() in superclass
    # It will get the doc_obj indexed then make it undef
    $self->SUPER::close_document(@_);
}
278
279
280# TODO: only want to work with sql db if buildcol.pl. Unfortunately, also runs on import.pl.
281# During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side
282# effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db.
283# Lazy connection.
284
285# Call init() not begin() because there can be multiple plugin passes
286# and init() should be called before all passes:
287# one for doc level and another for section level indexing
288# This way, we can connect to the SQL database once per buildcol run.
289#sub init {
290#    my ($self) = shift (@_);
291#    print STDERR "@@@@@@@@@@ INIT CALLED\n";
292   
293#    $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init().
294
295
# Returns the gssql object for this plugin, connecting to the database the
# first time it is called and reusing that connection afterwards.
#
# The collection name is taken from $ENV{'GSDLCOLLECTION'}; the connection
# settings come from the plugin's db_* options. The database used is
# $self->{'site_name'} (one database per GS3 site) or "greenstone2" for GS2.
#
# Dies if the connection cannot be made or the database cannot be used.
# $self->{'failed'} is set while a connection attempt is in progress and is
# cleared again once the attempt has succeeded, so it is only left set when an
# attempt died.
sub lazy_get_gssql {
    my $self = shift(@_);

    # if we failed to successfully connect once before, don't bother attempting to connect again
    #return undef if(defined $self->{'failed'}); # plugin/process would have terminated with die()
                                  # if we couldn't succeed connecting on any connection attempt

    # already connected: reuse the existing handle
    return $self->{'gs_sql'} if ($self->{'gs_sql'});

    # assume we'll fail to connect
    $self->{'failed'} = 1;

    print STDERR "@@@@@@@@@@ LAZY CONNECT CALLED\n";

    my $gs_sql = gssql->new({
        'collection_name' => $ENV{'GSDLCOLLECTION'},
        'db_encoding'     => $self->{'db_encoding'},
    });

    # try connecting to the db server
    my $connected = $gs_sql->connect_to_db({
        'db_driver'      => $self->{'db_driver'},
        'db_client_user' => $self->{'db_client_user'},
        'db_client_pwd'  => $self->{'db_client_pwd'},
        'db_host'        => $self->{'db_host'},
    });
    if (!$connected) {
        # This is fatal for the plugin, let's terminate here
        # PrintError would already have displayed the warning message on connection fail
        die("Could not connect to db. Can't proceed.\n");
    }

    # one database per GS3 site, for GS2 the db is called greenstone2
    my $db_name = $self->{'site_name'} || "greenstone2";
    #my $build_mode = $self->{'build_mode'} || "removeold";

    # the db and its tables should exist. Attempt to use the db:
    if (!$gs_sql->use_db($db_name)) {
        # This is fatal for the plugin, let's terminate here after disconnecting again
        # PrintError would already have displayed the warning message on load fail
        $gs_sql->disconnect_from_db()
            || warn("Unable to disconnect from database.\n");
        die("Could not use db $db_name. Can't proceed.\n");
    }

    # connected and using the right db: the pessimistic flag no longer applies
    delete $self->{'failed'};

    # store db handle now that we're connected
    $self->{'gs_sql'} = $gs_sql;
    return $gs_sql;
}
361
362# This method also runs on import.pl if gs_sql has a value. But we just want to run it on buildcol
363# Call deinit() not end() because there can be multiple plugin passes:
364# one for doc level and another for section level indexing
365# and deinit() should be called after all passes
366# This way, we can close the SQL database once per buildcol run.
# Closes the database connection, if one was opened, and then lets the
# superclass deinit() run.
# $self->{'gs_sql'} is only set by lazy_get_gssql(), which read() calls only
# when its processor is a buildproc, so outside buildcol.pl this simply
# delegates to the superclass.
sub deinit {
    my ($self) = shift (@_);

    print STDERR "@@@@@@@@@@ GreenstoneSQLPlugin::DEINIT CALLED\n";

    my $gs_sql = $self->{'gs_sql'};
    if ($gs_sql) {
        # Same naming rule as lazy_get_gssql(): the GS3 site name, or
        # "greenstone2" when no site name is set, so the warning always names
        # the database that was actually in use.
        my $db_name = $self->{'site_name'} || "greenstone2";

        $gs_sql->disconnect_from_db()
            || warn("Unable to disconnect from database " . $db_name . "\n");

        # explicitly set to undef so all future use has to make the connection again
        undef $self->{'gs_sql'};
    }

    $self->SUPER::deinit(@_);
}
383
384# TODO: This can't work until GSSQLPlugout has implemented build_mode = incremental
385# (instead of tossing away db on every build)
386# then this method needs to undef $self->docid after deleting, and close_doc() has to
387# just return if $self->docid undefined
388
# Reads one docsql.xml file via the superclass and, during buildcol.pl only,
# makes sure the SQL connection exists and removes the document's rows from
# the database when the build processor is in a "delete" mode.
#
# Parameters are the standard plugin read() arguments; only $processor is
# inspected here, the full list is passed on to the superclass.
#
# Returns the superclass read() result unchanged: undef if this plugin can't
# process the file, -1 if the docsql.xml failed to parse.
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # defined in ReadXMLFile inherited by superclass GS XML Plugin
    my $rv = $self->SUPER::read(@_);

    # undef if !can_proc_this_file, but -1 if failed to parse docsql.xml
    return $rv unless defined $rv;

    # don't want to do any GreenstoneSQLPlugin DB stuff during import.pl,
    # only during buildcol.pl. Still report the superclass result, so a file
    # that was processed is not signalled as "not processed" to the caller.
    return $rv if (ref($processor) !~ m/buildproc$/i);

    # we know we're buildcol, let's proceed:

    # make the connection once for the life of the plugin, not once for every doc
    # so that we can disconnect at the very end of the plugin's life: on deinit()
    # lazy_get_gssql() returns the existing connection if we connected before.
    # NOTE(review): close_document() returns early while $self->{'gs_sql'} is
    # unset, and presumably runs inside SUPER::read() above, so the connection
    # made here would only be available from the next read() on - confirm that
    # the first file/pass is meant to be skipped.
    my $gs_sql = $self->lazy_get_gssql();

    # can be "text" as per basebuildproc or
    # "textdelete" or "textreindex" as per ArchivesInfPlugin
    my $build_proc_mode = $processor->get_mode();

    # Delete modes are the base mode name with "delete" appended (no dot), so
    # match on the suffix.
    if ($build_proc_mode =~ m/delete$/) {

        # NOTTODO: add current doc OID stored in $self->{'doc_oid'} to list of oids get rid
        # of from table(s) entries. We'll do the actual deletion in deinit?? Since that's
        # when ArchivesInfPlugin deletes the docsql.xml files

        my $doc_oid = $self->{'doc_oid'};
        #my @delete_docids = $self->{'delete_docids'};
        #push (@delete_docids, $doc_oid);

        my $proc_mode = $self->{'process_mode'};
        if ($proc_mode eq "all" || $proc_mode eq "meta_only") {
            print STDERR "@@@@@@@@ Deleting $doc_oid from meta table\n";
            $gs_sql->delete_recs_from_metatable_with_docid($doc_oid);
        }
        if ($proc_mode eq "all" || $proc_mode eq "text_only") {
            print STDERR "@@@@@@@@ Deleting $doc_oid from fulltxt table\n";
            $gs_sql->delete_recs_from_texttable_with_docid($doc_oid);
        }
    }

    return $rv;
}
436
Note: See TracBrowser for help on using the browser.