root/main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm @ 32543

Revision 32543, 14.2 KB (checked in by ak19, 9 months ago)

Tidying up and adjusting TODO statements

Line 
1###########################################################################
2#
3# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml
4# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
5# the docsql .xml files.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugin;
29
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use DBI;
35use docprint; # for new unescape_text() subroutine
36use GreenstoneXMLPlugin;
37use gssql;
38
39
40# TODO:
41# - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge.
42# + Ask about docsql naming convention adopted to identify OID. Better way?
43# collection names -> table names: it seems hyphens not allowed. Changed to underscores.
44# + Startup parameters (except removeold/build_mode)
45# - how do we detect we're to do removeold during plugout in import.pl phase
46# - incremental building: where do we need to add code to delete rows from our sql table after
47# incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta?
48# - "Courier" demo documents in lucene-sql collection: character (degree symbol) not preserved in title. Is this because we encode in utf8 when putting into db and reading back in?
49# - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that
50# back in from the sql db while the remainder is to be read back in from the docsql .xml files.
51# - Ask if I can assume that all SQL dbs (not just MySQL) will preserve the order of inserted nodes
52# (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order
53
54# TODO: deal with incremental vs removeold. If docs removed from import folder, then import step
55# won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is
56
57# TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking, basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work.
58# Discuss the plugin/plugout parameters.
59
60
# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that, if meta or fulltext
# is still written out to doc.xml (docsql .xml), that will be processed as usual,
# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
# is written out by GreenstoneSQLPlugout into the SQL db).
65
66
# Set up the inheritance chain at compile time: this plugin extends GreenstoneXMLPlugin.
BEGIN {
    @GreenstoneSQLPlugin::ISA = qw(GreenstoneXMLPlugin);
}
70
# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin,
# so there is no process_exp conflict between the two.
# docsql.xml files have the same structure as doc.xml, with similar contents, except that:
#   - metadata and/or fulltxt that are stored in the mysql db are replaced in docsql.xml by
#     XML comments saying so, inside <Description> (for meta) and/or <Content> (for txt)
#   - the root element Archive carries a docoid attribute: <Archive docoid="OID">
#
# Returns the default process expression (a regex held in a string) that selects
# the docsql .xml files handled by this plugin.
sub get_default_process_exp {
    my ($self) = @_;    # not used; this sub is also called without arguments

    # Regex based on the same method in GreenstoneXMLPlugin. The OID is no longer
    # embedded in the docsql .xml filename, so only an optional numeric suffix is matched.
    my $docsql_filename_re = '(?i)docsql(-\d+)?\.xml$';

    return $docsql_filename_re;
}
83
# Allowed values for the process_mode argument: which parts of a document
# (metadata, full text, or both) are processed via the SQL database.
my $process_mode_list = [
    { name => 'meta_only', desc => '{GreenstoneSQLPlug.process_mode.meta_only}' },
    { name => 'text_only', desc => '{GreenstoneSQLPlug.process_mode.text_only}' },
    { name => 'all',       desc => '{GreenstoneSQLPlug.process_mode.all}' },
];

# Arguments accepted by this plugin, in addition to those it inherits.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BaseImporter.process_exp}',
        type => 'regexp',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
    {
        name => 'process_mode',
        desc => '{GreenstoneSQLPlug.process_mode}',
        type => 'enum',
        list => $process_mode_list,
        deft => 'all',
        reqd => 'no',
    },
    {
        name => 'db_driver',
        desc => '{GreenstoneSQLPlug.db_driver}',
        type => 'string',
        deft => 'mysql',
        reqd => 'yes',
    },
    {
        name => 'db_client_user',
        desc => '{GreenstoneSQLPlug.db_client_user}',
        type => 'string',
        deft => 'root',
        reqd => 'yes',
    },
    {
        name => 'db_client_pwd',
        desc => '{GreenstoneSQLPlug.db_client_pwd}',
        type => 'string',
        deft => '',
        reqd => 'yes',    # pwd required?
    },
    {
        name => 'db_host',
        desc => '{GreenstoneSQLPlug.db_host}',
        type => 'string',
        deft => '127.0.0.1',
        reqd => 'yes',
    },
    {
        name => 'db_encoding',
        desc => '{GreenstoneSQLPlug.db_encoding}',
        type => 'string',
        deft => 'utf8',
        reqd => 'yes',
    },
];

# Plugin description record; it is pushed onto the OptList in new().
my $options = {
    name     => 'GreenstoneSQLPlugin',
    desc     => '{GreenstoneSQLPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
136
137
138# TODO: For on cancel, add a SIGTERM handler or so to call end()
139# or to explicitly call gs_sql->close_connection if $gs_sql def
140
# Constructor.
#   $pluginlist      - array ref of plugin class names; this class is appended to it
#   $inputargs       - the plugin's input arguments, passed through to the superclass
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays are extended with
#                      this plugin's argument definitions and options
# Returns the object built by the GreenstoneXMLPlugin constructor, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call: the indirect object form ("new Class(...)") is
    # ambiguous to the parser and is best avoided.
    my $self = GreenstoneXMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    $self = bless $self, $class;

    # If running pluginfo ($self->{'info_only'} is set), we don't need to go further.
    # There is currently no extra construction-time work in the other case either,
    # so the object is returned as is. Anything that must only happen when not
    # running pluginfo should be guarded by a check on $self->{'info_only'} here.
    return $self;
}
163
# XML start-tag handler.
# Records the current element name in $self->{'element'}. For the root <Archive>
# element of a docsql.xml file it also stores the element's docoid attribute in
# $self->{'doc_oid'}; every other element is passed on to the superclass handler.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $tag_name) = @_;

    my $out = $self->{'outhandle'};

    $self->{'element'} = $tag_name;

    # Let superclass GreenstoneXMLPlugin continue to process <Section> and
    # <Metadata> elements (anything that is not the Archive element).
    unless ($tag_name eq "Archive") {
        return $self->SUPER::xml_start_tag(@_);
    }

    # docsql.xml files contain an OID attribute on the Archive element.
    # The element's attributes are in %_ as per ReadXMLFile::xml_start_tag()
    # (while $_ contains the tag). Copy %_ into a lexical hash before looking a
    # key up (see OAIPlugin.pm): writing $_{'docoid'} directly reads as if it
    # concerned $_, which holds the tag info rather than the attributes.
    my %attributes = %_;
    $self->{'doc_oid'} = $attributes{'docoid'};
    print $out "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n"
        if $self->{'verbosity'} > 1;
}
188
189# TODO Q: Why are there 3 passes when we're only indexing at doc and section level (2 passes)?
190
# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
# the doc_obj in memory is processed (indexed) and then made undef.
# So we have to work with doc_obj before superclass close_document() is finished.
#
# Depending on the process_mode argument ("all", "meta_only" or "text_only"), reads
# the current document's metadata and/or full text rows from the SQL db and adds
# them to $self->{'doc_obj'}, then hands over to the superclass close_document().
# Dies if a select does not yield a statement handle.
sub close_document {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $doc_obj   = $self->{'doc_obj'};
    my $gs_sql    = $self->{'gs_sql'};
    my $verbose   = $self->{'verbosity'} > 1;

    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag()
    print $outhandle "++++ OID of document (meta|text) to be read in from DB: $oid\n"
        if $verbose;

    # For now, we have access to doc_obj (until just before super::close_document() terminates)

    # no need to call $self->{'doc_obj'}->set_OID($oid);
    # because either the OID is stored in the SQL db as meta 'Identifier' alongside other metadata
    # or it's stored in the doc.xml as metadata 'Identifier' alongside other metadata
    # Either way, Identifier meta will be read into the docobj automatically with other meta.

    my $proc_mode = $self->{'process_mode'};

    if ($proc_mode eq "all" || $proc_mode eq "meta_only") {
        # read in meta for the collection (i.e. select * from <col>_metadata table)

        my $sth = $gs_sql->select_from_metatable_matching_docid($oid);
        # Fail with a clear message rather than with a method call on an undefined value below.
        die("Could not select metadata for document $oid from SQL db. Can't proceed.\n")
            unless $sth;
        print $outhandle "### SQL select stmt: ".$sth->{'Statement'}."\n" if $verbose;

        print $outhandle "----------SQL DB contains meta-----------\n" if $verbose;
        # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
        while ( my @row = $sth->fetchrow_array() ) {
            my ($primary_key, $did, $sid, $metaname, $metaval) = @row;

            # get rid of the artificial "root" introduced in section id when saving to sql db
            $sid =~ s@^root@@;
            $sid = $doc_obj->get_top_section() unless $sid;
            print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
                if $verbose;

            # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly:
            $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval));
        }
        print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
            if $verbose;
    }

    if ($proc_mode eq "all" || $proc_mode eq "text_only") {
        # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table)

        my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
        # Fail with a clear message rather than with a method call on an undefined value below.
        die("Could not select full text for document $oid from SQL db. Can't proceed.\n")
            unless $sth;
        print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $verbose;

        print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
            if $verbose;
        while ( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {

            # get rid of the artificial "root" introduced in section id when saving to sql db
            $sid = $doc_obj->get_top_section() if ($sid eq "root");
            print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
                if $verbose;

            # TODO - pass by ref?
            # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly:
            $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text));
        }
        print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
            if $verbose;
    }

    # don't forget to clean up on close() in superclass
    # It will get the doc_obj indexed then make it undef
    $self->SUPER::close_document(@_);
}
271
272
273# TODO: only want to work with sql db if buildcol.pl. Unfortunately, also runs on import.pl.
274# During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side
275# effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db.
276
# Call init() not begin() because there can be multiple plugin passes
# (one for doc level and another for section level indexing)
# and init() should be called before all passes.
# This way, we can connect to the SQL database once per buildcol run.
#
# Connects to the database server using the db_driver, db_client_user, db_client_pwd
# and db_host arguments, selects the database named after $self->{'site_name'}
# ("greenstone2" when no site name is set) and stores the gssql object in
# $self->{'gs_sql'}. Dies if connecting to the server or using the database fails.
sub init {
    my ($self) = shift (@_);

    # super (GreenstoneXMLPlugin) will not yet be trying to read from
    # doc.xml (docsql .xml) files in init().
    $self->SUPER::init(@_);

    # Explicit class-method call: the indirect object form ("new gssql(...)") is
    # ambiguous to the parser and is best avoided.
    my $gs_sql = gssql->new({
        'collection_name' => $ENV{'GSDLCOLLECTION'},
        'db_encoding'     => $self->{'db_encoding'}
    });

    # try connecting to the mysql db; if that fails we die below
    my $connected = $gs_sql->connect_to_db({
        'db_driver'      => $self->{'db_driver'},
        'db_client_user' => $self->{'db_client_user'},
        'db_client_pwd'  => $self->{'db_client_pwd'},
        'db_host'        => $self->{'db_host'}
    });
    if (!$connected) {
        # This is fatal for the plugin, let's terminate here
        # PrintError would already have displayed the warning message on connection fail
        die("Could not connect to db. Can't proceed.\n");
    }

    # one database per GS3 site, for GS2 the db is called greenstone2
    my $db_name = $self->{'site_name'} || "greenstone2";

    # the db and its tables should exist. Attempt to use the db:
    if (!$gs_sql->use_db($db_name)) {
        # This is fatal for the plugin, let's terminate here after disconnecting again
        # PrintError would already have displayed the warning message on load fail
        $gs_sql->disconnect_from_db()
            || warn("Unable to disconnect from database.\n");
        die("Could not use db $db_name. Can't proceed.\n");
    }

    # store db handle now that we're connected
    $self->{'gs_sql'} = $gs_sql;
}
335
# This method also runs on import.pl if gs_sql has a value. But we just want to run it on buildcol
# Call deinit() not end() because there can be multiple plugin passes
# (one for doc level and another for section level indexing)
# and deinit() should be called after all passes.
# This way, we can close the SQL database once per buildcol run.
#
# Disconnects from the database if init() stored a gssql object in $self->{'gs_sql'}
# (warning, not dying, when the disconnect fails), then calls the superclass deinit().
sub deinit {
    my ($self) = shift (@_);

    if ($self->{'gs_sql'}) { # can cover TODO: only want to work with sql db if buildcol.pl
        # Report the same db name that init() selected: site_name is not set for GS2,
        # where the database is called greenstone2.
        my $db_name = $self->{'site_name'} || "greenstone2";
        $self->{'gs_sql'}->disconnect_from_db()
            || warn("Unable to disconnect from database " . $db_name . "\n");
    }

    $self->SUPER::deinit(@_);
}
350
Note: See TracBrowser for help on using the browser.