root/main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm @ 32542

Revision 32542, 14.7 KB (checked in by ak19, 9 months ago)

Instead of the docoid being stored in the docsql-<OID>.xml filename, all filenames produced are back to being docsql.xml, but the root element Archive now contains the doc oid as attribute: <Archive docoid="oid">

Line 
1###########################################################################
2#
3# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml
4# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
5# the docsql .xml files.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugin;
29
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use DBI;
35use docprint; # for new unescape_text() subroutine
36use GreenstoneXMLPlugin;
37use gssql;
38
39
40# TODO:
41# - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge.
42# Ask about docsql naming convention adopted to identify OID. Better way?
43# collection names -> table names: it seems hyphens not allowed. Changed to underscores.
44# + Startup parameters (except removeold/build_mode)
45# - incremental building: where do we need to add code to delete rows from our sql table after
46# incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta?
47# - Courier documents in lucene-sql collection: character (degree symbol) not preserved. Is this because we encode in utf8 when putting into db and reading back in?
48# - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that
49# back in from the sql db while the remainder is to be read back in from the docsql .xml files.
50# - Ask if I can assume that all SQL dbs (not just MySQL) will preserve the order of inserted nodes
51# (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order
52
# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or fulltext
# is still written out to doc.xml (docsql.xml), that will be processed as usual,
# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
# is written out by GreenstoneSQLPlugout into the SQL db).
57
58# TODO:
59# no more docoid in docsql .xml filename, set OID as attribute of root element inside docsql.xml file instead
60# and parse it out
61
62# TODO: deal with incremental vs removeold. If docs removed from import folder, then import step
63# won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is
64
65# TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking, basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work.
66# Discuss the plugin/plugout parameters.
67
# Set up inheritance at compile time: this plugin is a GreenstoneXMLPlugin, so
# whatever is still present in the docsql.xml file is handled by the parent class.
BEGIN {
    @GreenstoneSQLPlugin::ISA = qw(GreenstoneXMLPlugin);
}
71
# This plugin must sit in the document plugin pipeline IN PLACE OF GreenstoneXMLPlugin,
# so the two never compete over the same process expression.
# docsql.xml files have the same structure as doc.xml, with similar contents, except:
#   - metadata and/or full text are stored in the SQL db instead, so only XML comments
#     saying so are left inside <Description> (for meta) and/or <Content> (for text)
#   - the root element carries the document's OID: <Archive docoid="OID">
#
# Returns the default regex (as a string) selecting the files this plugin processes:
# names ending in docsql.xml, optionally with a -<digits> suffix, case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;

    # Modelled on the same method in GreenstoneXMLPlugin. The OID is no longer embedded
    # in the filename, so the looser pattern (?i)docsql(-.+)?\.xml$ is not needed.
    return '(?i)docsql(-\d+)?\.xml$';
}
84
# Values accepted by the process_mode option. close_document() reads metadata from
# the SQL db for "meta_only" and "all", and full text for "text_only" and "all".
# The 'desc' strings look like lookup keys for the human-readable descriptions
# (resolved elsewhere, not in this file).
my $process_mode_list = [
    { 'name' => 'meta_only', 'desc' => '{GreenstoneSQLPlug.process_mode.meta_only}' },
    { 'name' => 'text_only', 'desc' => '{GreenstoneSQLPlug.process_mode.text_only}' },
    { 'name' => 'all',       'desc' => '{GreenstoneSQLPlug.process_mode.all}' },
];
92
# Command-line/config arguments understood by this plugin. Each entry gives the
# argument name, a description key, its type, its default value and whether it is
# flagged as required. The db_* values are later handed to gssql in init().
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
        'reqd' => 'no',
    },
    {
        'name' => 'process_mode',
        'desc' => '{GreenstoneSQLPlug.process_mode}',
        'type' => 'enum',
        'list' => $process_mode_list,
        'deft' => 'all',
        'reqd' => 'no',
    },
    {
        'name' => 'db_driver',
        'desc' => '{GreenstoneSQLPlug.db_driver}',
        'type' => 'string',
        'deft' => 'mysql',
        'reqd' => 'yes',
    },
    {
        'name' => 'db_client_user',
        'desc' => '{GreenstoneSQLPlug.db_client_user}',
        'type' => 'string',
        'deft' => 'root',
        'reqd' => 'yes',
    },
    {
        # NOTE(review): the original author queried whether a password should
        # really be marked as required, given that the default is empty.
        'name' => 'db_client_pwd',
        'desc' => '{GreenstoneSQLPlug.db_client_pwd}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'yes',
    },
    {
        'name' => 'db_host',
        'desc' => '{GreenstoneSQLPlug.db_host}',
        'type' => 'string',
        'deft' => '127.0.0.1',
        'reqd' => 'yes',
    },
    {
        'name' => 'db_encoding',
        'desc' => '{GreenstoneSQLPlug.db_encoding}',
        'type' => 'string',
        'deft' => 'utf8',
        'reqd' => 'yes',
    },
];
131
# Plugin descriptor pushed onto the shared "OptList" by new(): the plugin's name,
# its description key, the abstract/inherits flags and the argument list above.
my $options = {
    'name'     => 'GreenstoneSQLPlugin',
    'desc'     => '{GreenstoneSQLPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
137
138
139# TODO: For on cancel, add a SIGTERM handler or so to call end()
140# or to explicitly call gs_sql->close_connection if $gs_sql def
141
# Constructor.
#   $pluginlist      - array ref; this class's name is appended to it
#   $inputargs       - passed through untouched to the GreenstoneXMLPlugin constructor
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to its "ArgList"
#                      and its options descriptor to its "OptList"
# Returns the object built by GreenstoneXMLPlugin, reblessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @$arguments);
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # Let the parent class do the real construction, then make the result one of us.
    my $self = bless(
        GreenstoneXMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists),
        $class
    );

    # If running pluginfo, we don't need to go any further.
    return $self if $self->{'info_only'};

    # Anything else that must happen only when NOT running pluginfo belongs here.

    return $self;
}
164
# Start-tag callback for the XML parse of a docsql.xml file.
#   $expat   - the parser object (only passed on to the superclass)
#   $element - name of the element being opened
# The element's attributes are expected in the global hash %_ (as per
# ReadXMLFile::xml_start_tag(), while $_ holds the tag itself).
#
# For the root <Archive> element the docoid attribute is remembered in
# $self->{'doc_oid'} for close_document(); every other element is passed on
# to GreenstoneXMLPlugin.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    $self->{'element'} = $element;

    if ($element ne "Archive") {
        # <Section>, <Metadata> and friends are the superclass's business
        $self->SUPER::xml_start_tag(@_);
    }
    else {
        # Go through a reference to %_ so that it is unmistakable we are reading
        # the attribute hash and not the tag text held in $_.
        my $attributes = \%_;
        $self->{'doc_oid'} = $attributes->{'docoid'};

        if ($self->{'verbosity'} > 1) {
            my $out = $self->{'outhandle'};
            print $out "Extracted OID from docsql.xml: " . $self->{'doc_oid'} . "\n";
        }
    }
}
189
190# TODO Q: Why are there 3 passes when we're only indexing at doc and section level (2 passes)?
191
# Called when the end of a docsql.xml document is reached. Fills the in-memory
# doc_obj with the metadata and/or full text held for this document in the SQL db,
# then hands over to the superclass.
#
# The work has to be done here, before SUPER::close_document() finishes: at the end
# of GreenstoneXMLPlugin's close_document() the doc_obj is processed (indexed) and
# then made undef.
#
# Reads from $self:
#   doc_oid      - OID stored by xml_start_tag() from <Archive docoid="...">
#   doc_obj      - the document object being built
#   gs_sql       - the gssql connection set up in init()
#   process_mode - "all" or "meta_only" reads metadata; "all" or "text_only" reads text
#   outhandle, verbosity - diagnostics are printed when verbosity > 1
# Any arguments are passed through unchanged to SUPER::close_document().
#
# If no OID was found on the <Archive> element a warning is printed to STDERR and the
# SQL reads are skipped (there is nothing to look the document up by); the superclass
# is still called so that the document is closed properly.
#
# TODO (from the original author): this runs once per docsql.xml file, whereas all
# docs of a collection could be read from the SQL tables in one go. One idea was for
# the plugout to write a token file (.gssql) into archives during import.pl, which
# would then trigger reading both collection tables during buildcol.pl.
sub close_document {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $doc_obj   = $self->{'doc_obj'};
    my $gs_sql    = $self->{'gs_sql'};
    my $verbose   = $self->{'verbosity'} > 1;

    my $oid      = $self->{'doc_oid'};
    my $have_oid = defined $oid;
    print $outhandle "==== OID of document (meta|text) to be read in from DB: "
        . ($have_oid ? $oid : "<undefined>") . "\n"
        if $verbose;

    # set_OID() is a complex method, so afterwards we check what the doc_obj
    # actually ended up with.
    $doc_obj->set_OID($oid);
    my $oid_on_doc = $doc_obj->get_OID();

    if (!$have_oid) {
        print STDERR '@@@@ WARNING: no docoid attribute found on the <Archive> element of docsql.xml,'
            . " so this document's meta/text cannot be read in from the SQL db."
            . " OID on doc_obj = " . (defined $oid_on_doc ? $oid_on_doc : "<undefined>") . "\n";
    }
    elsif ($oid ne $oid_on_doc) {
        print STDERR '@@@@ WARNING: OID after setting on doc_obj = ' . $oid_on_doc
            . " and is not the same as original OID $oid from the docoid attribute"
            . " of the <Archive> element in docsql.xml\n";
    }

    my $proc_mode = $self->{'process_mode'};

    if ($have_oid && ($proc_mode eq "all" || $proc_mode eq "meta_only")) {
        # read in this document's rows from the collection's metadata table
        my $sth = $gs_sql->select_from_metatable_matching_docid($oid);
        print $outhandle "### stmt: " . $sth->{'Statement'} . "\n" if $verbose;

        print $outhandle "----------SQL DB contains meta-----------\n" if $verbose;
        while (my (undef, $did, $sid, $metaname, $metaval) = $sth->fetchrow_array()) {

            # get rid of the artificial "root" introduced in the section id when
            # saving to the sql db; an empty section id means the top section
            $sid =~ s@^root@@;
            $sid = $doc_obj->get_top_section() unless $sid;
            print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
                if $verbose;

            # the db was accessed in utf8 mode, so the utf8 variant is called directly
            $doc_obj->add_utf8_metadata($sid, $metaname, docprint::unescape_text($metaval));
        }
        print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
            if $verbose;
    }

    if ($have_oid && ($proc_mode eq "all" || $proc_mode eq "text_only")) {
        # read in this document's rows from the collection's fulltxt table
        my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
        print $outhandle "### stmt: " . $sth->{'Statement'} . "\n" if $verbose;

        print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
            if $verbose;
        while (my (undef, $did, $sid, $text) = $sth->fetchrow_array()) {

            # only the exact artificial "root" section id maps to the top section here
            $sid = $doc_obj->get_top_section() if ($sid eq "root");
            print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
                if $verbose;

            # the db was accessed in utf8 mode, so the utf8 variant is called directly
            $doc_obj->add_utf8_text($sid, docprint::unescape_text($text));
        }
        print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
            if $verbose;
    }

    # don't forget to clean up in the superclass:
    # it will get the doc_obj indexed and then make it undef
    $self->SUPER::close_document(@_);
}
281
282
# Connects to the SQL database and selects the db to use, storing the resulting
# gssql object in $self->{'gs_sql'}.
#
# Per the original author's notes, init() rather than begin() is overridden because
# there can be multiple plugin passes (one for doc level and another for section level
# indexing) and init() is called before all passes, so the connection is made once
# per buildcol run.
# TODO: only want to work with the sql db on buildcol.pl; unfortunately this also
# runs on import.pl.
#
# Any arguments are passed through to SUPER::init().
# Dies if the connection cannot be made or the database cannot be used.
sub init {
    my ($self) = shift (@_);

    # The superclass (GreenstoneXMLPlugin) will not yet be trying to read from
    # doc.xml (docsql.xml) files in init().
    $self->SUPER::init(@_);

    my $gs_sql = gssql->new({
        'collection_name' => $ENV{'GSDLCOLLECTION'},
        'db_encoding'     => $self->{'db_encoding'},
    });

    # Connection settings are taken straight from the identically named plugin options.
    my %connection_params =
        map { $_ => $self->{$_} } qw(db_driver db_client_user db_client_pwd db_host);

    # A failed connection is fatal for this plugin. PrintError would already have
    # displayed the warning message on connection fail.
    unless ($gs_sql->connect_to_db(\%connection_params)) {
        die("Could not connect to db. Can't proceed.\n");
    }

    # one database per GS3 site; for GS2 the db is called greenstone2
    my $db_name = $self->{'site_name'} || "greenstone2";

    # The db and its tables should already exist. Not being able to use the db is
    # fatal too, but we disconnect again before terminating.
    unless ($gs_sql->use_db($db_name)) {
        $gs_sql->disconnect_from_db()
            || warn("Unable to disconnect from database.\n");
        die("Could not use db $db_name. Can't proceed.\n");
    }

    # keep hold of the db handle now that we're connected
    $self->{'gs_sql'} = $gs_sql;
}
342
# Disconnects from the SQL database, if init() managed to connect, then lets the
# superclass deinitialise.
#
# Per the original author's notes, deinit() rather than end() is overridden because
# there can be multiple plugin passes (one for doc level and another for section level
# indexing) and deinit() is called once after all passes, so the connection is closed
# once per buildcol run. This method also runs on import.pl whenever gs_sql has a
# value, although it is only really wanted for buildcol.pl.
#
# Any arguments are passed through to SUPER::deinit().
# A failed disconnect only produces a warning; it is not fatal.
sub deinit {
    my ($self) = shift (@_);

    my $gs_sql = $self->{'gs_sql'};
    if ($gs_sql && !$gs_sql->disconnect_from_db()) {
        # Name the database the same way init() chose it: the site name for GS3,
        # otherwise "greenstone2" (site_name is not set for GS2).
        my $db_name = $self->{'site_name'} || "greenstone2";
        warn("Unable to disconnect from database " . $db_name . "\n");
    }

    $self->SUPER::deinit(@_);
}
357
Note: See TracBrowser for help on using the browser.