root/main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm @ 32541

Revision 32541, 14.5 KB (checked in by ak19, 9 months ago)

Using proper parameters to GreenstoneSQLPlugin/Plugout instead of hardcoded values for params.

Line 
1###########################################################################
2#
3# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql-<OID>.xml
4# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
5# the docsql .xml files.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugin;
29
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use DBI;
35use docprint; # for new unescape_text() subroutine
36use GreenstoneXMLPlugin;
37use gssql;
38
39
40# TODO:
41# - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge.
42# Ask about docsql naming convention adopted to identify OID. Better way?
43# collection names -> table names: it seems hyphens not allowed. Changed to underscores.
44# - Startup parameters
45# - incremental building: where do we need to add code to delete rows from our sql table after
46# incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta?
47# - Courier documents in lucene-sql collection: character (degree symbol) not preserved. Is this because we encode in utf8 when putting into db and reading back in?
48# - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that
49# back in from the sql db while the remainder is to be read back in from the docsql .xml files.
50# - Ask if I can assume that all SQL dbs (not just MySQL) will preserve the order of inserted nodes
51# (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order
52
53# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that it if meta or fulltext
54# is still written out to doc.xml (docsql .xml), that will be processed as usual,
55# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
56# is written out by GreenstoneSQLPlugout into the SQL db).
57
58# TODO:
59# no more docoid in docsql .xml filename, set OID as attribute of root element inside docsql.xml file instead
60# and parse it out
61
62# TODO: deal with incremental vs removeold. If docs removed from import folder, then import step
63# won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is
64
65sub BEGIN {
66    @GreenstoneSQLPlugin::ISA = ('GreenstoneXMLPlugin');
67}
68
69# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin
70# So we won't have a process exp conflict here.
71sub get_default_process_exp {
72    my $self = shift (@_);
73
74    #return q^(?i)docsql(-\d+)?\.xml$^;
75    return q^(?i)docsql(-.+)?\.xml$^;
76}
77
78my $process_mode_list =
79    [ { 'name' => "meta_only",
80        'desc' => "{GreenstoneSQLPlug.process_mode.meta_only}" },     
81      { 'name' => "text_only",
82        'desc' => "{GreenstoneSQLPlug.process_mode.text_only}" },
83      { 'name' => "all",
84        'desc' => "{GreenstoneSQLPlug.process_mode.all}" } ];
85
86my $arguments =
87    [ { 'name' => "process_exp",
88    'desc' => "{BaseImporter.process_exp}",
89    'type' => "regexp",
90    'deft' => &get_default_process_exp(),
91    'reqd' => "no" },
92      { 'name' => "process_mode",
93    'desc' => "{GreenstoneSQLPlug.process_mode}",
94    'type' => "enum",
95    'list' => $process_mode_list,
96    'deft' => "all",
97    'reqd' => "no"},
98      { 'name' => "db_driver",
99    'desc' => "{GreenstoneSQLPlug.db_driver}",
100    'type' => "string",
101    'deft' => "mysql",
102    'reqd' => "yes"},
103      { 'name' => "db_client_user",
104    'desc' => "{GreenstoneSQLPlug.db_client_user}",
105    'type' => "string",
106    'deft' => "root",
107    'reqd' => "yes"},
108      { 'name' => "db_client_pwd",
109    'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
110    'type' => "string",
111    'deft' => "",
112    'reqd' => "yes"}, # pwd required?
113      { 'name' => "db_host",
114    'desc' => "{GreenstoneSQLPlug.db_host}",
115    'type' => "string",
116    'deft' => "127.0.0.1",
117    'reqd' => "yes"},
118      { 'name' => "db_encoding",
119    'desc' => "{GreenstoneSQLPlug.db_encoding}",
120    'type' => "string",
121    'deft' => "utf8",
122    'reqd' => "yes"}
123    ];
124
125my $options = { 'name'     => "GreenstoneSQLPlugin",
126        'desc'     => "{GreenstoneSQLPlugin.desc}",
127        'abstract' => "no",
128        'inherits' => "yes",
129            'args'     => $arguments };
130
131
132# TODO: For on cancel, add a SIGTERM handler or so to call end()
133# or to explicitly call gs_sql->close_connection if $gs_sql def
134
135sub new {
136    my ($class) = shift (@_);
137    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
138    push(@$pluginlist, $class);
139
140    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
141    push(@{$hashArgOptLists->{"OptList"}},$options);
142
143    my $self = new GreenstoneXMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
144
145   
146    #return bless $self, $class;
147    $self = bless $self, $class;
148    if ($self->{'info_only'}) {
149    # If running pluginfo, we don't need to go further.
150    return $self;
151    }
152
153    # do anything else that needs to be done here when not pluginfo
154   
155    return $self;
156}
157
158
159# TODO Q: Why are there 3 passes when we're only indexing at doc and section level (2 passes)?
160
161# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
162# the doc_obj in memory is processed (indexed) and then made undef.
163# So we have to work with doc_obj before superclass close_document() is finished.
164sub close_document {
165    my $self = shift(@_);
166   
167    my $outhandle = $self->{'outhandle'};
168    my $doc_obj = $self->{'doc_obj'};   
169    my $gs_sql = $self->{'gs_sql'};
170
171    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub read()
172    print $outhandle "==== OID of document (meta|text) to be read in from DB: $oid\n"
173    if $self->{'verbosity'} > 1;
174
175   
176    # For now, we have access to doc_obj (until just before super::close_document() terminates)
177   
178    $self->{'doc_obj'}->set_OID($oid); # complex method. Is this necessary, since we just want to write meta and txt for the docobj to index?
179   
180    # checking that complicated looking method set_OID() hasn't modified oid
181    if($oid ne $self->{'doc_obj'}->get_OID()) {
182    print STDERR "@@@@ WARNING: OID after setting on doc_obj = " . $self->{'doc_obj'}->get_OID() . " and is not the same as original OID $oid from docsqloid.xml filename\n";
183    }
184
185   
186    # TODO: This function is called on a per doc.xml file basis
187    # but we can process all docs of a collection in one go when dealing with the SQL tables for
188    # the collection. How and where should we read in the collection tables then?
189    # TODO: Perhaps MySQLPlugout could write out a token file (.gssql) into archives during import.pl
190    # and if that file is detected, then MySQLPlugin::read() is passed in that file during
191    # buildcol.pl. And that file will trigger reading the 2 tables for the collection???
192    my $proc_mode = $self->{'process_mode'};
193    if($proc_mode eq "all" || $proc_mode eq "meta_only") {
194    # read in meta for the collection (i.e. select * from <col>_metadata table
195
196    my $sth = $gs_sql->select_from_metatable_matching_docid($oid); 
197    print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 1;
198
199    print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 1;
200    # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
201    while( my @row = $sth->fetchrow_array() ) {     
202        #print $outhandle "row: @row\n";
203        my ($primary_key, $did, $sid, $metaname, $metaval) = @row;
204
205        # get rid of the artificial "root" introduced in section id when saving to sql db
206        $sid =~ s@^root@@;
207        $sid = $doc_obj->get_top_section() unless $sid;
208        print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
209        if $self->{'verbosity'} > 1;
210       
211        # TODO:  we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly:
212        $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval));
213    }
214    print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
215        if $self->{'verbosity'} > 1;
216    }
217   
218    if($proc_mode eq "all" || $proc_mode eq "text_only") {
219    # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table
220
221    my $fulltxt_table = $gs_sql->get_fulltext_table_name();
222   
223   
224    my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
225    print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 1;
226
227    print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
228        if $self->{'verbosity'} > 1;
229    while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {       
230
231        # get rid of the artificial "root" introduced in section id when saving to sql db
232        #$sid =~ s@^root@@;
233        $sid = $doc_obj->get_top_section() if ($sid eq "root");
234        print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
235        if $self->{'verbosity'} > 1;
236
237        # TODO - pass by ref?
238        # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly:
239        $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text));
240    }   
241    print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
242        if $self->{'verbosity'} > 1;
243    }
244
245   
246    # don't forget to clean up on close() in superclass
247    # It will get the doc_obj indexed then make it undef
248    $self->SUPER::close_document(@_);
249}
250
251
252# TODO: only want to work with sql db if buildcol.pl. Unfortunately, also runs on import.pl
253# call init() not begin() because there can be multiple plugin passes
254# and init() should be called before all passes:
255# one for doc level and another for section level indexing
256# This way, we can connect to the SQL database once per buildcol run.
257sub init {
258    my ($self) = shift (@_);
259#    print STDERR "@@@@@@@@@@ INIT CALLED\n";
260   
261    $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init().
262
263    ####################
264#    print "@@@ SITE NAME: ". $self->{'site_name'} . "\n" if defined $self->{'site_name'};
265#    print "@@@ COLL NAME: ". $ENV{'GSDLCOLLECTION'} . "\n";
266
267#    print STDERR "@@@@ db_pwd: " . $self->{'db_client_pwd'} . "\n";
268#    print STDERR "@@@@ user: " . $self->{'db_client_user'} . "\n";
269#    print STDERR "@@@@ db_host: " . $self->{'db_host'} . "\n";
270#    print STDERR "@@@@ db_enc: " . $self->{'db_encoding'} . "\n";
271#    print STDERR "@@@@ db_driver: " . $self->{'db_driver'} . "\n";
272    ####################
273   
274    my $gs_sql = new gssql({
275    'collection_name' => $ENV{'GSDLCOLLECTION'},   
276    'db_encoding' => $self->{'db_encoding'}
277               }
278    );
279
280    # try connecting to the mysql db, if that fails it will die
281    if(!$gs_sql->connect_to_db({
282    'db_driver' => $self->{'db_driver'},
283    'db_client_user' => $self->{'db_client_user'},
284    'db_client_pwd' => $self->{'db_client_pwd'},
285    'db_host' => $self->{'db_host'}
286                   })
287    )
288    {
289    # This is fatal for the plugout, let's terminate here
290    # PrintError would already have displayed the warning message on connection fail   
291    die("Could not connect to db. Can't proceed.\n");
292    }
293   
294    my $db_name = $self->{'site_name'} || "greenstone2"; # one database per GS3 site, for GS2 the db is called greenstone2
295    #my $build_mode = $self->{'build_mode'} || "removeold";
296
297    # the db and its tables should exist. Attempt to use the db:
298    if(!$gs_sql->use_db($db_name)) {
299   
300    # This is fatal for the plugout, let's terminate here after disconnecting again
301    # PrintError would already have displayed the warning message on load fail
302    $gs_sql->disconnect_from_db()
303        || warn("Unable to disconnect from database.\n");
304    die("Could not use db $db_name. Can't proceed.\n");
305    }
306   
307    # store db handle now that we're connected
308    $self->{'gs_sql'} = $gs_sql;
309   
310}
311
312# This method also runs on import.pl if gs_sql has a value. But we just want to run it on buildcol
313# Call deinit() not end() because there can be multiple plugin passes:
314# one for doc level and another for section level indexing
315# and deinit() should be called before all passes
316# This way, we can close the SQL database once per buildcol run.
317sub deinit {
318    my ($self) = shift (@_);
319    if($self->{'gs_sql'}) { # can cover TODO: only want to work with sql db if buildcol.pl
320    $self->{'gs_sql'}->disconnect_from_db()
321        || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n");
322    }
323#    print STDERR "@@@@@@@@@@ DEINIT CALLED\n";
324    $self->SUPER::deinit(@_);
325}
326
327sub read {
328    my $self = shift (@_); 
329 
330    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
331
332    # when running buildcol.pl, the filename should match "docsql-<OID>.xml"
333    # when running import.pl it will be the original document's filename
334    # we only want to read in from db when running buildcol.pl
335
336    # doc_obj doesn't exist yet and only exists during super::read(): a new doc (doc_obj)
337    # is created in super::open_document() and is made undef again on super::close_document().
338    # Further, can't read it in from doc.xml to work out which OID to query in sql db:
339    # even if we got access to doc_obj, if no meta stored in docsql.xml, then when
340    # doc_obj is read in from docsql.xml there will be no OID. So OID is docsql.xml filename
341    # contains OID in filename. Having extracted OID from the filename, store OID in plugin-self
342    if($file =~ m/docsql-(.+?)\.xml$/) {
343
344    # work out docoid from filename of form "docsql-<OID>.xml". $file can have a containing
345    # subfolder besides filename, e.g. "dir/docsql-<OID>.xml"
346   
347    # https://stackoverflow.com/questions/22836/how-do-i-perform-a-perl-substitution-on-a-string-while-keeping-the-original
348    (my $oid = $file) =~ s@^(.*?)docsql-(.+?)\.xml$@$2@;
349
350    $self->{'doc_oid'} = $oid;
351    }   
352
353   
354    # always read docsql.xml, as we then know doc structure, and assoc files are dealt with
355    # Plus we need to read docsql.xml if either meta or fulltxt went into there instead of to sql db
356    return $self->SUPER::read(@_); # will open_doc, close_doc then process doc_obj for indexing, then undef doc_obj
357
358   
359}
Note: See TracBrowser for help on using the browser.