###########################################################################
#
# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml
# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
# the docsql .xml files.
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package GreenstoneSQLPlugin;

use strict;
no strict 'refs'; # allow filehandles to be variables and vice versa

use DBI;
use docprint; # for new unescape_text() subroutine
use GreenstoneXMLPlugin;
use gssql;

# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or
# fulltext is still written out to doc.xml (docsql .xml), that will be
# processed as usual, whereas GreenstoneSQLPlugin will process all the rest
# (full text and/or meta, whichever was written out by GreenstoneSQLPlugout
# into the SQL db).

# TODO:
# - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge.
#   Ask about docsql naming convention adopted to identify OID. Better way?
#   collection names -> table names: it seems hyphens are not allowed, so
#   they were changed to underscores.
# + Startup parameters (except removeold/build_mode)
# - Incremental building: where do we need to add code to delete rows from
#   our SQL table after incrementally importing a collection with fewer docs
#   (for instance)? What about deleted/modified meta?
# - Courier documents in lucene-sql collection: character (degree symbol)
#   not preserved. Is this because we encode in utf8 when putting into the
#   db and reading back in?
# - Have not yet tested writing out just meta or just fulltxt to the SQL db
#   and reading just that back in from the SQL db while the remainder is
#   read back in from the docsql .xml files.
# - Ask if I can assume that all SQL dbs (not just MySQL) will preserve the
#   order of inserted nodes (sections), which in this case made it easy to
#   reconstruct the doc_obj in memory in the correct order.
# - No more docoid in docsql .xml filename: set OID as an attribute of the
#   root element inside the docsql.xml file instead, and parse it out.
# - Deal with incremental vs removeold. If docs are removed from the import
#   folder, then the import step won't delete them from archives but the
#   buildcol step will. Need to implement this with this database plugin or
#   wherever the actual flow is.
# - Add public instructions on using this plugin and its plugout: start
#   with installing the mysql binary, changing the pwd, running the server
#   (and the client against it for checking, basic cmds like create and
#   drop). Then discuss db name, table names (per coll), db cols and col
#   types, and how the plugout and plugin work. Discuss the plugin/plugout
#   parameters.

# Establish inheritance from GreenstoneXMLPlugin at compile time.
sub BEGIN {
    @GreenstoneSQLPlugin::ISA = ('GreenstoneXMLPlugin');
}

# This plugin must be in the document plugins pipeline IN PLACE OF
# GreenstoneXMLPlugin, so we won't have a process_exp conflict here.
# The structure of docsql.xml files is identical to doc.xml and the contents
# are similar except:
# - since metadata and/or fulltxt are stored in the mysql db instead, just
#   XML comments indicating this are left inside docsql.xml within the
#   corresponding (meta and/or txt) elements
# - the root element Archive now has a docoid attribute

# Returns the default filename regexp this plugin processes:
# docsql.xml, optionally with a numeric suffix (docsql-N.xml).
sub get_default_process_exp {
    my $self = shift (@_);

    # regex based on this method in GreenstoneXMLPlugin
    return q^(?i)docsql(-\d+)?\.xml$^;
    # no longer storing the OID embedded in the docsql .xml filename:
    #return q^(?i)docsql(-.+)?\.xml$^;
}

# Allowed values for the process_mode plugin option: read back only the
# metadata, only the fulltext, or both from the SQL db.
my $process_mode_list =
    [ { 'name' => "meta_only",
	'desc' => "{GreenstoneSQLPlug.process_mode.meta_only}" },
      { 'name' => "text_only",
	'desc' => "{GreenstoneSQLPlug.process_mode.text_only}" },
      { 'name' => "all",
	'desc' => "{GreenstoneSQLPlug.process_mode.all}" } ];

# Plugin argument declarations (shown by pluginfo and the GLI).
my $arguments =
    [ { 'name' => "process_exp",
	'desc' => "{BaseImporter.process_exp}",
	'type' => "regexp",
	'deft' => &get_default_process_exp(),
	'reqd' => "no" },
      { 'name' => "process_mode",
	'desc' => "{GreenstoneSQLPlug.process_mode}",
	'type' => "enum",
	'list' => $process_mode_list,
	'deft' => "all",
	'reqd' => "no"},
      { 'name' => "db_driver",
	'desc' => "{GreenstoneSQLPlug.db_driver}",
	'type' => "string",
	'deft' => "mysql",
	'reqd' => "yes"},
      { 'name' => "db_client_user",
	'desc' => "{GreenstoneSQLPlug.db_client_user}",
	'type' => "string",
	'deft' => "root",
	'reqd' => "yes"},
      { 'name' => "db_client_pwd",
	'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
	'type' => "string",
	'deft' => "",
	'reqd' => "yes"}, # pwd required?
      { 'name' => "db_host",
	'desc' => "{GreenstoneSQLPlug.db_host}",
	'type' => "string",
	'deft' => "127.0.0.1",
	'reqd' => "yes"},
      { 'name' => "db_encoding",
	'desc' => "{GreenstoneSQLPlug.db_encoding}",
	'type' => "string",
	'deft' => "utf8",
	'reqd' => "yes"}
    ];

my $options = { 'name' => "GreenstoneSQLPlugin",
		'desc' => "{GreenstoneSQLPlugin.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args' => $arguments };

# TODO: For on cancel, add a SIGTERM handler or so to call end()
# or to explicitly call gs_sql->close_connection if $gs_sql def

# Constructor. Registers this plugin's arguments/options and delegates the
# rest of construction to the superclass, GreenstoneXMLPlugin.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = new GreenstoneXMLPlugin($pluginlist, $inputargs, $hashArgOptLists);

    # Re-bless into this class so overridden methods are dispatched here.
    $self = bless $self, $class;

    if ($self->{'info_only'}) {
	# If running pluginfo, we don't need to go further.
	return $self;
    }

    # do anything else that needs to be done here when not pluginfo

    return $self;
}

# XML start-tag callback. docsql.xml files carry the document OID as a
# "docoid" attribute on the root Archive element; capture it here. All
# other elements are handled by the superclass as for doc.xml.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $outhandle = $self->{'outhandle'};

    $self->{'element'} = $element;
    if ($element eq "Archive") { # docsql.xml files contain a OID attribute on Archive element
	# The element's attributes are in %_ as per ReadXMLFile::xml_start_tag()
	# (while $_ contains the tag). Don't access $_{'docoid'} directly:
	# %_ is the element's attr hashmap whereas $_ has the tag info, and
	# mixing the two produces "use $_{...} for scalar contexts" warnings.
	my %attr_hash = %_; # right way, see OAIPlugin.pm
	$self->{'doc_oid'} = $attr_hash{'docoid'};
	print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n"
	    if $self->{'verbosity'} > 1;
    }
    else {
	# let superclass GreenstoneXMLPlugin continue to process the
	# remaining elements
	$self->SUPER::xml_start_tag(@_);
    }
}

# TODO Q: Why are there 3 passes when we're only indexing at doc and
# section level (2 passes)?

# Called when the closing Archive tag of a docsql.xml file is reached.
# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
# the doc_obj in memory is processed (indexed) and then made undef.
# So we have to read this doc's meta and/or fulltext back out of the SQL db
# into doc_obj BEFORE the superclass close_document() is invoked.
sub close_document {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $doc_obj = $self->{'doc_obj'};

    my $gs_sql = $self->{'gs_sql'};

    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag()
    print $outhandle "==== OID of document (meta|text) to be read in from DB: $oid\n"
	if $self->{'verbosity'} > 1;

    # For now, we have access to doc_obj (until just before
    # super::close_document() terminates).
    # set_OID() is a complex method. Is this necessary, since we just want
    # to write meta and txt for the docobj to index?
    $self->{'doc_obj'}->set_OID($oid);

    # checking that complicated looking method set_OID() hasn't modified oid
    if($oid ne $self->{'doc_obj'}->get_OID()) {
	print STDERR "@@@@ WARNING: OID after setting on doc_obj = " . $self->{'doc_obj'}->get_OID() . " and is not the same as original OID $oid from docsqloid.xml filename\n";
    }

    # TODO: This function is called on a per doc.xml file basis, but we can
    # process all docs of a collection in one go when dealing with the SQL
    # tables for the collection. How and where should we read in the
    # collection tables then?
    # TODO: Perhaps MySQLPlugout could write out a token file (.gssql) into
    # archives during import.pl and, if that file is detected, then
    # MySQLPlugin::read() is passed in that file during buildcol.pl. And
    # that file will trigger reading the 2 tables for the collection???

    my $proc_mode = $self->{'process_mode'};

    if($proc_mode eq "all" || $proc_mode eq "meta_only") {
	# read in meta for this doc from the collection's metadata table
	my $sth = $gs_sql->select_from_metatable_matching_docid($oid);
	print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 1;

	print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 1;
	# https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
	while( my @row = $sth->fetchrow_array() ) {
	    my ($primary_key, $did, $sid, $metaname, $metaval) = @row;

	    # get rid of the artificial "root" introduced in the section id
	    # when saving to the sql db
	    # NOTE(review): this strips a leading "root" from ANY sid
	    # (e.g. "root.1" -> ".1"), whereas the text branch below only
	    # maps the exact sid "root" -- confirm which is intended and
	    # make them consistent.
	    $sid =~ s@^root@@;
	    $sid = $doc_obj->get_top_section() unless $sid;
	    print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n" if $self->{'verbosity'} > 1;

	    # we accessed the db in utf8 mode, so we can call
	    # doc_obj->add_utf8_metadata directly:
	    $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval));
	}
	print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n" if $self->{'verbosity'} > 1;
    }

    if($proc_mode eq "all" || $proc_mode eq "text_only") {
	# read in fulltxt for this doc from the collection's fulltxt table
	my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
	print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 1;

	print $outhandle "----------\nSQL DB contains txt entries for-----------\n" if $self->{'verbosity'} > 1;
	while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {
	    # get rid of the artificial "root" introduced in the section id
	    # when saving to the sql db
	    #$sid =~ s@^root@@;
	    $sid = $doc_obj->get_top_section() if ($sid eq "root");
	    print $outhandle "### did: $did, sid: |$sid|, fulltext: \n" if $self->{'verbosity'} > 1;

	    # TODO - pass by ref?
	    # we accessed the db in utf8 mode, so we can call
	    # doc_obj->add_utf8_text directly:
	    $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text));
	}
	print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n" if $self->{'verbosity'} > 1;
    }

    # The superclass close_document() gets the doc_obj indexed and then
    # makes it undef -- don't forget that cleanup happens there.
    $self->SUPER::close_document(@_);
}

# TODO: only want to work with the sql db if buildcol.pl. Unfortunately,
# this also runs on import.pl.

# Connect to the SQL database and select the site's db.
# Called init() not begin() because there can be multiple plugin passes
# (one for doc level and another for section level indexing) and init()
# is called once before all passes -- this way we connect to the SQL
# database once per buildcol run. Dies if the connection or db selection
# fails (fatal for this plugin).
sub init {
    my ($self) = shift (@_);
    # print STDERR "@@@@@@@@@@ INIT CALLED\n";

    # super (GreenstoneXMLPlugin) will not yet be trying to read from
    # doc.xml (docsql .xml) files in init()
    $self->SUPER::init(@_);

    my $gs_sql = new gssql({
	'collection_name' => $ENV{'GSDLCOLLECTION'},
	'db_encoding' => $self->{'db_encoding'}
			   });

    # try connecting to the mysql db, if that fails it will die
    if(!$gs_sql->connect_to_db({
	'db_driver' => $self->{'db_driver'},
	'db_client_user' => $self->{'db_client_user'},
	'db_client_pwd' => $self->{'db_client_pwd'},
	'db_host' => $self->{'db_host'}
			       })
	)
    {
	# This is fatal for the plugin, let's terminate here.
	# PrintError would already have displayed the warning message on
	# connection fail.
	die("Could not connect to db. Can't proceed.\n");
    }

    # one database per GS3 site, for GS2 the db is called greenstone2
    my $db_name = $self->{'site_name'} || "greenstone2";

    #my $build_mode = $self->{'build_mode'} || "removeold";

    # the db and its tables should already exist. Attempt to use the db:
    if(!$gs_sql->use_db($db_name)) {
	# This is fatal for the plugin, let's terminate here after
	# disconnecting again. PrintError would already have displayed the
	# warning message on load fail.
	$gs_sql->disconnect_from_db()
	    || warn("Unable to disconnect from database.\n");
	die("Could not use db $db_name. Can't proceed.\n");
    }

    # store db handle now that we're connected
    $self->{'gs_sql'} = $gs_sql;
}

# Disconnect from the SQL database, if init() connected.
# This method also runs on import.pl if gs_sql has a value, but we just
# want to run it on buildcol. Called deinit() not end() because there can
# be multiple plugin passes (one for doc level and another for section
# level indexing) and deinit() runs once after all passes -- this way we
# close the SQL database once per buildcol run.
sub deinit {
    my ($self) = shift (@_);

    if($self->{'gs_sql'}) { # can cover TODO: only want to work with sql db if buildcol.pl
	# Fix: site_name is undef for GS2 (see init()), which previously
	# produced an undef-concatenation warning here; fall back to the
	# same default db name used in init().
	my $db_name = $self->{'site_name'} || "greenstone2";
	$self->{'gs_sql'}->disconnect_from_db()
	    || warn("Unable to disconnect from database " . $db_name . "\n");
    }

    # print STDERR "@@@@@@@@@@ DEINIT CALLED\n";

    $self->SUPER::deinit(@_);
}