###########################################################################
#
# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql-.xml
# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
# the docsql .xml files.
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package GreenstoneSQLPlugin;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

use DBI;                 # database-independent interface (used via gssql)
use docprint;            # for new unescape_text() subroutine
use GreenstoneXMLPlugin; # superclass: still parses the docsql .xml files
use gssql;               # wrapper around the collection's SQL connection/tables

# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or
# fulltext is still written out to doc.xml (docsql .xml), that will be
# processed as usual, whereas GreenstoneSQLPlugin will process all the rest
# (full text and/or meta, whichever is written out by GreenstoneSQLPlugout
# into the SQL db).

# TODO:
# - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge.
#   Ask about docsql naming convention adopted to identify OID. Better way?
#   collection names -> table names: it seems hyphens not allowed. Changed
#   to underscores.
# - Startup parameters
# - incremental building: where do we need to add code to delete rows from
#   our sql table after incrementally importing a coll with fewer docs (for
#   instance)? What about deleted/modified meta?
# - Courier documents in lucene-sql collection: character (degree symbol)
#   not preserved. Is this because we encode in utf8 when putting into db
#   and reading back in?
# - Have not yet tested writing out just meta or just fulltxt to sql db and
#   reading just that back in from the sql db while the remainder is to be
#   read back in from the docsql .xml files.
# - Ask if I can assume that all SQL dbs (not just MySQL) will preserve the
#   order of inserted nodes (sections) which in this case had made it easy
#   to reconstruct the doc_obj in memory in the correct order
# - no more docoid in docsql .xml filename: set OID as attribute of root
#   element inside docsql.xml file instead and parse it out
# - deal with incremental vs removeold. If docs removed from import folder,
#   then import step won't delete it from archives but buildcol step will.
#   Need to implement this with this database plugin or wherever the actual
#   flow is

# Establish inheritance at compile time: this plugin is a subclass of
# GreenstoneXMLPlugin.
sub BEGIN {
    @GreenstoneSQLPlugin::ISA = ('GreenstoneXMLPlugin');
}

# This plugin must be in the document plugins pipeline IN PLACE OF
# GreenstoneXMLPlugin. So we won't have a process exp conflict here.
# Returns the default filename regexp this plugin claims in the pipeline.
# Matches the docsql .xml files written out by GreenstoneSQLPlugout; the
# document OID is embedded in the filename (e.g. "docsql-HASH....xml").
sub get_default_process_exp {
    my $self = shift(@_);

    #return q^(?i)docsql(-\d+)?\.xml$^;
    return q^(?i)docsql(-.+)?\.xml$^;
}

# Allowed values for the -process_mode option: read only metadata, only
# fulltext, or both back out of the SQL database.
my $process_mode_list =
    [ { 'name' => "meta_only",
	'desc' => "{GreenstoneSQLPlug.process_mode.meta_only}" },
      { 'name' => "text_only",
	'desc' => "{GreenstoneSQLPlug.process_mode.text_only}" },
      { 'name' => "all",
	'desc' => "{GreenstoneSQLPlug.process_mode.all}" } ];

my $arguments =
    [ { 'name' => "process_exp",
	'desc' => "{BaseImporter.process_exp}",
	'type' => "regexp",
	'deft' => get_default_process_exp(),  # plain call: &-call syntax is legacy
	'reqd' => "no" },
      { 'name' => "process_mode",
	'desc' => "{GreenstoneSQLPlug.process_mode}",
	'type' => "enum",
	'list' => $process_mode_list,
	'deft' => "all",
	'reqd' => "no"},
      { 'name' => "db_driver",
	'desc' => "{GreenstoneSQLPlug.db_driver}",
	'type' => "string",
	'deft' => "mysql",
	'reqd' => "yes"},
      { 'name' => "db_client_user",
	'desc' => "{GreenstoneSQLPlug.db_client_user}",
	'type' => "string",
	'deft' => "root",
	'reqd' => "yes"},
      { 'name' => "db_client_pwd",
	'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
	'type' => "string",
	'deft' => "",
	'reqd' => "yes"}, # TODO: is a pwd really required?
      { 'name' => "db_host",
	'desc' => "{GreenstoneSQLPlug.db_host}",
	'type' => "string",
	'deft' => "127.0.0.1",
	'reqd' => "yes"},
      { 'name' => "db_encoding",
	'desc' => "{GreenstoneSQLPlug.db_encoding}",
	'type' => "string",
	'deft' => "utf8",
	'reqd' => "yes"}
    ];

my $options = { 'name' => "GreenstoneSQLPlugin",
		'desc' => "{GreenstoneSQLPlugin.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args' => $arguments };

# TODO: For on cancel, add a SIGTERM handler or so to call end()
# or to explicitly call gs_sql->close_connection if $gs_sql def

# Constructor. Registers this plugin and its option list with the plugin
# framework, then delegates to the GreenstoneXMLPlugin constructor and
# reblesses the result into this class.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow syntax instead of the ambiguity-prone indirect object syntax
    # ("new GreenstoneXMLPlugin(...)").
    my $self = GreenstoneXMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    $self = bless $self, $class;

    if ($self->{'info_only'}) {
	# If running pluginfo, we don't need to go further.
	return $self;
    }

    # do anything else that needs to be done here when not pluginfo

    return $self;
}

# TODO Q: Why are there 3 passes when we're only indexing at doc and section
# level (2 passes)?

# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
# the doc_obj in memory is processed (indexed) and then made undef.
# So we have to work with doc_obj before superclass close_document() is
# finished: this override reads the current document's metadata and/or
# fulltext rows back out of the SQL db (per -process_mode) and adds them to
# doc_obj, then chains to the superclass to index and discard it.
sub close_document {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $doc_obj = $self->{'doc_obj'};
    my $gs_sql = $self->{'gs_sql'};

    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub read()
    print $outhandle "==== OID of document (meta|text) to be read in from DB: $oid\n"
	if $self->{'verbosity'} > 1;

    # For now, we have access to doc_obj (until just before
    # super::close_document() terminates).
    # set_OID() is a complex method; check it hasn't modified the oid.
    $self->{'doc_obj'}->set_OID($oid);
    if ($oid ne $self->{'doc_obj'}->get_OID()) {
	print STDERR "@@@@ WARNING: OID after setting on doc_obj = " . $self->{'doc_obj'}->get_OID() . " and is not the same as original OID $oid from docsqloid.xml filename\n";
    }

    # TODO: This function is called on a per doc.xml file basis, but we could
    # process all docs of a collection in one go when dealing with the SQL
    # tables. How and where should we read in the collection tables then?
    # Perhaps the plugout could write a token file (.gssql) into archives
    # during import.pl; detecting it in read() during buildcol.pl could
    # trigger reading both collection tables at once.

    my $proc_mode = $self->{'process_mode'};

    if ($proc_mode eq "all" || $proc_mode eq "meta_only") {
	# read in meta for this doc (i.e. select from the _metadata table)
	my $sth = $gs_sql->select_from_metatable_matching_docid($oid);
	print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 1;

	print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 1;
	# https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
	while (my @row = $sth->fetchrow_array()) {
	    my ($primary_key, $did, $sid, $metaname, $metaval) = @row;

	    # get rid of the artificial "root" introduced in section id
	    # when saving to sql db
	    $sid =~ s@^root@@;
	    $sid = $doc_obj->get_top_section() unless $sid;
	    print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
		if $self->{'verbosity'} > 1;

	    # we accessed the db in utf8 mode, so we can call
	    # add_utf8_metadata directly
	    $doc_obj->add_utf8_metadata($sid, $metaname, docprint::unescape_text($metaval));
	}
	print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
	    if $self->{'verbosity'} > 1;
    }

    if ($proc_mode eq "all" || $proc_mode eq "text_only") {
	# read in fulltxt for this doc (i.e. select from the _fulltxt table)
	my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
	print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 1;

	print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
	    if $self->{'verbosity'} > 1;
	while (my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array()) {
	    # undo the artificial "root" introduced in section id when
	    # saving to sql db
	    $sid = $doc_obj->get_top_section() if ($sid eq "root");
	    print $outhandle "### did: $did, sid: |$sid|, fulltext: \n"
		if $self->{'verbosity'} > 1;

	    # TODO - pass by ref?
	    # we accessed the db in utf8 mode, so we can call add_utf8_text
	    # directly
	    $doc_obj->add_utf8_text($sid, docprint::unescape_text($text));
	}
	print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
	    if $self->{'verbosity'} > 1;
    }

    # don't forget to clean up on close() in superclass:
    # it will get the doc_obj indexed then make it undef
    $self->SUPER::close_document(@_);
}

# TODO: only want to work with sql db if buildcol.pl. Unfortunately, also
# runs on import.pl.
#
# Called init() not begin() because there can be multiple plugin passes
# (one for doc level and another for section level indexing) and init() is
# called once before all passes. This way, we connect to the SQL database
# once per buildcol run. Dies if the db connection or db selection fails.
sub init {
    my ($self) = shift(@_);
    $self->SUPER::init(@_); # superclass will not yet be trying to read
                            # from doc.xml (docsql .xml) files in init()

    my $gs_sql = gssql->new({ 'collection_name' => $ENV{'GSDLCOLLECTION'},
			      'db_encoding' => $self->{'db_encoding'} });

    # try connecting to the mysql db, if that fails it will die
    if (!$gs_sql->connect_to_db({ 'db_driver' => $self->{'db_driver'},
				  'db_client_user' => $self->{'db_client_user'},
				  'db_client_pwd' => $self->{'db_client_pwd'},
				  'db_host' => $self->{'db_host'} })
	)
    {
	# This is fatal for the plugout, let's terminate here.
	# PrintError would already have displayed the warning message on
	# connection fail.
	die("Could not connect to db. Can't proceed.\n");
    }

    # one database per GS3 site; for GS2 the db is called greenstone2
    my $db_name = $self->{'site_name'} || "greenstone2";
    #my $build_mode = $self->{'build_mode'} || "removeold";

    # the db and its tables should exist. Attempt to use the db:
    if (!$gs_sql->use_db($db_name)) {
	# This is fatal for the plugout, let's terminate here after
	# disconnecting again. PrintError would already have displayed the
	# warning message on load fail.
	$gs_sql->disconnect_from_db()
	    || warn("Unable to disconnect from database.\n");
	die("Could not use db $db_name. Can't proceed.\n");
    }

    # store db handle now that we're connected
    $self->{'gs_sql'} = $gs_sql;
}

# This method also runs on import.pl if gs_sql has a value. But we just want
# to run it on buildcol.
#
# Called deinit() not end() because there can be multiple plugin passes
# (one for doc level and another for section level indexing) and deinit() is
# called once after all passes. This way, we close the SQL database once per
# buildcol run.
sub deinit {
    my ($self) = shift(@_);

    if ($self->{'gs_sql'}) { # covers TODO: only want to work with sql db if buildcol.pl
	# site_name is undef on GS2, so fall back to the same default db
	# name used in init() to avoid an uninitialized-value warning
	my $db_name = $self->{'site_name'} || "greenstone2";
	$self->{'gs_sql'}->disconnect_from_db()
	    || warn("Unable to disconnect from database " . $db_name . "\n");
    }

    $self->SUPER::deinit(@_);
}

# Override of read(). When running buildcol.pl the filename matches
# "docsql-<OID>.xml"; when running import.pl it is the original document's
# filename. We only want to read in from the db when running buildcol.pl.
#
# doc_obj doesn't exist yet and only exists during super::read(): a new doc
# (doc_obj) is created in super::open_document() and is made undef again on
# super::close_document(). Further, we can't read the OID out of doc.xml:
# if no meta was stored in docsql.xml, the doc_obj read back from it has no
# OID. So the OID lives in the docsql.xml filename; extract it here and
# stash it on the plugin for close_document() to use.
sub read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor,
	$maxdocs, $total_count, $gli) = @_;

    # $file can have a containing subfolder besides the filename,
    # e.g. "dir/docsql-<OID>.xml"; the match below captures just the OID.
    if ($file =~ m/docsql-(.+?)\.xml$/) {
	$self->{'doc_oid'} = $1;
    }

    # Always read docsql.xml, as we then know doc structure, and assoc files
    # are dealt with. Plus we need to read docsql.xml if either meta or
    # fulltxt went into there instead of to the sql db.
    # Super::read() will open_doc, close_doc, then process doc_obj for
    # indexing, then undef doc_obj.
    return $self->SUPER::read(@_);
}