########################################################################### # # GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml # Metadata and/or fulltext are stored in SQL db, the rest may be stored in # the docsql .xml files. # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 2001 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### package GreenstoneSQLPlugin; use strict; no strict 'refs'; # allow filehandles to be variables and viceversa use DBI; use docprint; # for new unescape_text() subroutine use GreenstoneXMLPlugin; use gssql; # TODO: # - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge. # - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that # back in from the sql db while the remainder is to be read back in from the docsql .xml files. # TODO: deal with incremental vs removeold. If docs removed from import folder, then import step # won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is # TODO Q: is "reindex" = del from db + add to db? 
# - is this okay for reindexing, or will it need to modify existing values (update table) # - if it's okay, what does reindex need to accomplish (and how) if the OID changes because hash id produced is different? # - delete is accomplished in GS SQL Plugin, during buildcol.pl. When should reindexing take place? # during SQL plugout/import.pl or during plugin? If adding is done by GSSQLPlugout, does it need to # be reimplemented in GSSQLPlugin to support the adding portion of reindexing. # TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking: basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work. # Discuss the plugin/plugout parameters. # DONE: # + TODO: Incremental delete can't work until GSSQLPlugout has implemented build_mode = incremental # (instead of tossing away db on every build) # + Ask about docsql naming convention adopted to identify OID. Better way? # collection names -> table names: it seems hyphens not allowed. Changed to underscores. # + Startup parameters (except removeold/build_mode) # + how do we detect we're to do removeold during plugout in import.pl phase # + incremental building: where do we need to add code to delete rows from our sql table after # incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta? # + Ask if I can assume that all SQL dbs (not just MySQL) will preserve the order of inserted nodes # (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order. # YES: Otherwise for later db types (drivers), can set order by primary key column and then order by did column # + NOTTODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt? 
# NOT A PROBLEM: Tested to find DBI connection attempt fails immediately when MySQL server not
# running. The GLI "paralyzing" incident last time was not because of the gs sql connection code,
# but because my computer was freezing on-and-off.
# + "Courier" demo documents in lucene-sql collection: character (degree symbol) not preserved
#   in title. Is this because we encode in utf8 when putting into db and reading back in?
#   Test doc with meta and text like macron in Maori text.
# + TODO Q: During import, the GS SQL Plugin is called before the GS SQL Plugout with the
#   undesirable side effect that if the db doesn't exist, gssql::use_db() fails, as it won't
#   create the db. This got fixed when GSSQLPlugin stopped connecting on init().

########################################################################################
# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or fulltext
# is still written out to doc.xml (docsql .xml), that will be processed as usual,
# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
# is written out by GreenstoneSQLPlugout into the SQL db).
########################################################################################

# Establish inheritance: GreenstoneSQLPlugin is a subclass of GreenstoneXMLPlugin.
# Done in BEGIN so @ISA is set before any method resolution takes place.
sub BEGIN { @GreenstoneSQLPlugin::ISA = ('GreenstoneXMLPlugin'); }

# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin,
# so we won't have a process_exp conflict here.
# The structure of docsql.xml files is identical to doc.xml and the contents are similar except:
# - since metadata and/or fulltxt are stored in the SQL db instead, just XML comments indicating
#   this are left inside docsql.xml in place of the metadata and/or content elements
# - the root element Archive now has a docoid attribute

# Returns the default filename regex that decides which archive files this plugin
# processes. Regex based on the equivalent method in GreenstoneXMLPlugin.
# (An earlier scheme, q^(?i)docsql(-.+)?\.xml$^, embedded the OID in the docsql
# filename; the OID is no longer stored there, hence the numeric-suffix-only form.)
sub get_default_process_exp {
    my $self = shift (@_);
    return q^(?i)docsql(-\d+)?\.xml$^;
}

# Supported values for the -process_mode option (strings are GLI resource keys).
my $process_mode_list = [
    { 'name' => "meta_only", 'desc' => "{GreenstoneSQLPlug.process_mode.meta_only}" },
    { 'name' => "text_only", 'desc' => "{GreenstoneSQLPlug.process_mode.text_only}" },
    { 'name' => "all",       'desc' => "{GreenstoneSQLPlug.process_mode.all}" }
];

# Plugin argument declarations, merged into the inherited argument list in new().
my $arguments = [
    { 'name' => "process_exp",
      'desc' => "{BaseImporter.process_exp}",
      'type' => "regexp",
      'deft' => &get_default_process_exp(),
      'reqd' => "no" },
    { 'name' => "process_mode",
      'desc' => "{GreenstoneSQLPlug.process_mode}",
      'type' => "enum",
      'list' => $process_mode_list,
      'deft' => "all",
      'reqd' => "no" },
    { 'name' => "db_driver",
      'desc' => "{GreenstoneSQLPlug.db_driver}",
      'type' => "string",
      'deft' => "mysql",
      'reqd' => "yes" },
    { 'name' => "db_client_user",
      'desc' => "{GreenstoneSQLPlug.db_client_user}",
      'type' => "string",
      'deft' => "root",
      'reqd' => "yes" },
    { 'name' => "db_client_pwd",
      'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
      'type' => "string",
      'deft' => "",
      'reqd' => "yes" },    # TODO: is a pwd really required?
    { 'name' => "db_host",
      'desc' => "{GreenstoneSQLPlug.db_host}",
      'type' => "string",
      'deft' => "127.0.0.1",
      'reqd' => "yes" },
];

my $options = { 'name'     => "GreenstoneSQLPlugin",
                'desc'     => "{GreenstoneSQLPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };

# TODO: For on cancel, add a SIGTERM handler or so to call end()
# or to explicitly call gs_sql->close_connection if $gs_sql def

# Constructor. Registers this plugin's arguments/options and chains to the
# GreenstoneXMLPlugin constructor, then re-blesses the result into this class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow-form constructor call (avoids Perl's ambiguous indirect object syntax).
    my $self = GreenstoneXMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    $self = bless $self, $class;

    if ($self->{'info_only'}) {
        # If running pluginfo, we don't need to go further.
        return $self;
    }

    # do anything else that needs to be done here when not pluginfo

    return $self;
}

# This is called once if removeold is set with import.pl. Most plugins will do
# nothing, but if a plugin does any stuff outside of creating the doc obj, then
# it may need to clear something.
# In the case of the GreenstoneSQL plugs: this is the first time we have a chance
# to purge the tables of the current collection from the current site's database.
sub remove_all {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    print STDERR " Building with removeold option set, so deleting current collection's tables if they exist\n" if($self->{'verbosity'});

    # If we're in here, we'd already have run 'use database <site_name>;' during sub init(),
    # so we can go ahead and delete the collection's tables.
    my $gs_sql = $self->{'gs_sql'};
    $gs_sql->delete_collection_tables(); # will delete them if they exist

    # Recreate whichever tables this plugin's process_mode needs.
    # (Tables' existence is also ensured in GreenstoneSQLPlugout::begin().)
    my $proc_mode = $self->{'process_mode'};
    if($proc_mode ne "text_only") {
        $gs_sql->ensure_meta_table_exists();
    }
    if($proc_mode ne "meta_only") {
        $gs_sql->ensure_fulltxt_table_exists();
    }
}

# This is called per document for docs that have been deleted from the
# collection. Most plugins will do nothing, but if a plugin does any stuff
# outside of creating the doc obj, then it may need to clear something.
# Removes the doc(s) denoted by $oids from the GS SQL db.
# This takes care of incremental deletes (docs marked D by ArchivesInfPlugin when building
# incrementally) as well as cases of "Non-incremental Delete", see ArchivesInfPlugin.pm.
# Returns 1 on success, 0 if there is no db connection to work with.
sub remove_one {
    my $self = shift (@_);
    my ($file, $oids, $archivedir) = @_;

    print STDERR "@@@ IN SQLPLUG::REMOVE_ONE: $file\n";

    # Deliberately NOT checking can_process_this_file($file) here:
    # we don't care if it's e.g. an image file that was deleted;
    # we care about removing the docoid of that file from the db.

    print STDERR "*****************************\nAsked to remove_one oid\n***********************\n";

    my $gs_sql = $self->{'gs_sql'};
    return 0 unless $gs_sql; # couldn't make the connection or no db etc

    if(scalar @$oids > 1) {
        print STDERR "TODO: We now have reason to optimise GreenstoneSQLPlugin::remove_one() by using prepare and execute.\n";
    }

    my $proc_mode = $self->{'process_mode'};
    foreach my $oid (@$oids) {
        if($proc_mode eq "all" || $proc_mode eq "meta_only") {
            print STDERR "@@@@@@@@ Deleting $oid from meta table\n" if $self->{'verbosity'} > 2;
            $gs_sql->delete_recs_from_metatable_with_docid($oid);
        }
        if($proc_mode eq "all" || $proc_mode eq "text_only") {
            print STDERR "@@@@@@@@ Deleting $oid from fulltxt table\n" if $self->{'verbosity'} > 2;
            $gs_sql->delete_recs_from_texttable_with_docid($oid);
        }
    }
    return 1;
}

# Expat start-tag handler. For the root Archive element of a docsql.xml file,
# extracts the docoid attribute and remembers it as the current doc's OID;
# every other element is passed through to GreenstoneXMLPlugin's handler.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $outhandle = $self->{'outhandle'};

    $self->{'element'} = $element;
    if ($element eq "Archive") { # docsql.xml files contain a docoid attribute on the Archive element
        # The element's attributes are in %_ as per ReadXMLFile::xml_start_tag() (while $_
        # contains the tag). Don't access $_{'docoid'} directly: %_ is the element's attr
        # hashmap whereas $_ has the tag info, and mixing them triggers warnings.
        my %attr_hash = %_; # right way, see OAIPlugin.pm
        $self->{'doc_oid'} = $attr_hash{'docoid'};
        print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n"
            if $self->{'verbosity'} > 2;
    }
    else { # let superclass GreenstoneXMLPlugin continue to process all other elements
        $self->SUPER::xml_start_tag(@_);
    }
}

# TODO Q: Why are there 4 passes when we're only indexing at doc and section level (2 passes)?
# What's the dummy pass, why is there a pass for infodb?

# We should only ever get here during the buildcol.pl phase.
# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
# the doc_obj in memory is processed (indexed) and then made undef.
# So we have to read this doc's meta/fulltext out of the SQL db and into the
# doc_obj BEFORE the superclass close_document() is finished.
sub close_document {
    my $self = shift(@_);

    my $gs_sql = $self->{'gs_sql'};
    my $outhandle = $self->{'outhandle'};
    my $doc_obj = $self->{'doc_obj'};

    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag()
    my $proc_mode = $self->{'process_mode'};

    # For now, we have access to doc_obj (until just before super::close_document() terminates).
    # No need to call $self->{'doc_obj'}->set_OID($oid): either the OID is stored in the SQL db
    # as meta 'Identifier' alongside other metadata, or it's stored in the doc.xml as metadata
    # 'Identifier'. Either way, Identifier meta will be read into the doc_obj automatically.

    print STDERR " GreenstoneSQLPlugin processing doc $oid (reading into docobj from SQL db)\n" if $self->{'verbosity'} > 0;

    if($proc_mode eq "all" || $proc_mode eq "meta_only") {
        # read in meta for the doc (i.e. select * from <coll>_metadata table matching docid)
        my $sth = $gs_sql->select_from_metatable_matching_docid($oid);
        print $outhandle "### SQL select stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 2;

        print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2;
        # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
        while( my @row = $sth->fetchrow_array() ) {
            my ($primary_key, $did, $sid, $metaname, $metaval) = @row;

            # get rid of the artificial "root" prefix introduced in section ids when saving to
            # the sql db; an empty remainder means this is the document's top section
            $sid =~ s@^root@@;
            $sid = $doc_obj->get_top_section() unless $sid;
            print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n" if $self->{'verbosity'} > 2;

            # we accessed the db in utf8 mode, so we can call add_utf8_metadata directly
            $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval));
        }
        print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n" if $self->{'verbosity'} > 2;
    }

    if($proc_mode eq "all" || $proc_mode eq "text_only") {
        # read in fulltxt for the doc (i.e. select * from <coll>_fulltxt table matching docid)
        my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
        print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 2;

        print $outhandle "----------\nSQL DB contains txt entries for-----------\n" if $self->{'verbosity'} > 2;
        while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {
            # map the artificial "root" section id back to the doc's top section
            # NOTE(review): the meta branch above strips a "root" *prefix* whereas this branch
            # only handles an exact "root" match — presumably fulltext is only ever stored at
            # whole-section ids; confirm against GreenstoneSQLPlugout.
            $sid = $doc_obj->get_top_section() if ($sid eq "root");
            print $outhandle "### did: $did, sid: |$sid|, fulltext: \n" if $self->{'verbosity'} > 2;

            # TODO - pass by ref?
            # we accessed the db in utf8 mode, so we can call add_utf8_text directly
            $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text));
        }
        print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n" if $self->{'verbosity'} > 2;
    }

    # done reading into doc_obj from SQL db

    # don't forget to clean up on close() in superclass;
    # it will get the doc_obj indexed then make it undef
    $self->SUPER::close_document(@_);
}

# TODO: only want to work with the sql db if buildcol.pl; unfortunately this also runs on
# import.pl. During import, the GS SQL Plugin is called before the GS SQL Plugout with the
# undesirable side effect that if the db doesn't exist, gssql::use_db() fails, as it won't
# create the db.
# GS SQL Plugin::init() (and deinit()) is called by import.pl and also by buildcol.pl.
# This means it connects and disconnects during import.pl as well. This is okay as removeold,
# which should drop the collection tables, happens during the import phase and therefore also
# requires a db connection.
# TODO: Eventually can try moving get_gssql_instance into gssql.pm? That way both GS SQL Plugin
# and Plugout would be using one connection during the import.pl phase when both plugs exist.

# Call init() not begin() because there can be multiple plugin passes and begin() is called for
# each pass (one for doc level and another for section level indexing), whereas init() is
# called before any and all passes.
# This way, we connect to the SQL database once per buildcol run.
# Dies if the db connection cannot be made or the site's database cannot be used/created.
sub init {
    my ($self) = shift (@_);
    $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init()

    # Create the gssql object. The collection name is used for naming tables
    # (the site name is used for naming the database).
    my $gs_sql = gssql->new({ 'collection_name' => $ENV{'GSDLCOLLECTION'},
                              'verbosity'       => $self->{'verbosity'} });

    # try connecting to the mysql db; if that fails, die
    if(!$gs_sql->connect_to_db({ 'db_driver'      => $self->{'db_driver'},
                                 'db_client_user' => $self->{'db_client_user'},
                                 'db_client_pwd'  => $self->{'db_client_pwd'},
                                 'db_host'        => $self->{'db_host'} })
       )
    {
        # This is fatal for the plugin, let's terminate here.
        # PrintError would already have displayed the warning message on connection fail.
        die("Could not connect to db. Can't proceed.\n");
    }

    # one database per GS3 site; for GS2 the db is called greenstone2
    my $db_name = $self->{'site_name'} || "greenstone2";

    # Attempt to use the db, create it if it doesn't exist (but don't create the tables yet).
    # Bail if we can't use the database.
    if(!$gs_sql->use_db($db_name)) {
        # This is fatal for the plugin, let's terminate here after disconnecting again.
        # PrintError would already have displayed the warning message on load fail.
        $gs_sql->disconnect_from_db()
            || warn("Unable to disconnect from database.\n");
        die("Could not use db $db_name. Can't proceed.\n");
    }

    # store the db handle now that we're connected
    $self->{'gs_sql'} = $gs_sql;
}

# This method also runs on import.pl if gs_sql has a value. But we just want to run it on buildcol.
# Call deinit() not end() because there can be multiple plugin passes:
# one for doc level and another for section level indexing,
# and deinit() should be called after all passes.
# This way, we close the SQL database once per buildcol run.
# Tear-down counterpart of init(): runs once after all plugin passes (for both
# import.pl and buildcol.pl). If a db connection was opened in init(), close it
# and drop the handle from $self so any later use must reconnect explicitly.
sub deinit {
    my $self = shift(@_);

    my $gs_sql = $self->{'gs_sql'};
    # gs_sql only has a value when we actually connected (i.e. during buildcol,
    # when processor =~ m/buildproc$/), so this is a no-op otherwise.
    if ($gs_sql) {
        unless ($gs_sql->disconnect_from_db()) {
            warn("Unable to disconnect from database " . $self->{'site_name'} . "\n");
        }
        # Explicitly delete the gs_sql key (setting it to undef has a different
        # meaning from deleting), so all future use has to reconnect.
        delete $self->{'gs_sql'};
    }

    $self->SUPER::deinit(@_);
}