###########################################################################
#
# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml
# Metadata and/or fulltext are stored in the SQL db; the rest may be stored in
# the docsql .xml files.
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package GreenstoneSQLPlugin;

use strict;
no strict 'refs'; # allow filehandles to be variables and vice versa

use DBI;
use docprint; # for new unescape_text() subroutine
use GreenstoneXMLPlugin;
use gssql;

# TODO:
# - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge.
# - Have not yet tested writing out just meta or just fulltext to the SQL db and reading
#   just that back in from the SQL db while the remainder is read back in from the
#   docsql .xml files.

# TODO: Add public instructions on using this plugin and its plugout: start with
# installing the mysql binary, changing the password, and running the server (and the
# client against it for checking: basic commands like create and drop). Then discuss
# the db name, table names (per collection), db columns and column types, and how the
# plugout and plugin work. Discuss the plugin/plugout parameters.

# TODO: test on Windows and Mac.

# Note: if parsing fails (e.g. using the wrong plugout, like the GS XML plugout, which
# chokes on args intended for the SQL plugout) then SQL plugin init() would already have
# been called and made the connection, but disconnect would not happen, because the SQL
# plugin's disconnect would not be called upon parse failure.

# DONE:
# + TODO: Incremental delete can't work until GSSQLPlugout has implemented
#   build_mode = incremental (instead of tossing away the db on every build).
# + Ask about the docsql naming convention adopted to identify the OID. Better way?
#   collection names -> table names: it seems hyphens are not allowed. Changed to underscores.
# + Startup parameters (except removeold/build_mode).
# + How do we detect we're to do removeold during plugout in the import.pl phase?
# + Incremental building: where do we need to add code to delete rows from our SQL table
#   after incrementally importing a coll with fewer docs (for instance)? What about
#   deleted/modified meta?
# + Ask if I can assume that all SQL dbs (not just MySQL) will preserve the order of
#   inserted nodes (sections), which in this case made it easy to reconstruct the doc_obj
#   in memory in the correct order.
#   YES. Otherwise, for later db types (drivers), can order by the primary key column and
#   then by the did column.
# + NOTTODO: when the db is not running, GLI is paralyzed -> can we set a timeout on the
#   DBI connection attempt?
#   NOT A PROBLEM: tested to find that the DBI connection attempt fails immediately when
#   the MySQL server is not running. The GLI "paralyzing" incident last time was not
#   because of the gs sql connection code, but because my computer was freezing on and off.
# + "Courier" demo documents in the lucene-sql collection: a character (degree symbol)
#   was not preserved in the title. Is this because we encode in utf8 when putting into
#   the db and reading back in?
#   Test a doc with meta and text containing a macron, as in Maori text.
# + TODO Q: During import, the GS SQL Plugin is called before the GS SQL Plugout, with the
#   undesirable side effect that if the db doesn't exist, gssql::use_db() fails, as it
#   won't create the db. This got fixed when GSSQLPlugin stopped connecting on init().
#
# + TODO: deal with incremental vs removeold. If docs are removed from the import folder,
#   then the import step won't delete them from archives but the buildcol step will. Need
#   to implement this with this database plugin, or wherever the actual flow is.
#
# + TODO Q: is "reindex" = delete from db + add to db?
#   - Is this okay for reindexing, or will it need to modify existing values (update table)?
#   - If it's okay, what does reindex need to accomplish (and how) if the OID changes
#     because the hash id produced is different?
#   - Delete is accomplished in the GS SQL Plugin, during buildcol.pl. When should
#     reindexing take place: during the SQL plugout/import.pl, or during the plugin? If
#     adding is done by GSSQLPlugout, does it need to be reimplemented in GSSQLPlugin to
#     support the adding portion of reindexing?
#
# INCREMENTAL REBUILDING IMPLEMENTED CORRECTLY AND WORKS:
# Overriding the plugins' remove_all() method covered removeold.
# Overriding the plugins' remove_one() method is all that was needed for reindex and
# deletion (incremental and non-incremental) to work,
# but doing all this needed an overhaul of gssql.pm and its use by the GS SQL plugin
# and plugout:
# - needed to correct plugin.pm::remove_some() to process all files
# - and needed to correct GreenstoneSQLPlugin::close_document() to setOID() after all
# All incremental import and buildcol worked after that:
# - deleting files and running incr-import and incr-buildcol (= "incr delete"),
# - deleting files and running incr-import and buildcol (= "non-incr delete"),
# - modifying meta and doing an incr rebuild,
# - modifying fulltext and doing an incr rebuild,
# - renaming a file forces a reindex: the doc is removed from the db and added back in,
#   due to remove_one().
# - Tested CSV files: adding some records, changing some records.
#   + CSVPlugin test (collection csvsql)
#   + MetadataCSVPlugin test (modified collection sqltest to have metadata.csv refer to
#     the filenames of sqltest's documents)
#   + shared image test (collection shareimg): if 2 html files reference the same image,
#     the docs are indeed both reindexed if the image is modified (e.g. I replaced the
#     image with another of the same name), which in the GS SQL plugin/plugout case means
#     that the 2 docs are deleted and added in again.

########################################################################################
# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin, so that if meta or fulltext
# is still written out to doc.xml (docsql .xml), that will be processed as usual,
# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
# is written out by GreenstoneSQLPlugout into the SQL db).
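# Illustrative usage sketch (the option names come from the $arguments list below; the
# values shown are assumptions, not taken from this source): in a collection's collect.cfg,
# this plugin is listed in place of GreenstoneXMLPlugin, e.g.
#
#   plugin GreenstoneSQLPlugin -process_mode all -db_driver mysql
#       -db_client_user root -db_client_pwd mypassword -db_host 127.0.0.1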
sub BEGIN {
    @GreenstoneSQLPlugin::ISA = ('GreenstoneXMLPlugin');
}

# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin,
# so we won't have a process_exp conflict here.
# The structure of docsql.xml files is identical to doc.xml, and the contents are similar except:
# - since metadata and/or fulltxt are stored in the mysql db instead, just XML comments
#   indicating this are left inside docsql.xml, within the <Description> (for meta) and/or
#   <Content> (for txt) elements
# - the root element Archive now has a docoid attribute: <Archive docoid="...">
# (an illustrative sketch follows get_default_process_exp() below)
sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)docsql(-\d+)?\.xml$^; # regex based on this method in GreenstoneXMLPlugin
    #return q^(?i)docsql(-.+)?\.xml$^; # no longer storing the OID embedded in the docsql .xml filename
}
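# An illustrative sketch of a docsql.xml file as described above: the docoid value and
# the wording of the placeholder comments are invented for illustration; the element
# names follow doc.xml:
#
#   <?xml version="1.0" encoding="utf-8" standalone="no"?>
#   <Archive docoid="HASH0158f56086efffe8e8c39bc5">
#     <Section>
#       <Description>
#         <!-- metadata stored in SQL db -->
#       </Description>
#       <Content>
#         <!-- full text stored in SQL db -->
#       </Content>
#     </Section>
#   </Archive>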
my $process_mode_list =
    [ { 'name' => "meta_only",
        'desc' => "{GreenstoneSQLPlug.process_mode.meta_only}" },
      { 'name' => "text_only",
        'desc' => "{GreenstoneSQLPlug.process_mode.text_only}" },
      { 'name' => "all",
        'desc' => "{GreenstoneSQLPlug.process_mode.all}" } ];

my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'deft' => &get_default_process_exp(),
        'reqd' => "no" },
      { 'name' => "process_mode",
        'desc' => "{GreenstoneSQLPlug.process_mode}",
        'type' => "enum",
        'list' => $process_mode_list,
        'deft' => "all",
        'reqd' => "no"},
      { 'name' => "db_driver",
        'desc' => "{GreenstoneSQLPlug.db_driver}",
        'type' => "string",
        'deft' => "mysql",
        'reqd' => "yes"},
      { 'name' => "db_client_user",
        'desc' => "{GreenstoneSQLPlug.db_client_user}",
        'type' => "string",
        'deft' => "root",
        'reqd' => "yes"},
      { 'name' => "db_client_pwd",
        'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
        'type' => "string",
        'deft' => "",
        'reqd' => "yes"}, # pwd required?
      { 'name' => "db_host",
        'desc' => "{GreenstoneSQLPlug.db_host}",
        'type' => "string",
        'deft' => "127.0.0.1",
        'reqd' => "yes"},
    ];

my $options = { 'name' => "GreenstoneSQLPlugin",
                'desc' => "{GreenstoneSQLPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args' => $arguments };

# TODO: For on-cancel, add a SIGTERM handler or similar to call end(),
# or to explicitly call $gs_sql->close_connection if $gs_sql is defined.

sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = new GreenstoneXMLPlugin($pluginlist, $inputargs, $hashArgOptLists);

    #return bless $self, $class;
    $self = bless $self, $class;
    if ($self->{'info_only'}) {
        # If running pluginfo, we don't need to go further.
        return $self;
    }

    # do anything else that needs to be done here when not pluginfo

    return $self;
}

# This is called once if removeold is set with import.pl. Most plugins will do
# nothing, but if a plugin does any stuff outside of creating the doc obj, then
# it may need to clear something.
# In the case of the GreenstoneSQL plugs, this is the first time we have a chance
# to purge the current collection's tables from the current site's database.
sub remove_all {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    $self->SUPER::remove_all(@_);

    print STDERR "   Building with removeold option set, so deleting current collection's tables if they exist\n" if($self->{'verbosity'});

    # if we're in here, we'd already have run 'use <database>;' during sub init(),
    # so we can go ahead and delete the collection's tables
    my $gs_sql = $self->{'gs_sql'};
    $gs_sql->delete_collection_tables(); # will delete them if they exist

    # and recreate tables? No. Tables' existence is ensured in GreenstoneSQLPlugout::begin()
    my $proc_mode = $self->{'process_mode'};
    if($proc_mode ne "text_only") {
        $gs_sql->ensure_meta_table_exists();
    }
    if($proc_mode ne "meta_only") {
        $gs_sql->ensure_fulltxt_table_exists();
    }
}

# This is called per document for docs that have been deleted from the
# collection. Most plugins will do nothing,
# but if a plugin does any stuff outside of creating the doc obj, then it may need
# to clear something.
# Removes the doc(s) denoted by oids from the GS SQL db.
# This takes care of incremental deletes (docs marked D by ArchivesInfPlugin when
# building incrementally) as well as cases of "Non-incremental Delete";
# see ArchivesInfPlugin.pm.
sub remove_one {
    my $self = shift (@_);

    my ($file, $oids, $archivedir) = @_;

    my $rv = $self->SUPER::remove_one(@_);

    print STDERR "@@@ IN SQLPLUG::REMOVE_ONE: $file\n";

    #return undef unless $self->can_process_this_file($file);
    # NO, DON'T DO THE ABOVE (inherited remove_one behaviour) HERE:
    # WE DON'T CARE IF IT'S AN IMAGE FILE THAT WAS DELETED.
    # WE CARE ABOUT REMOVING THE DOC_OID OF THAT IMAGE FILE FROM THE SQL DB.
    # SO DON'T RETURN IF WE CAN'T PROCESS THIS FILE.

    my $gs_sql = $self->{'gs_sql'} || return 0; # couldn't make the connection, or no db, etc.

    print STDERR "*****************************\nAsked to remove_one oid\n***********************\n";
    print STDERR "Num oids: " . scalar (@$oids) . "\n";

    my $proc_mode = $self->{'process_mode'};
    foreach my $oid (@$oids) {
        if($proc_mode eq "all" || $proc_mode eq "meta_only") {
            print STDERR "@@@@@@@@ Deleting $oid from meta table\n" if $self->{'verbosity'} > 2;
            $gs_sql->delete_recs_from_metatable_with_docid($oid);
        }
        if($proc_mode eq "all" || $proc_mode eq "text_only") {
            print STDERR "@@@@@@@@ Deleting $oid from fulltxt table\n" if $self->{'verbosity'} > 2;
            $gs_sql->delete_recs_from_texttable_with_docid($oid);
        }
    }
    return $rv;
}
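# A minimal sketch (an assumption about gssql's internals, not code from gssql.pm) of the
# kind of parameterized statement a helper like delete_recs_from_metatable_with_docid($oid)
# presumably issues, given the per-collection table naming and the "did" (doc id) column
# seen in close_document() below:
#
#   my $sth = $dbh->prepare("DELETE FROM ${table_name} WHERE did = ?");
#   $sth->execute($oid);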
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $outhandle = $self->{'outhandle'};

    $self->{'element'} = $element;
    if ($element eq "Archive") { # docsql.xml files contain an OID attribute on the Archive element
        # the element's attributes are in %_ as per ReadXMLFile::xml_start_tag() (while $_
        # contains the tag)

        # Don't access %_{'docoid'} directly: we keep getting a warning message to
        # use $_{'docoid'} for scalar contexts, but %_ is the element's attr hashmap
        # whereas $_ has the tag info. So we don't want to do $_{'docoid'}.
        my %attr_hash = %_; # right way, see OAIPlugin.pm
        $self->{'doc_oid'} = $attr_hash{'docoid'};
        ##print STDERR "XXXXXXXXXXXXXX in SQLPlugin::xml_start_tag()\n";
        print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n"
            if $self->{'verbosity'} > 2;
    }
    else { # let superclass GreenstoneXMLPlugin continue to process <Section> and <Metadata> elements
        $self->SUPER::xml_start_tag(@_);
    }
}

# TODO Q: Why are there 4 passes when we're only indexing at doc and section level
# (2 passes)? What's the dummy pass, and why is there a pass for infodb?

# We should only ever get here during the buildcol.pl phase.
# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
# the doc_obj in memory is processed (indexed) and then made undef.
# So we have to work with doc_obj before superclass close_document() is finished.
sub close_document {
    my $self = shift(@_);

    ##print STDERR "XXXXXXXXX in SQLPlugin::close_doc()\n";

    my $gs_sql = $self->{'gs_sql'};

    my $outhandle = $self->{'outhandle'};
    my $doc_obj = $self->{'doc_obj'};

    my $oid = $self->{'doc_oid'}; # we stored the current doc's OID during sub xml_start_tag()
    my $proc_mode = $self->{'process_mode'};

    # For now, we have access to doc_obj (until just before super::close_document() terminates)

    # The OID parsed from the docsql.xml file does need to be set on $doc_obj, as noticed
    # in this case: when a doc in import is renamed and you do an incremental import, it is
    # marked for reindexing (reindexing is implemented by this plugin as a delete followed
    # by an add into the sql db). In that case, UNLESS you set the OID at this stage, the
    # old deleted doc id (for the old doc name) continues to exist in the index at the end
    # of incremental rebuilding if you were to browse the rebuilt collection by
    # files/titles. So unless you set the OID here, the deleted doc oids will still be
    # listed in the index.
    $self->{'doc_obj'}->set_OID($oid);

    print STDERR "   GreenstoneSQLPlugin processing doc $oid (reading into docobj from SQL db)\n"
        if $self->{'verbosity'} > 0;

    if($proc_mode eq "all" || $proc_mode eq "meta_only") {
        # read in meta for the collection (i.e. select * from the <coll>_metadata table)

        my $records = $gs_sql->select_from_metatable_matching_docid($oid, $outhandle);
        print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2;
        # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/

        foreach my $row (@$records) {
            #print $outhandle "row: @$row\n";
            my ($primary_key, $did, $sid, $metaname, $metaval) = @$row;

            # get rid of the artificial "root" introduced in the section id when saving to the sql db
            $sid =~ s@^root@@;
            $sid = $doc_obj->get_top_section() unless $sid;
            print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
                if $self->{'verbosity'} > 2;

            # TODO: we accessed the db in utf8 mode, so we can call doc_obj->add_utf8_metadata directly:
            $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval));
        }
        print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
            if $self->{'verbosity'} > 2;
    }

    if($proc_mode eq "all" || $proc_mode eq "text_only") {
        # read in fulltxt for the collection (i.e. select * from the <coll>_fulltxt table)

        my $fulltxt_table = $gs_sql->get_fulltext_table_name();

        my $records = $gs_sql->select_from_texttable_matching_docid($oid, $outhandle);

        print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
            if $self->{'verbosity'} > 2;

        foreach my $row (@$records) {
            my ($primary_key, $did, $sid, $text) = @$row;

            # get rid of the artificial "root" introduced in the section id when saving to the sql db
            #$sid =~ s@^root@@;
            $sid = $doc_obj->get_top_section() if ($sid eq "root");

            print $outhandle "### did: $did, sid: |$sid|, fulltext: \n"
                if $self->{'verbosity'} > 2;

            # TODO - pass by ref?
            # TODO: we accessed the db in utf8 mode, so we can call doc_obj->add_utf8_text directly:
            my $textref = &docprint::unescape_textref(\$text);
            $doc_obj->add_utf8_text($sid, $$textref);
        }
        print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
            if $self->{'verbosity'} > 2;
    }

    # done reading into docobj from SQL db

    # don't forget to clean up on close() in the superclass:
    # it will get the doc_obj indexed, then make it undef
    $self->SUPER::close_document(@_);
}
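# For reference, the row shapes that close_document() above unpacks from the two tables
# (the column names are inferred from the unpacking itself, not taken from gssql.pm):
#
#   <coll>_metadata row: (primary_key, did, sid, metaname, metaval)
#   <coll>_fulltxt  row: (primary_key, did, sid, text)
#
# where did is the doc OID and sid is the section id saved with an artificial "root"
# prefix, "root" on its own denoting the top section.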
# GreenstoneSQLPlugin's init() (and deinit()) are called by import.pl and also by
# buildcol.pl. This means it connects and disconnects during import.pl as well. This is
# okay, as removeold, which should drop the collection tables, happens during the import
# phase (calling GreenstoneSQLPlugin::remove_all()) and therefore also requires a db
# connection.
# TODO: Eventually can try moving get_gssql_instance into gssql.pm? That way both the
# GS SQL Plugin and Plugout would be using one connection during the import.pl phase
# when both plugs exist.

# Call init() not begin() because there can be multiple plugin passes and begin() is
# called for each pass (one for doc level and another for section level indexing),
# whereas init() is called before any and all passes.
# This way, we can connect to the SQL database once per buildcol run.
sub init {
    my ($self) = shift (@_);
    ##print STDERR "@@@@@@@@@@ INIT CALLED\n";

    $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init()

    ####################
#    print "@@@ SITE NAME: ". $self->{'site_name'} . "\n" if defined $self->{'site_name'};
#    print "@@@ COLL NAME: ". $ENV{'GSDLCOLLECTION'} . "\n";

#    print STDERR "@@@@ db_pwd: " . $self->{'db_client_pwd'} . "\n";
#    print STDERR "@@@@ user: " . $self->{'db_client_user'} . "\n";
#    print STDERR "@@@@ db_host: " . $self->{'db_host'} . "\n";
#    print STDERR "@@@@ db_driver: " . $self->{'db_driver'} . "\n";
    ####################

    # create the gssql object.
    # the collection name will be used for naming tables
    # (the site name will be used for naming the database)
    my $gs_sql = new gssql({
        'collection_name' => $ENV{'GSDLCOLLECTION'},
        'verbosity' => $self->{'verbosity'}
    });

    # try connecting to the mysql db; die if that fails
    if(!$gs_sql->connect_to_db({
        'db_driver' => $self->{'db_driver'},
        'db_client_user' => $self->{'db_client_user'},
        'db_client_pwd' => $self->{'db_client_pwd'},
        'db_host' => $self->{'db_host'}
       }))
    {
        # This is fatal for the plugout; let's terminate here.
        # PrintError would already have displayed the warning message on connection failure.
        die("Could not connect to db. Can't proceed.\n");
    }

    my $db_name = $self->{'site_name'} || "greenstone2"; # one database per GS3 site; for GS2 the db is called greenstone2

    # Attempt to use the db, creating it if it doesn't exist (but don't create the tables yet).
    # Bail if we can't use the database.
    if(!$gs_sql->use_db($db_name)) {

        # This is fatal for the plugout; let's terminate here after disconnecting again.
        # PrintError would already have displayed the warning message on load failure.
        $gs_sql->force_disconnect_from_db();
        die("Could not use db $db_name. Can't proceed.\n");
    }

    # store the db handle now that we're connected
    $self->{'gs_sql'} = $gs_sql;
}
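# A minimal sketch (an assumption about gssql::connect_to_db()'s internals, based on the
# parameters passed to it in init() above and on standard DBI usage) of the kind of
# connection it makes:
#
#   my $dbh = DBI->connect("dbi:$db_driver:;host=$db_host",
#                          $db_client_user, $db_client_pwd,
#                          { PrintError => 1, RaiseError => 0 });
#   return 0 unless $dbh; # with PrintError on, DBI has already warned on failure
#
# A PrintError-style warning is what the "PrintError would already have displayed the
# warning message" comments in init() above rely on.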
# This method also runs on import.pl if gs_sql has a value. But we just want to run it
# on buildcol.
# Call deinit() not end() because there can be multiple plugin passes
# (one for doc level and another for section level indexing),
# and deinit() is called after all passes.
# This way, we can close the SQL database once per buildcol run.
sub deinit {
    my ($self) = shift (@_);

    ##print STDERR "@@@@@@@@@@ GreenstoneSQLPlugin::DEINIT CALLED\n";

    if($self->{'gs_sql'}) {
        # Only want to work with the sql db during buildcol.pl: gs_sql won't have
        # a value except during buildcol, i.e. when processor =~ m/buildproc$/.
        $self->{'gs_sql'}->finished();

        # Clear gs_sql (setting the key to undef has a different meaning from deleting it:
        # undef makes the key still exist, but with an undef value, whereas delete removes
        # the key). So all future use has to make the connection again.
        delete $self->{'gs_sql'};
    }

    $self->SUPER::deinit(@_);
}

1;
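# Usage note: the info_only branch in new() above is taken when this plugin's arguments
# are inspected via Greenstone's pluginfo script; an invocation sketch (the exact wrapper
# may vary by setup and platform):
#
#   perl -S pluginfo.pl GreenstoneSQLPlugin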