###########################################################################
#
# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml
# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
# the docsql .xml files.
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package GreenstoneSQLPlugin;

use strict;
no strict 'refs'; # allow filehandles to be variables and vice versa

use DBI;
use docprint; # for new unescape_text() subroutine
use GreenstoneXMLPlugin;
use gsmysql;

#use unicode;
#use Encode;

########################################################################################
# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or fulltext
# is still written out to doc.xml (docsql .xml), that will be processed as usual,
# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
# is written out by GreenstoneSQLPlugout into the SQL db).

sub BEGIN {
    @GreenstoneSQLPlugin::ISA = ('GreenstoneXMLPlugin');
}

# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin,
# so we won't have a process_exp conflict here.
# The structure of docsql.xml files is identical to doc.xml and the contents are similar, except:
# - since metadata and/or fulltxt are stored in the mysql db instead, just XML comments
#   indicating this are left inside docsql.xml, within the <Description> (for meta) and/or
#   <Content> (for txt) elements
# - the root element Archive now has a docoid attribute: <Archive docoid="...">

sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)docsql(-\d+)?\.xml$^; # regex based on this method in GreenstoneXMLPlugin
}

my $process_mode_list =
    [ { 'name' => "meta_only",
	'desc' => "{GreenstoneSQLPlug.process_mode.meta_only}" },
      { 'name' => "text_only",
	'desc' => "{GreenstoneSQLPlug.process_mode.text_only}" },
      { 'name' => "all",
	'desc' => "{GreenstoneSQLPlug.process_mode.all}" }
    ];

my $rollback_on_cancel_list =
    [ { 'name' => "true",
	'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}" },
      { 'name' => "false",
	'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}" }
    ];

# NOTE: If subclassing gsmysql to support other databases, and if they have different
# required connection parameters, we can check how WordPlugin, upon detecting that Word
# is installed, dynamically loads Word-specific configuration options.
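# A hypothetical docsql.xml, with both meta and fulltext stored in the SQL db
# (illustrative only: the exact comment text and nesting are whatever
# GreenstoneSQLPlugout writes out):
#
#   <?xml version="1.0" encoding="utf-8" standalone="no"?>
#   <Archive docoid="HASHxxxx">
#     <Section>
#       <Description>
#         <!-- Metadata in SQL database -->
#       </Description>
#       <Content>
#         <!-- Full text in SQL database -->
#       </Content>
#     </Section>
#   </Archive>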
my $arguments =
    [ { 'name' => "process_exp",
	'desc' => "{BaseImporter.process_exp}",
	'type' => "regexp",
	'deft' => &get_default_process_exp(),
	'reqd' => "no" },
      { 'name' => "process_mode",
	'desc' => "{GreenstoneSQLPlug.process_mode}",
	'type' => "enum",
	'list' => $process_mode_list,
	'deft' => "all",
	'reqd' => "no"},
      { 'name' => "rollback_on_cancel",
	'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}",
	'type' => "enum",
	'list' => $rollback_on_cancel_list,
	'deft' => "false", # better default than true
	'reqd' => "no",
	'hiddengli' => "no"},
      { 'name' => "db_driver",
	'desc' => "{GreenstoneSQLPlug.db_driver}",
	'type' => "string",
	'deft' => "mysql",
	'reqd' => "yes"},
      { 'name' => "db_client_user",
	'desc' => "{GreenstoneSQLPlug.db_client_user}",
	'type' => "string",
	'deft' => "root",
	'reqd' => "yes"},
      { 'name' => "db_client_pwd",
	'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
	'type' => "string",
	'deft' => "",
	'reqd' => "no"}, # pwd not required: can create mysql accounts without pwd
      { 'name' => "db_host",
	'desc' => "{GreenstoneSQLPlug.db_host}",
	'type' => "string",
	'deft' => "127.0.0.1",
	'reqd' => "yes"},
      { 'name' => "db_port",
	'desc' => "{GreenstoneSQLPlug.db_port}",
	'type' => "string", # NOTE: make this int? No default for port, since it's not a required connection param
	'reqd' => "no"}
    ];

my $options = { 'name' => "GreenstoneSQLPlugin",
		'desc' => "{GreenstoneSQLPlugin.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args' => $arguments };


###### Methods called during buildcol and import #######

sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = new GreenstoneXMLPlugin($pluginlist, $inputargs, $hashArgOptLists);

    #return bless $self, $class;
    $self = bless $self, $class;
    if ($self->{'info_only'}) {
	# If running pluginfo, we don't need to go further.
	return $self;
    }

    # do anything else that needs to be done here when not pluginfo

    return $self;
}
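# Example use in a collection's configuration (hypothetical values): list this
# plugin in the pipeline IN PLACE OF GreenstoneXMLPlugin, e.g. in a GS2 collect.cfg:
#
#   plugin GreenstoneSQLPlugin -process_mode all -db_driver mysql
#          -db_client_user root -db_host 127.0.0.1 -rollback_on_cancel false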
# Call init() not begin() because there can be multiple plugin passes and begin() is called
# for each pass (one for doc level and another for section level indexing), whereas init()
# is called before any and all passes.
# This way, we can connect to the SQL database once per buildcol run.
# Although now it doesn't matter, since gsmysql.pm uses the get_instance pattern to return a
# singleton db connection, regardless of the number of gsmysql objects instantiated and
# the number of connect() calls made on them.
sub init {
    my ($self) = shift (@_);
    $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init().

    # create gsmysql object.
    # collection name will be used for naming tables (site name will be used for naming the database)
    my $gs_sql = new gsmysql({
	'collection_name' => $ENV{'GSDLCOLLECTION'},
	'verbosity' => $self->{'verbosity'} || 0
    });

    # if autocommit is set, there's no rollback support
    my $autocommit = ($self->{'rollback_on_cancel'} eq "false") ? 1 : 0;

    # try connecting to the mysql db, die if that fails
    if(!$gs_sql->connect_to_db({
	'db_driver' => $self->{'db_driver'},
	'db_client_user' => $self->{'db_client_user'},
	'db_client_pwd' => $self->{'db_client_pwd'},
	'db_host' => $self->{'db_host'},
	'db_port' => $self->{'db_port'}, # undef by default, can leave as is
	'autocommit' => $autocommit
    }) )
    {
	# This is fatal for the plugout, so let's terminate here.
	# PrintError would already have displayed the warning message on connection fail.
	die("Could not connect to db. Can't proceed.\n");
    }

    my $db_name = $self->{'site'} || "greenstone2"; # one database per GS3 site; for GS2 the db is called greenstone2

    # Attempt to use the db, create it if it doesn't exist (but don't create the tables yet).
    # Bail if we can't use the database.
    if(!$gs_sql->use_db($db_name)) {
	# This is fatal for the plugout, so let's terminate here after disconnecting again.
	# PrintError would already have displayed the warning message on load fail.
	# And on die() perl will call gsmysql's destroy, which will ensure a disconnect() from the db.
	die("Could not use db $db_name. Can't proceed.\n");
    }

    # store db handle now that we're connected
    $self->{'gs_sql'} = $gs_sql;
}


# This method also runs on import.pl if gs_sql has a value.
# Call deinit() not end() because there can be multiple plugin passes:
# one for doc level and another for section level indexing,
# and deinit() should be called after any and all passes.
# This way, we can close the SQL database once per buildcol run.
# Again, this doesn't matter now because gsmysql ensures the connection
# is a singleton instance, which connects once and disconnects once per perl process.
sub deinit {
    my ($self) = shift (@_);

    if($self->{'gs_sql'}) {

	# Important to call finished():
	# it will disconnect from the db if this is the last gsmysql instance,
	# and it will commit to the db before disconnecting if rollback_on_cancel is turned on.
	$self->{'gs_sql'}->finished();

	# Clear gs_sql (setting the key to undef has a different meaning from deleting it:
	# undef makes the key still exist but with an undef value, whereas delete removes the key).
	# So all future use has to make the connection again.
	delete $self->{'gs_sql'};
    }

    $self->SUPER::deinit(@_);
}


###### Methods only called during import.pl #####

# This is called once if removeold is set with import.pl. Most plugins will do
# nothing, but if a plugin does any stuff outside of creating the doc obj, then
# it may need to clear something.
# In the case of the GreenstoneSQL plugs: this is the first time we have a chance
# to purge the current collection's tables from the current site's database.
sub remove_all {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;

    $self->SUPER::remove_all(@_);

    print STDERR "   Building with removeold option set, so deleting current collection's tables if they exist\n" if($self->{'verbosity'});

    # if we're in here, we'd already have run 'use database <site_name>;' during sub init()
    # so we can go ahead and delete the collection's tables
    my $gs_sql = $self->{'gs_sql'};
    $gs_sql->delete_collection_tables(); # will delete them if they exist

    # and recreate tables? No. Tables' existence is ensured in GreenstoneSQLPlugout::begin()
    my $proc_mode = $self->{'process_mode'};
    if($proc_mode ne "text_only") {
	$gs_sql->ensure_meta_table_exists();
    }
    if($proc_mode ne "meta_only") {
	$gs_sql->ensure_fulltxt_table_exists();
    }
}
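# A hypothetical sketch of the table layouts implied by the column order unpacked
# in close_document() further below -- the actual DDL lives in gsmysql.pm and may
# differ in names and types:
#
#   CREATE TABLE IF NOT EXISTS <collection>_metadata (
#       id        INT NOT NULL AUTO_INCREMENT PRIMARY KEY,  -- primary_key
#       did       VARCHAR(63),   -- doc OID
#       sid       VARCHAR(63),   -- section id, "root"-prefixed
#       metaname  VARCHAR(127),
#       metavalue TEXT
#   );
#   CREATE TABLE IF NOT EXISTS <collection>_fulltxt (
#       id      INT NOT NULL AUTO_INCREMENT PRIMARY KEY,    -- primary_key
#       did     VARCHAR(63),
#       sid     VARCHAR(63),
#       fulltxt LONGTEXT
#   );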
# This is called during import.pl, per document, for docs that have been deleted from the
# collection. Most plugins will do nothing, but if a plugin does any stuff outside of
# creating the doc obj, then it may need to clear something.
# In the case of the GreenstoneSQL plugs: remove the doc(s) denoted by oids from the GS SQL db.
# This takes care of incremental deletes (docs marked D by ArchivesInfPlugin when building
# incrementally) as well as cases of "Non-incremental Delete", see ArchivesInfPlugin.pm,
# as well as cases involving reindexing, which is implemented here as a delete followed by an add.
sub remove_one {
    my $self = shift (@_);

    my ($file, $oids, $archivedir) = @_;

    my $rv = $self->SUPER::remove_one(@_);

    print STDERR "@@@ IN SQLPLUG::REMOVE_ONE: $file\n";

    #return undef unless $self->can_process_this_file($file);
    # NO, DON'T DO THE ABOVE (inherited remove_one behaviour) HERE:
    # WE DON'T CARE IF IT'S AN IMAGE FILE THAT WAS DELETED.
    # WE CARE ABOUT REMOVING THE DOC_OID OF THAT IMAGE FILE FROM THE SQL DB
    # SO DON'T RETURN IF CAN'T_PROCESS_THIS_FILE

    my $gs_sql = $self->{'gs_sql'} || return 0; # couldn't make the connection or no db etc

    my $proc_mode = $self->{'process_mode'};
    foreach my $oid (@$oids) {
	if($proc_mode eq "all" || $proc_mode eq "meta_only") {
	    print STDERR "@@@@@@@@ Deleting $oid from meta table\n" if $self->{'verbosity'} > 2;
	    $gs_sql->delete_recs_from_metatable_with_docid($oid);
	}
	if($proc_mode eq "all" || $proc_mode eq "text_only") {
	    print STDERR "@@@@@@@@ Deleting $oid from fulltxt table\n" if $self->{'verbosity'} > 2;
	    $gs_sql->delete_recs_from_texttable_with_docid($oid);
	}
    }

    return $rv;
}


##### Methods called only during buildcol #####

sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $outhandle = $self->{'outhandle'};

    $self->{'element'} = $element;
    if ($element eq "Archive") { # docsql.xml files contain an OID attribute on the Archive element
	# The element's attributes are in %_ as per ReadXMLFile::xml_start_tag()
	# (while $_ contains the tag).
	# Don't access %_{'docoid'} directly: that keeps producing a warning message to
	# use $_{'docoid'} for scalar contexts, but %_ is the element's attribute hashmap
	# whereas $_ has the tag info, so we don't want to write $_{'docoid'} either.
	my %attr_hash = %_; # right way, see OAIPlugin.pm
	$self->{'doc_oid'} = $attr_hash{'docoid'};
	print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n"
	    if $self->{'verbosity'} > 2;
    }
    else { # let superclass GreenstoneXMLPlugin continue to process <Section> and <Metadata> elements
	$self->SUPER::xml_start_tag(@_);
    }
}
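# For example, when the (hypothetical) open tag <Archive docoid="HASHa1b2c3"> is
# encountered, xml_start_tag() above sees $_ holding the tag information and %_
# holding the attribute map ( 'docoid' => 'HASHa1b2c3' ), so copying %_ into a
# named hash and indexing that is the warning-free way to get at the OID.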
# There are multiple passes processing the document (see buildcol's mode parameter description):
# - compressing the text, which may be a dummy pass for lucene/solr, wherein they still want the
#   docobj for different purposes,
# - the pass(es) for indexing, e.g. doc/didx and section/sidx level passes,
# - and an infodb pass for processing the classifiers. This pass too needs the docobj.
# Since all passes need the doc_obj, all are read in from docsql + SQL db into the docobj in memory.

# We only ever get here, or do any parsing of the docsql.xml file, during the buildcol.pl phase.
# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
# the doc_obj in memory is processed (indexed) and then made undef.
# So we have to work with doc_obj before superclass close_document() is finished.
sub close_document {
    my $self = shift(@_);

    my $gs_sql = $self->{'gs_sql'};

    my $outhandle = $self->{'outhandle'};
    my $doc_obj = $self->{'doc_obj'};

    my $oid = $self->{'doc_oid'}; # we stored the current doc's OID during sub xml_start_tag()
    my $proc_mode = $self->{'process_mode'};

    # For now, we have access to doc_obj (until just before super::close_document() terminates).

    # The OID parsed out of the docsql.xml file does need to be set on $doc_obj, as noticed in this case:
    # when a doc in import is renamed and you do an incremental import, it is marked for reindexing
    # (reindexing is implemented by this plugin as a delete followed by an add into the sql db).
    # In that case, UNLESS you set the OID at this stage, the old deleted doc id (for the old doc
    # name) continues to exist in the index at the end of incremental rebuilding if you were to
    # browse the rebuilt collection by files/titles. So unless you set the OID here, the deleted
    # doc oids will still be listed in the index.

    print STDERR "   GreenstoneSQLPlugin processing doc $oid (reading into docobj from SQL db)\n"
	if $self->{'verbosity'};

    if($proc_mode eq "all" || $proc_mode eq "meta_only") {
	# read in meta for the collection (i.e. select * from <collection>_metadata table)

	my $records = $gs_sql->select_from_metatable_matching_docid($oid, $outhandle);

	print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2;

	foreach my $row (@$records) {
	    my ($primary_key, $did, $sid, $metaname, $metaval) = @$row;

	    # Don't allow a duplicate toplevel OID, as can happen when a doc has been renamed and requires reindexing.
	    # TODO: any other meta that should not be duplicated, but can have been changed between rebuilds,
	    # so that we need to call set_meta instead of add_meta? E.g. FileSize, SourceFile. But Language can be
	    # multiple, so gs meta is not guaranteed to be unique either. Whereas of dc metadata we know that some,
	    # if not all, should allow multiple entries for the same meta name, e.g. dc.Author/dc.Creator.
	    if($sid =~ m@^root@ && $metaname eq "Identifier") {
		# doc_obj's toplevel Identifier metadata is a special case:
		# it should have only one value, so instead of add_meta(), which would allow old Identifier
		# meta to linger, we need to do set_meta(). We then skip the rest of this iteration, to prevent
		# duplicates (older values from the DB) being inserted for Identifier into doc_obj.
		# Handles the case where a doc was renamed and rebuilding triggers the re-indexing case:
		# the old identifier is now overwritten with the new one.
		$self->{'doc_obj'}->set_OID($oid); # calls doc_obj->set_metadata_element(top_section, Identifier, $oid). Sets OID if one doesn't exist.
		next; # ensures Identifier is set only once, and set to the current docOID for the doc,
		      # as opposed to allowing it to be set to an expired docOID from before the doc got renamed.
	    }

	    # process all other metadata the normal way:
	    # get rid of the artificial "root" introduced in the section id when saving to the sql db
	    $sid =~ s@^root@@;
	    $sid = $doc_obj->get_top_section() unless $sid;
	    print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
		if $self->{'verbosity'} > 2;

	    # We're only dealing with utf8 data where docobj is concerned.
	    # Data is stored unescaped in the db: escaping is only for html/xml files, not for txt files or the db.
	    $doc_obj->add_utf8_metadata($sid, $metaname, $metaval);
	}

	# UTF8 debugging, e.g. if we have a character with a macron in dc.Title:
	##binmode(STDERR, ":utf8"); # not necessary when calling utf8decomp to print wide characters as unicode codepoints, avoiding 'wide character' warnings.
	#my $dcTitle = $doc_obj->get_metadata_element($doc_obj->get_top_section(), "dc.Title");
	#print STDERR "###### dc.Title: ".&unicode::utf8decomp($dcTitle)."\n";
	#print STDERR "###### DECODED dc.Title: ".&unicode::utf8decomp(&Encode::decode_utf8($dcTitle))."\n"; # decoding was needed for perl 5.18

	print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
	    if $self->{'verbosity'} > 2;
    }
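    # (Both the meta loop above and the text loop below unpack each $row in the
    # tables' column order: [ primary_key, did, sid, metaname, metaval ] for meta
    # and [ primary_key, did, sid, text ] for fulltext -- see the schema sketch
    # before remove_one() above.)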
    if($proc_mode eq "all" || $proc_mode eq "text_only") {
	# read in fulltxt for the collection (i.e. select * from <collection>_fulltxt table)

	my $fulltxt_table = $gs_sql->get_fulltext_table_name();

	my $records = $gs_sql->select_from_texttable_matching_docid($oid, $outhandle);

	print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
	    if $self->{'verbosity'} > 2;

	foreach my $row (@$records) {
	    my ($primary_key, $did, $sid, $text) = @$row;

	    # get rid of the artificial "root" introduced in the section id when saving to the sql db
	    #$sid =~ s@^root@@;
	    $sid = $doc_obj->get_top_section() if ($sid eq "root");

	    print $outhandle "### did: $did, sid: |$sid|, fulltext: \n"
		if $self->{'verbosity'} > 2;

	    # We're only dealing with utf8 data where docobj is concerned.
	    # Data is stored unescaped in the db: escaping is only for html/xml files, not for txt files or the db.
	    $doc_obj->add_utf8_textref($sid, \$text);
	}

	print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
	    if $self->{'verbosity'} > 2;
    }

    # done reading into docobj from the SQL db

    # Don't forget to clean up in close_document() of the superclass:
    # it will get the doc_obj indexed, then make it undef.
    $self->SUPER::close_document(@_);
}


1;
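# A minimal sketch (not the actual gsmysql.pm implementation) of what a select
# helper like select_from_metatable_matching_docid() might boil down to in plain
# DBI, assuming a cached db handle and the table layout sketched earlier; the
# hash key 'db_handle' and the helper get_metadata_table_name() are illustrative:
#
#   sub select_from_metatable_matching_docid {
#       my ($self, $oid) = @_;
#       my $table = $self->get_metadata_table_name();
#       my $sth = $self->{'db_handle'}->prepare("SELECT * FROM $table WHERE did = ?");
#       $sth->execute($oid);              # parameterised query, no manual quoting
#       return $sth->fetchall_arrayref(); # arrayref of [pkey, did, sid, metaname, metaval] rows
#   }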