###########################################################################
#
# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql-.xml
# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
# the docsql .xml files.
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package GreenstoneSQLPlugin;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

use DBI;                 # database-independent interface (used via gssql)
use docprint;            # for new unescape_text() subroutine
use GreenstoneXMLPlugin; # superclass: still parses the docsql .xml files
use gssql;               # wrapper around the collection's SQL connection/tables

# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or
# fulltext is still written out to doc.xml (docsql .xml), that will be
# processed as usual, whereas GreenstoneSQLPlugin will process all the rest
# (full text and/or meta, whichever is written out by GreenstoneSQLPlugout
# into the SQL db).

# TODO:
# - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge.
#   Ask about docsql naming convention adopted to identify OID. Better way?
#   collection names -> table names: it seems hyphens not allowed. Changed
#   to underscores.
# - Startup parameters
# - incremental building: where do we need to add code to delete rows from
#   our sql table after incrementally importing a coll with fewer docs (for
#   instance)? What about deleted/modified meta?
# - Courier documents in lucene-sql collection: character (degree symbol)
#   not preserved. Is this because we encode in utf8 when putting into db
#   and reading back in?
# - Have not yet tested writing out just meta or just fulltxt to sql db and
#   reading just that back in from the sql db while the remainder is to be
#   read back in from the docsql .xml files.
# - Ask if I can assume that all SQL dbs (not just MySQL) will preserve the
#   order of inserted nodes (sections) which in this case had made it easy
#   to reconstruct the doc_obj in memory in the correct order
# - no more docoid in docsql .xml filename: set OID as attribute of root
#   element inside docsql.xml file instead and parse it out
# - deal with incremental vs removeold. If docs removed from import folder,
#   then import step won't delete it from archives but buildcol step will.
#   Need to implement this with this database plugin or wherever the actual
#   flow is

# Establish inheritance at compile time: this plugin is a subclass of
# GreenstoneXMLPlugin.
sub BEGIN {
    @GreenstoneSQLPlugin::ISA = ('GreenstoneXMLPlugin');
}

# This plugin must be in the document plugins pipeline IN PLACE OF
# GreenstoneXMLPlugin. So we won't have a process exp conflict here.
# Returns the default filename regexp this plugin claims in the pipeline.
# Matches the docsql .xml files written out by GreenstoneSQLPlugout; the
# document OID is embedded in the filename (e.g. "docsql-HASH....xml").
sub get_default_process_exp {
    my $self = shift(@_);

    #return q^(?i)docsql(-\d+)?\.xml$^;
    return q^(?i)docsql(-.+)?\.xml$^;
}

# Allowed values for the -process_mode option: read only metadata, only
# fulltext, or both back out of the SQL database.
my $process_mode_list =
    [ { 'name' => "meta_only",
	'desc' => "{GreenstoneSQLPlug.process_mode.meta_only}" },
      { 'name' => "text_only",
	'desc' => "{GreenstoneSQLPlug.process_mode.text_only}" },
      { 'name' => "all",
	'desc' => "{GreenstoneSQLPlug.process_mode.all}" } ];

my $arguments =
    [ { 'name' => "process_exp",
	'desc' => "{BaseImporter.process_exp}",
	'type' => "regexp",
	'deft' => get_default_process_exp(),  # plain call: &-call syntax is legacy
	'reqd' => "no" },
      { 'name' => "process_mode",
	'desc' => "{GreenstoneSQLPlug.process_mode}",
	'type' => "enum",
	'list' => $process_mode_list,
	'deft' => "all",
	'reqd' => "no"},
      { 'name' => "db_driver",
	'desc' => "{GreenstoneSQLPlug.db_driver}",
	'type' => "string",
	'deft' => "mysql",
	'reqd' => "yes"},
      { 'name' => "db_client_user",
	'desc' => "{GreenstoneSQLPlug.db_client_user}",
	'type' => "string",
	'deft' => "root",
	'reqd' => "yes"},
      { 'name' => "db_client_pwd",
	'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
	'type' => "string",
	'deft' => "",
	'reqd' => "yes"}, # TODO: is a pwd really required?
      { 'name' => "db_host",
	'desc' => "{GreenstoneSQLPlug.db_host}",
	'type' => "string",
	'deft' => "127.0.0.1",
	'reqd' => "yes"},
      { 'name' => "db_encoding",
	'desc' => "{GreenstoneSQLPlug.db_encoding}",
	'type' => "string",
	'deft' => "utf8",
	'reqd' => "yes"}
    ];

my $options = { 'name' => "GreenstoneSQLPlugin",
		'desc' => "{GreenstoneSQLPlugin.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args' => $arguments };

# TODO: For on cancel, add a SIGTERM handler or so to call end()
# or to explicitly call gs_sql->close_connection if $gs_sql def

# Constructor. Registers this plugin and its option list with the plugin
# framework, then delegates to the GreenstoneXMLPlugin constructor and
# reblesses the result into this class.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow syntax instead of the ambiguity-prone indirect object syntax
    # ("new GreenstoneXMLPlugin(...)").
    my $self = GreenstoneXMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    $self = bless $self, $class;

    if ($self->{'info_only'}) {
	# If running pluginfo, we don't need to go further.
	return $self;
    }

    # do anything else that needs to be done here when not pluginfo

    return $self;
}

# TODO Q: Why are there 3 passes when we're only indexing at doc and section
# level (2 passes)?

# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
# the doc_obj in memory is processed (indexed) and then made undef.
# So we have to work with doc_obj before superclass close_document() is
# finished: this override reads the current document's metadata and/or
# fulltext rows back out of the SQL db (per -process_mode) and adds them to
# doc_obj, then chains to the superclass to index and discard it.
sub close_document {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $doc_obj = $self->{'doc_obj'};
    my $gs_sql = $self->{'gs_sql'};

    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub read()
    print $outhandle "==== OID of document (meta|text) to be read in from DB: $oid\n"
	if $self->{'verbosity'} > 1;

    # For now, we have access to doc_obj (until just before
    # super::close_document() terminates).
    # set_OID() is a complex method; check it hasn't modified the oid.
    $self->{'doc_obj'}->set_OID($oid);
    if ($oid ne $self->{'doc_obj'}->get_OID()) {
	print STDERR "@@@@ WARNING: OID after setting on doc_obj = " . $self->{'doc_obj'}->get_OID() . " and is not the same as original OID $oid from docsqloid.xml filename\n";
    }

    # TODO: This function is called on a per doc.xml file basis, but we could
    # process all docs of a collection in one go when dealing with the SQL
    # tables. How and where should we read in the collection tables then?
    # Perhaps the plugout could write a token file (.gssql) into archives
    # during import.pl; detecting it in read() during buildcol.pl could
    # trigger reading both collection tables at once.

    my $proc_mode = $self->{'process_mode'};

    if ($proc_mode eq "all" || $proc_mode eq "meta_only") {
	# read in meta for this doc (i.e. select from the _metadata table)
	my $sth = $gs_sql->select_from_metatable_matching_docid($oid);
	print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 1;

	print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 1;
	# https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
	while (my @row = $sth->fetchrow_array()) {
	    my ($primary_key, $did, $sid, $metaname, $metaval) = @row;

	    # get rid of the artificial "root" introduced in section id
	    # when saving to sql db
	    $sid =~ s@^root@@;
	    $sid = $doc_obj->get_top_section() unless $sid;
	    print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
		if $self->{'verbosity'} > 1;

	    # we accessed the db in utf8 mode, so we can call
	    # add_utf8_metadata directly
	    $doc_obj->add_utf8_metadata($sid, $metaname, docprint::unescape_text($metaval));
	}
	print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
	    if $self->{'verbosity'} > 1;
    }

    if ($proc_mode eq "all" || $proc_mode eq "text_only") {
	# read in fulltxt for this doc (i.e. select from the _fulltxt table)
	my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
	print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 1;

	print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
	    if $self->{'verbosity'} > 1;
	while (my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array()) {
	    # undo the artificial "root" introduced in section id when
	    # saving to sql db
	    $sid = $doc_obj->get_top_section() if ($sid eq "root");
	    print $outhandle "### did: $did, sid: |$sid|, fulltext: \n"
		if $self->{'verbosity'} > 1;

	    # TODO - pass by ref?
	    # we accessed the db in utf8 mode, so we can call add_utf8_text
	    # directly
	    $doc_obj->add_utf8_text($sid, docprint::unescape_text($text));
	}
	print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
	    if $self->{'verbosity'} > 1;
    }

    # don't forget to clean up on close() in superclass:
    # it will get the doc_obj indexed then make it undef
    $self->SUPER::close_document(@_);
}

# TODO: only want to work with sql db if buildcol.pl. Unfortunately, also
# runs on import.pl.
#
# Called init() not begin() because there can be multiple plugin passes
# (one for doc level and another for section level indexing) and init() is
# called once before all passes. This way, we connect to the SQL database
# once per buildcol run. Dies if the db connection or db selection fails.
sub init {
    my ($self) = shift(@_);
    $self->SUPER::init(@_); # superclass will not yet be trying to read
                            # from doc.xml (docsql .xml) files in init()

    my $gs_sql = gssql->new({ 'collection_name' => $ENV{'GSDLCOLLECTION'},
			      'db_encoding' => $self->{'db_encoding'} });

    # try connecting to the mysql db, if that fails it will die
    if (!$gs_sql->connect_to_db({ 'db_driver' => $self->{'db_driver'},
				  'db_client_user' => $self->{'db_client_user'},
				  'db_client_pwd' => $self->{'db_client_pwd'},
				  'db_host' => $self->{'db_host'} })
	)
    {
	# This is fatal for the plugout, let's terminate here.
	# PrintError would already have displayed the warning message on
	# connection fail.
	die("Could not connect to db. Can't proceed.\n");
    }

    # one database per GS3 site; for GS2 the db is called greenstone2
    my $db_name = $self->{'site_name'} || "greenstone2";
    #my $build_mode = $self->{'build_mode'} || "removeold";

    # the db and its tables should exist. Attempt to use the db:
    if (!$gs_sql->use_db($db_name)) {
	# This is fatal for the plugout, let's terminate here after
	# disconnecting again. PrintError would already have displayed the
	# warning message on load fail.
	$gs_sql->disconnect_from_db()
	    || warn("Unable to disconnect from database.\n");
	die("Could not use db $db_name. Can't proceed.\n");
    }

    # store db handle now that we're connected
    $self->{'gs_sql'} = $gs_sql;
}

# This method also runs on import.pl if gs_sql has a value. But we just want
# to run it on buildcol.
#
# Called deinit() not end() because there can be multiple plugin passes
# (one for doc level and another for section level indexing) and deinit() is
# called once after all passes. This way, we close the SQL database once per
# buildcol run.
sub deinit {
    my ($self) = shift(@_);

    if ($self->{'gs_sql'}) { # covers TODO: only want to work with sql db if buildcol.pl
	# site_name is undef on GS2, so fall back to the same default db
	# name used in init() to avoid an uninitialized-value warning
	my $db_name = $self->{'site_name'} || "greenstone2";
	$self->{'gs_sql'}->disconnect_from_db()
	    || warn("Unable to disconnect from database " . $db_name . "\n");
    }

    $self->SUPER::deinit(@_);
}

# Override of read(). When running buildcol.pl the filename matches
# "docsql-<OID>.xml"; when running import.pl it is the original document's
# filename. We only want to read in from the db when running buildcol.pl.
#
# doc_obj doesn't exist yet and only exists during super::read(): a new doc
# (doc_obj) is created in super::open_document() and is made undef again on
# super::close_document(). Further, we can't read the OID out of doc.xml:
# if no meta was stored in docsql.xml, the doc_obj read back from it has no
# OID. So the OID lives in the docsql.xml filename; extract it here and
# stash it on the plugin for close_document() to use.
sub read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor,
	$maxdocs, $total_count, $gli) = @_;

    # $file can have a containing subfolder besides the filename,
    # e.g. "dir/docsql-<OID>.xml"; the match below captures just the OID.
    if ($file =~ m/docsql-(.+?)\.xml$/) {
	$self->{'doc_oid'} = $1;
    }

    # Always read docsql.xml, as we then know doc structure, and assoc files
    # are dealt with. Plus we need to read docsql.xml if either meta or
    # fulltxt went into there instead of to the sql db.
    # Super::read() will open_doc, close_doc, then process doc_obj for
    # indexing, then undef doc_obj.
    return $self->SUPER::read(@_);
}