Changeset 32544 for main/trunk


Ignore:
Timestamp:
2018-10-26T20:12:14+13:00 (5 years ago)
Author:
ak19
Message:
  1. GreenstoneSQLPlugin: sub read() now calls the new lazy_get_gssql() method so that we connect only the first time and only during buildcol (i.e. when the processor variable is a buildproc). We no longer connect in init(), because init() gets called during import.pl as well as buildcol.pl, whereas this plugin should only connect to/disconnect from the DB during buildcol.pl. 2. GreenstoneSQLPlugin and gssql can now delete entries from the SQL database. This can't be tested properly until the GreenstoneSQLPlugout stops always assuming removeold. At present it always does removeold, clearing the db even when incremental-import.pl is called. When there are no NEW docs in an incremental import, the tables are cleared and nothing new is added, so the incremental behaviour of GreenstoneSQLPlugin during buildcol.pl can't be tested either.
Location:
main/trunk/greenstone2/perllib
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/gssql.pm

    r32543 r32544  
    401401}
    402402
     403# delete all records in metatable with specified docid
     404# https://www.tutorialspoint.com/mysql/mysql-delete-query.htm
     405# DELETE FROM table_name [WHERE Clause]
     406# see example under 'do' at https://metacpan.org/pod/release/TIMB/DBI-1.634_50/DBI.pm
     407sub delete_recs_from_metatable_with_docid {
     408    my $self= shift (@_);
     409    my ($oid) = @_;
     410   
     411    my $dbh = $self->{'db_handle'};
     412    my $meta_table = $self->get_metadata_table_name();
     413   
     414    #my $rows_deleted =
     415    $dbh->do(qq{DELETE FROM $meta_table WHERE did = ?}, undef, $oid)
     416    or warn $dbh->errstr;
     417}
     418
     419# delete all records in fulltext table with specified docid
     420sub delete_recs_from_texttable_with_docid {
     421    my $self= shift (@_);
     422    my ($oid) = @_;
     423   
     424    my $dbh = $self->{'db_handle'};   
     425    my $fulltxt_table = $self->get_fulltext_table_name();
     426   
     427    $dbh->do(qq{DELETE FROM $fulltxt_table WHERE did = ?}, undef, $oid)
     428    or warn $dbh->errstr;
     429}
    403430
    404431# Can call this after connection succeeded to get the database handle, dbh,
  • main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm

    r32543 r32544  
    5858# Discuss the plugin/plugout parameters.
    5959
     60# TODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt?
    6061
    6162# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that it if meta or fulltext
     
    158159
    159160    # do anything else that needs to be done here when not pluginfo
     161    #$self->{'delete_docids'} = (); # list of doc oids to delete during deinit()
    160162   
    161163    return $self;
     
    179181    $self->{'doc_oid'} = $attr_hash{'docoid'}; 
    180182    print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n"
    181         if $self->{'verbosity'} > 1;
     183        if $self->{'verbosity'} > 2;
    182184
    183185    }
     
    196198   
    197199    my $outhandle = $self->{'outhandle'};
    198     my $doc_obj = $self->{'doc_obj'};   
    199     my $gs_sql = $self->{'gs_sql'};
     200    my $doc_obj = $self->{'doc_obj'};
     201    # sub read() will make the db connection setting $self->{'gs_sql'} once: the first time read()
     202    # is called on the GS SQLPlugin instance.
     203    my $gs_sql = $self->{'gs_sql'} || return; # $self->lazy_get_gssql(); # won't want to call lazy_get_gssql() if close_doc called during (incr-)import.pl, only during buildcol.pl
     204
     205    # TODO: return statement skips "dummy" pass. Should we skip it or not?
     206    # If we don't return, gs_sql is not set for dummy pass...
    200207
    201208    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag()
    202209    print $outhandle "++++ OID of document (meta|text) to be read in from DB: $oid\n"
    203     if $self->{'verbosity'} > 1;
     210    if $self->{'verbosity'} > 2;
    204211   
    205212    # For now, we have access to doc_obj (until just before super::close_document() terminates)
     
    214221    # read in meta for the collection (i.e. select * from <col>_metadata table
    215222
    216     my $sth = $gs_sql->select_from_metatable_matching_docid($oid); 
     223    my $sth = $gs_sql->select_from_metatable_matching_docid($oid);
    217224    print $outhandle "### SQL select stmt: ".$sth->{'Statement'}."\n"
    218         if $self->{'verbosity'} > 1;
    219 
    220     print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 1;
     225        if $self->{'verbosity'} > 2;
     226
     227    print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2;
    221228    # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
    222229    while( my @row = $sth->fetchrow_array() ) {     
     
    228235        $sid = $doc_obj->get_top_section() unless $sid;
    229236        print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
    230         if $self->{'verbosity'} > 1;
     237        if $self->{'verbosity'} > 2;
    231238       
    232239        # TODO:  we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly:
     
    234241    }
    235242    print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
    236         if $self->{'verbosity'} > 1;
     243        if $self->{'verbosity'} > 2;
    237244    }
    238245   
     
    244251   
    245252    my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
    246     print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 1;
     253    print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 2;
    247254
    248255    print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
    249         if $self->{'verbosity'} > 1;
     256        if $self->{'verbosity'} > 2;
    250257    while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {       
    251258
     
    254261        $sid = $doc_obj->get_top_section() if ($sid eq "root");
    255262        print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
    256         if $self->{'verbosity'} > 1;
     263        if $self->{'verbosity'} > 2;
    257264
    258265        # TODO - pass by ref?
     
    261268    }   
    262269    print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
    263         if $self->{'verbosity'} > 1;
     270        if $self->{'verbosity'} > 2;
    264271    }
    265272
     
    274281# During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side
    275282# effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db.
     283# Lazy connection.
    276284
    277285# Call init() not begin() because there can be multiple plugin passes
     
    279287# one for doc level and another for section level indexing
    280288# This way, we can connect to the SQL database once per buildcol run.
    281 sub init {
    282     my ($self) = shift (@_);
     289#sub init {
     290#    my ($self) = shift (@_);
    283291#    print STDERR "@@@@@@@@@@ INIT CALLED\n";
    284292   
    285     $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init().
    286 
     293#    $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init().
     294
     295
     296sub lazy_get_gssql {
     297    my $self = shift(@_);
     298
     299    # if we failed to successfully connect once before, don't bother attempting to connect again
     300    #return undef if(defined $self->{'failed'}); # plugin/process would have terminated with die()
     301                                  # if we couldn't succeed connecting on any connection attempt
     302   
     303    return $self->{'gs_sql'} if($self->{'gs_sql'});
     304
     305    # assume we'll fail to connect
     306    $self->{'failed'} = 1;
     307
     308    print STDERR "@@@@@@@@@@ LAZY CONNECT CALLED\n";
     309   
    287310    ####################
    288311#    print "@@@ SITE NAME: ". $self->{'site_name'} . "\n" if defined $self->{'site_name'};
     
    328351    die("Could not use db $db_name. Can't proceed.\n");
    329352    }
     353
     354    #undef $self->{'failed'};
    330355   
    331356    # store db handle now that we're connected
    332357    $self->{'gs_sql'} = $gs_sql;
     358    return $gs_sql;
    333359   
    334360}
     
    341367sub deinit {
    342368    my ($self) = shift (@_);
    343     if($self->{'gs_sql'}) { # can cover TODO: only want to work with sql db if buildcol.pl
     369   
     370    print STDERR "@@@@@@@@@@ GreenstoneSQLPlugin::DEINIT CALLED\n";
     371   
     372    if($self->{'gs_sql'}) { # only want to work with sql db if buildcol.pl, gs_sql won't have
     373    # a value except during buildcol, so when processor =~ m/buildproc$/.
    344374    $self->{'gs_sql'}->disconnect_from_db()
    345375        || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n");
    346     }
    347 #    print STDERR "@@@@@@@@@@ DEINIT CALLED\n";
     376
     377    # explicitly set to undef so all future use has to make the connection again
     378    undef $self->{'gs_sql'};
     379    }
     380
    348381    $self->SUPER::deinit(@_);
    349382}
    350383
     384# TODO: This can't work until GSSQLPlugout has implemented build_mode = incremental
     385# (instead of tossing away db on every build)
     386# then this method needs to undef $self->docid after deleting, and close_doc() has to
     387# just return if $self->docid undefined
     388
     389sub read {
     390    my $self = shift (@_); 
     391 
     392    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
     393
     394    my $rv = $self->SUPER::read(@_); # defined in ReadXMLFile inherited by superclass GS XML PLugin
     395
     396    if(defined $rv) { # undef if !can_proc_this_file, but -1 if failed to parse docsql.xml
     397
     398    # don't want to do any GreenstoneSQLPlugin DB stuff during import.pl
     399    # only during buildcol.pl
     400    return if (ref($processor) !~ m/buildproc$/i);
     401
     402    # we know we're buildcol, let's proceed:
     403
     404    # make the connection once for the life of the plugin, not once for every doc
     405    # so that we can disconnect at the very end of the plugin's life: on deinit()
     406    # If we hadn't connected before, connect now
     407    my $gs_sql = $self->{'gs_sql'} || $self->lazy_get_gssql(); # TODO which syntax best?
     408   
     409    my $build_proc_mode = $processor->get_mode(); # can be "text" as per basebuildproc or
     410    # "textdelete" or "textreindex" as per ArchivesInfPlugin
     411    if($build_proc_mode =~ m/\.delete/) {
     412
     413        # NOTTODO: add current doc OID stored in $self->{'doc_oid'} to list of oids get rid
     414        # of from table(s) entries. We'll do the actual deletion in deinit?? Since that's
     415        # when ArchivesInfPlugin deletes the docsql.xml files
     416       
     417        my $doc_oid = $self->{'doc_oid'};
     418        #my @delete_docids = $self->{'delete_docids'};
     419        #push (@delete_docids, $doc_oid);
     420
     421        my $proc_mode = $self->{'process_mode'};
     422        if($proc_mode eq "all" || $proc_mode eq "meta_only") {
     423        print STDERR "@@@@@@@@ Deleting $doc_oid from meta table\n";
     424        $gs_sql->delete_recs_from_metatable_with_docid($doc_oid);
     425        }
     426        if($proc_mode eq "all" || $proc_mode eq "text_only") {
     427        print STDERR "@@@@@@@@ Deleting $doc_oid from fulltxt table\n";
     428        $gs_sql->delete_recs_from_texttable_with_docid($doc_oid);
     429        }
     430    }
     431    }
     432
     433    return $rv;
     434   
     435}
     436
Note: See TracChangeset for help on using the changeset viewer.