Changeset 32544

Show
Ignore:
Timestamp:
26.10.2018 20:12:14 (3 weeks ago)
Author:
ak19
Message:

1. GreenstoneSQLPlugin: sub read() now calls the new lazy_get_gssql() method, which connects only the first time and only during buildcol (when the processor var is a buildproc). We no longer connect in init(), as that got called during import.pl as well as buildcol.pl, whereas we only want to do connect/disconnect from DB stuff in this plugIN during buildcol.pl. 2. GSSQLPlugin and gssql can now delete entries from the SQL database. This can't be tested to work properly until the GS SQL PlugOUT stops always assuming removeold. At present it always does removeold: clearing the db even when incremental-import.pl is called. When there are no NEW docs in inc-import, the tables are cleared and nothing new is added, so incremental behaviour during buildcol.pl/GS SQL PlugIN can't be tested either.

Location:
main/trunk/greenstone2/perllib
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/gssql.pm

    r32543 r32544  
    401401} 
    402402 
     403# delete all records in metatable with specified docid 
     404# https://www.tutorialspoint.com/mysql/mysql-delete-query.htm 
     405# DELETE FROM table_name [WHERE Clause] 
     406# see example under 'do' at https://metacpan.org/pod/release/TIMB/DBI-1.634_50/DBI.pm 
     407sub delete_recs_from_metatable_with_docid { 
     408    my $self= shift (@_); 
     409    my ($oid) = @_; 
     410     
     411    my $dbh = $self->{'db_handle'}; 
     412    my $meta_table = $self->get_metadata_table_name(); 
     413     
     414    #my $rows_deleted =  
     415    $dbh->do(qq{DELETE FROM $meta_table WHERE did = ?}, undef, $oid) 
     416    or warn $dbh->errstr; 
     417} 
     418 
     419# delete all records in fulltext table with specified docid 
     420sub delete_recs_from_texttable_with_docid { 
     421    my $self= shift (@_); 
     422    my ($oid) = @_; 
     423     
     424    my $dbh = $self->{'db_handle'};     
     425    my $fulltxt_table = $self->get_fulltext_table_name(); 
     426     
     427    $dbh->do(qq{DELETE FROM $fulltxt_table WHERE did = ?}, undef, $oid) 
     428    or warn $dbh->errstr; 
     429} 
    403430 
    404431# Can call this after connection succeeded to get the database handle, dbh, 
  • main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm

    r32543 r32544  
    5858# Discuss the plugin/plugout parameters. 
    5959 
     60# TODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt? 
    6061 
    6162# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that it if meta or fulltext 
     
    158159 
    159160    # do anything else that needs to be done here when not pluginfo 
     161    #$self->{'delete_docids'} = (); # list of doc oids to delete during deinit() 
    160162     
    161163    return $self; 
     
    179181    $self->{'doc_oid'} = $attr_hash{'docoid'};   
    180182    print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n" 
    181         if $self->{'verbosity'} > 1; 
     183        if $self->{'verbosity'} > 2; 
    182184 
    183185    } 
     
    196198     
    197199    my $outhandle = $self->{'outhandle'}; 
    198     my $doc_obj = $self->{'doc_obj'};     
    199     my $gs_sql = $self->{'gs_sql'}; 
     200    my $doc_obj = $self->{'doc_obj'}; 
     201    # sub read() will make the db connection setting $self->{'gs_sql'} once: the first time read() 
     202    # is called on the GS SQLPlugin instance. 
     203    my $gs_sql = $self->{'gs_sql'} || return; # $self->lazy_get_gssql(); # won't want to call lazy_get_gssql() if close_doc called during (incr-)import.pl, only during buildcol.pl 
     204 
     205    # TODO: return statement skips "dummy" pass. Should we skip it or not? 
     206    # If we don't return, gs_sql is not set for dummy pass... 
    200207 
    201208    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag() 
    202209    print $outhandle "++++ OID of document (meta|text) to be read in from DB: $oid\n" 
    203     if $self->{'verbosity'} > 1; 
     210    if $self->{'verbosity'} > 2; 
    204211     
    205212    # For now, we have access to doc_obj (until just before super::close_document() terminates) 
     
    214221    # read in meta for the collection (i.e. select * from <col>_metadata table 
    215222 
    216     my $sth = $gs_sql->select_from_metatable_matching_docid($oid);   
     223    my $sth = $gs_sql->select_from_metatable_matching_docid($oid); 
    217224    print $outhandle "### SQL select stmt: ".$sth->{'Statement'}."\n" 
    218         if $self->{'verbosity'} > 1; 
    219  
    220     print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 1; 
     225        if $self->{'verbosity'} > 2; 
     226 
     227    print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2; 
    221228    # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/ 
    222229    while( my @row = $sth->fetchrow_array() ) {      
     
    228235        $sid = $doc_obj->get_top_section() unless $sid; 
    229236        print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n" 
    230         if $self->{'verbosity'} > 1; 
     237        if $self->{'verbosity'} > 2; 
    231238         
    232239        # TODO:  we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly: 
     
    234241    } 
    235242    print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n" 
    236         if $self->{'verbosity'} > 1; 
     243        if $self->{'verbosity'} > 2; 
    237244    } 
    238245     
     
    244251     
    245252    my $sth = $gs_sql->select_from_texttable_matching_docid($oid); 
    246     print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 1; 
     253    print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 2; 
    247254 
    248255    print $outhandle "----------\nSQL DB contains txt entries for-----------\n" 
    249         if $self->{'verbosity'} > 1; 
     256        if $self->{'verbosity'} > 2; 
    250257    while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {        
    251258 
     
    254261        $sid = $doc_obj->get_top_section() if ($sid eq "root"); 
    255262        print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n" 
    256         if $self->{'verbosity'} > 1; 
     263        if $self->{'verbosity'} > 2; 
    257264 
    258265        # TODO - pass by ref? 
     
    261268    }    
    262269    print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n" 
    263         if $self->{'verbosity'} > 1; 
     270        if $self->{'verbosity'} > 2; 
    264271    } 
    265272 
     
    274281# During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side 
    275282# effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db. 
     283# Lazy connection. 
    276284 
    277285# Call init() not begin() because there can be multiple plugin passes 
     
    279287# one for doc level and another for section level indexing 
    280288# This way, we can connect to the SQL database once per buildcol run. 
    281 sub init { 
    282     my ($self) = shift (@_); 
     289#sub init { 
     290#    my ($self) = shift (@_); 
    283291#    print STDERR "@@@@@@@@@@ INIT CALLED\n"; 
    284292     
    285     $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init(). 
    286  
     293#    $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init(). 
     294 
     295 
     296sub lazy_get_gssql { 
     297    my $self = shift(@_); 
     298 
     299    # if we failed to successfully connect once before, don't bother attempting to connect again 
     300    #return undef if(defined $self->{'failed'}); # plugin/process would have terminated with die() 
     301                                  # if we couldn't succeed connecting on any connection attempt 
     302     
     303    return $self->{'gs_sql'} if($self->{'gs_sql'}); 
     304 
     305    # assume we'll fail to connect 
     306    $self->{'failed'} = 1; 
     307 
     308    print STDERR "@@@@@@@@@@ LAZY CONNECT CALLED\n"; 
     309     
    287310    #################### 
    288311#    print "@@@ SITE NAME: ". $self->{'site_name'} . "\n" if defined $self->{'site_name'}; 
     
    328351    die("Could not use db $db_name. Can't proceed.\n"); 
    329352    } 
     353 
     354    #undef $self->{'failed'}; 
    330355     
    331356    # store db handle now that we're connected 
    332357    $self->{'gs_sql'} = $gs_sql; 
     358    return $gs_sql; 
    333359     
    334360} 
     
    341367sub deinit { 
    342368    my ($self) = shift (@_); 
    343     if($self->{'gs_sql'}) { # can cover TODO: only want to work with sql db if buildcol.pl 
     369     
     370    print STDERR "@@@@@@@@@@ GreenstoneSQLPlugin::DEINIT CALLED\n"; 
     371     
     372    if($self->{'gs_sql'}) { # only want to work with sql db if buildcol.pl, gs_sql won't have 
     373    # a value except during buildcol, so when processor =~ m/buildproc$/. 
    344374    $self->{'gs_sql'}->disconnect_from_db() 
    345375        || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n"); 
    346     } 
    347 #    print STDERR "@@@@@@@@@@ DEINIT CALLED\n"; 
     376 
     377    # explicitly set to undef so all future use has to make the connection again 
     378    undef $self->{'gs_sql'}; 
     379    } 
     380 
    348381    $self->SUPER::deinit(@_); 
    349382} 
    350383 
     384# TODO: This can't work until GSSQLPlugout has implemented build_mode = incremental 
     385# (instead of tossing away db on every build) 
     386# then this method needs to undef $self->docid after deleting, and close_doc() has to 
     387# just return if $self->docid undefined 
     388 
     389sub read { 
     390    my $self = shift (@_);   
     391   
     392    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
     393 
     394    my $rv = $self->SUPER::read(@_); # defined in ReadXMLFile inherited by superclass GS XML PLugin 
     395 
     396    if(defined $rv) { # undef if !can_proc_this_file, but -1 if failed to parse docsql.xml 
     397 
     398    # don't want to do any GreenstoneSQLPlugin DB stuff during import.pl 
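     399    # only during buildcol.pl. NOTE(review): the bare return below discards $rv, unlike the final "return $rv" - confirm callers during import.pl expect that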
     400    return if (ref($processor) !~ m/buildproc$/i); 
     401 
     402    # we know we're buildcol, let's proceed: 
     403 
     404    # make the connection once for the life of the plugin, not once for every doc 
     405    # so that we can disconnect at the very end of the plugin's life: on deinit() 
     406    # If we hadn't connected before, connect now 
     407    my $gs_sql = $self->{'gs_sql'} || $self->lazy_get_gssql(); # TODO which syntax best? 
     408     
     409    my $build_proc_mode = $processor->get_mode(); # can be "text" as per basebuildproc or 
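     410    # "textdelete" or "textreindex" as per ArchivesInfPlugin. NOTE(review): the pattern below requires a literal '.' before "delete", so it would not match "textdelete" - confirm the actual mode string format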
     411    if($build_proc_mode =~ m/\.delete/) { 
     412 
     413        # NOTTODO: add current doc OID stored in $self->{'doc_oid'} to a list of oids whose
     414        # entries are to be removed from the table(s). We'll do the actual deletion in deinit?? Since that's
     415        # when ArchivesInfPlugin deletes the docsql.xml files 
     416         
     417        my $doc_oid = $self->{'doc_oid'}; 
     418        #my @delete_docids = $self->{'delete_docids'}; 
     419        #push (@delete_docids, $doc_oid); 
     420 
     421        my $proc_mode = $self->{'process_mode'}; 
     422        if($proc_mode eq "all" || $proc_mode eq "meta_only") { 
     423        print STDERR "@@@@@@@@ Deleting $doc_oid from meta table\n"; 
     424        $gs_sql->delete_recs_from_metatable_with_docid($doc_oid); 
     425        } 
     426        if($proc_mode eq "all" || $proc_mode eq "text_only") { 
     427        print STDERR "@@@@@@@@ Deleting $doc_oid from fulltxt table\n"; 
     428        $gs_sql->delete_recs_from_texttable_with_docid($doc_oid); 
     429        } 
     430    } 
     431    } 
     432 
     433    return $rv; 
     434     
     435} 
     436