Ignore:
Timestamp:
2018-10-30T19:29:56+13:00 (5 years ago)
Author:
ak19
Message:
  1. In GreenstoneSQLPlugout, removeold is now parameterised (as are keepold, incremental, incremental_mode). 2. Deletion on incremental_build works. But there are more questions. Why are there 4 passes? What to do on reindexing and when to do it (should it happen during GS SQL plugout or plugin)?
Location:
main/trunk/greenstone2/perllib
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/gssql.pm

    r32544 r32555  
    186186   
    187187
     188    # build_mode can be removeold or incremental. We only do something special on removeold:
     189    # deleting the existing tables for this collection and recreating empty ones
    188190    if($build_mode eq "removeold") {
    189191        $self->delete_collection_tables();
  • main/trunk/greenstone2/perllib/inexport.pm

    r32540 r32555  
    690690    }
    691691    my $processor = &plugout::load_plugout($plugout);
     692    $processor->set_incremental_options($removeold, $keepold, $incremental, $incremental_mode);
    692693    $processor->setoutputdir ($archivedir);
    693694    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
  • main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm

    r32544 r32555  
    4040# TODO:
    4141# - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge.
     42# - "Courier" demo documents in lucene-sql collection: character (degree symbol) not preserved in title. Is this because we encode in utf8 when putting into db and reading back in?
     43# Test doc with meta and text like macron in Maori text.
     44# - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that
     45# back in from the sql db while the remainder is to be read back in from the docsql .xml files.
     46
     47# TODO: deal with incremental vs removeold. If docs removed from import folder, then import step
     48# won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is
     49
     50# TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking, basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work.
     51# Discuss the plugin/plugout parameters.
     52
     53# TODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt?
     54
     55# TODO Q: is "reindex" = del from db + add to db?
     56# - is this okay for reindexing, or will it need to modify existing values (update table)
     57# - if it's okay, what does reindex need to accomplish (and how) if the OID changes because hash id produced is different?
     58# - delete is accomplished in GS SQL Plugin, during buildcol.pl. When should reindexing take place?
     59# during SQL plugout/import.pl or during plugin? If adding is done by GSSQLPlugout, does it need to
     60# be reimplemented in GSSQLPlugin to support the adding portion of reindexing.
     61
     62
     63# TODO Q: During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side
     64# effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db.
     65
     66
     67# + TODO: Incremental delete can't work until GSSQLPlugout has implemented build_mode = incremental
     68# (instead of tossing away db on every build)
    4269# + Ask about docsql naming convention adopted to identify OID. Better way?
    4370# collection names -> table names: it seems hyphens not allowed. Changed to underscores.
    4471# + Startup parameters (except removeold/build_mode)
    45 # - how do we detect we're to do removeold during plugout in import.pl phase
    46 # - incremental building: where do we need to add code to delete rows from our sql table after
     72# + how do we detect we're to do removeold during plugout in import.pl phase
     73# + incremental building: where do we need to add code to delete rows from our sql table after
    4774# incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta?
    48 # - "Courier" demo documents in lucene-sql collection: character (degree symbol) not preserved in title. Is this because we encode in utf8 when putting into db and reading back in?
    49 # - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that
    50 # back in from the sql db while the remainder is to be read back in from the docsql .xml files.
    51 # - Ask if I can assume that all SQL dbs (not just MySQL) will preserve the order of inserted nodes
    52 # (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order
    53 
    54 # TODO: deal with incremental vs removeold. If docs removed from import folder, then import step
    55 # won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is
    56 
    57 # TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking, basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work.
    58 # Discuss the plugin/plugout parameters.
    59 
    60 # TODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt?
     75# + Ask if I can assume that all SQL dbs (not just MySQL) will preserve the order of inserted nodes
     76# (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order.
     77# YES: Otherwise for later db types (drivers), can set order by primary key column and then order by did column
     78
     79
     80########################################################################################
    6181
    6282# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that it if meta or fulltext
     
    179199    # whereas $_ has the tag info. So we don't want to do $_{'docoid'}.
    180200    my %attr_hash = %_; # right way, see OAIPlugin.pm
    181     $self->{'doc_oid'} = $attr_hash{'docoid'}; 
     201    $self->{'doc_oid'} = $attr_hash{'docoid'};
     202    print STDERR "XXXXXXXXXXXXXX in SQLPlugin::xml_start_tag()\n";
    182203    print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n"
    183204        if $self->{'verbosity'} > 2;
     
    189210}
    190211
    191 # TODO Q: Why are there 3 passes when we're only indexing at doc and section level (2 passes)?
     212# TODO Q: Why are there 4 passes when we're only indexing at doc and section level (2 passes)? What's the dummy pass, why is there a pass for infodb?
    192213
    193214# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
     
    196217sub close_document {
    197218    my $self = shift(@_);
     219
     220    print STDERR "XXXXXXXXX in SQLPlugin::close_doc()\n";
     221   
     222    my $gs_sql = $self->get_gssql_instance();
    198223   
    199224    my $outhandle = $self->{'outhandle'};
    200225    my $doc_obj = $self->{'doc_obj'};
    201     # sub read() will make the db connection setting $self->{'gs_sql'} once: the first time read()
    202     # is called on the GS SQLPlugin instance.
    203     my $gs_sql = $self->{'gs_sql'} || return; # $self->lazy_get_gssql(); # won't want to call lazy_get_gssql() if close_doc called during (incr-)import.pl, only during buildcol.pl
    204 
    205     # TODO: return statement skips "dummy" pass. Should we skip it or not?
    206     # If we don't return, gs_sql is not set for dummy pass...
    207 
     226
     227    my $build_proc_mode = $self->{'processor'}->get_mode(); # can be "text" as per basebuildproc or infodb
    208228    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag()
    209     print $outhandle "++++ OID of document (meta|text) to be read in from DB: $oid\n"
     229    my $proc_mode = $self->{'process_mode'};
     230   
     231    print $outhandle "++++ OID of document (meta|text) to be del or read in from DB: ".$self->{'doc_oid'}."\n"
    210232    if $self->{'verbosity'} > 2;
    211233   
    212234    # For now, we have access to doc_obj (until just before super::close_document() terminates)
    213 
     235   
    214236    # no need to call $self->{'doc_obj'}->set_OID($oid);
    215237    # because either the OID is stored in the SQL db as meta 'Identifier' alongside other metadata
    216238    # or it's stored in the doc.xml as metadata 'Identifier' alongside other metadata
    217239    # Either way, Identifier meta will be read into the docobj automatically with other meta.
    218 
    219     my $proc_mode = $self->{'process_mode'};
    220     if($proc_mode eq "all" || $proc_mode eq "meta_only") {
    221     # read in meta for the collection (i.e. select * from <col>_metadata table
    222 
    223     my $sth = $gs_sql->select_from_metatable_matching_docid($oid);
    224     print $outhandle "### SQL select stmt: ".$sth->{'Statement'}."\n"
    225         if $self->{'verbosity'} > 2;
    226 
    227     print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2;
    228     # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
    229     while( my @row = $sth->fetchrow_array() ) {     
    230         #print $outhandle "row: @row\n";
    231         my ($primary_key, $did, $sid, $metaname, $metaval) = @row;
    232 
    233         # get rid of the artificial "root" introduced in section id when saving to sql db
    234         $sid =~ s@^root@@;
    235         $sid = $doc_obj->get_top_section() unless $sid;
    236         print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
     240   
     241    if ($self->{'verbosity'} > 2) {
     242    print STDERR "+++++++++++ buildproc_mode: $build_proc_mode\n";
     243    print STDERR "+++++++++++ SQLPlug proc_mode: $proc_mode\n";
     244    }
     245
     246    # TODO: where does reindexing take place, GreenstoneSQL -Plugout or -Plugin?
     247    #if($build_proc_mode =~ m/(delete|reindex)$/) { # doc denoted by current OID has been marked for deletion or reindexing (=delete + add)
     248    if($build_proc_mode =~ m/(delete)$/) { # doc denoted by current OID has been marked for deletion or reindexing (=delete + add)
     249   
     250          # build_proc_mode could be "(infodb|text)(delete|reindex)"
     251          # "...delete" or "...reindex" as per ArchivesInfPlugin
     252   
     253    print STDERR "@@@@ DELETING DOC FROM SQL DB\n";
     254   
     255    if($proc_mode eq "all" || $proc_mode eq "meta_only") {
     256        print STDERR "@@@@@@@@ Deleting $oid from meta table\n" if $self->{'verbosity'} > 2;
     257        $gs_sql->delete_recs_from_metatable_with_docid($oid);
     258    }
     259    if($proc_mode eq "all" || $proc_mode eq "text_only") {
     260        print STDERR "@@@@@@@@ Deleting $oid from fulltxt table\n" if $self->{'verbosity'} > 2;
     261        $gs_sql->delete_recs_from_texttable_with_docid($oid);
     262    }
     263   
     264    # If we're reindexing the current doc, we will want to continue: which
     265    # will add this doc ID back into the db with the new meta/full txt values
     266    # But if we're deleting, then we're done processing the document, so set doc_oid to undef
     267    # to prevent adding it back into db
     268    #undef $self->{'doc_oid'} if($build_proc_mode =~ m/delete$/);   
     269   
     270    } # done deleting doc from SQL db   
     271   
     272    else {#if($self->{'doc_oid'}) { # if loading doc from SQL db
     273    print STDERR "@@@@ LOADING DOC FROM SQL DB\n"; 
     274   
     275    if($proc_mode eq "all" || $proc_mode eq "meta_only") {
     276        # read in meta for the collection (i.e. select * from <col>_metadata table
     277       
     278        my $sth = $gs_sql->select_from_metatable_matching_docid($oid);
     279        print $outhandle "### SQL select stmt: ".$sth->{'Statement'}."\n"
    237280        if $self->{'verbosity'} > 2;
    238281       
    239         # TODO:  we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly:
    240         $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval));
     282        print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2;
     283        # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
     284        while( my @row = $sth->fetchrow_array() ) {     
     285        #print $outhandle "row: @row\n";
     286        my ($primary_key, $did, $sid, $metaname, $metaval) = @row;
     287       
     288        # get rid of the artificial "root" introduced in section id when saving to sql db
     289        $sid =~ s@^root@@;
     290        $sid = $doc_obj->get_top_section() unless $sid;
     291        print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
     292            if $self->{'verbosity'} > 2;
     293       
     294        # TODO:  we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly:
     295        $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval));
     296        }
     297        print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
     298        if $self->{'verbosity'} > 2;
    241299    }
    242     print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
    243         if $self->{'verbosity'} > 2;
    244     }
    245    
    246     if($proc_mode eq "all" || $proc_mode eq "text_only") {
    247     # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table
    248 
    249     my $fulltxt_table = $gs_sql->get_fulltext_table_name();
    250    
    251    
    252     my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
    253     print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 2;
    254 
    255     print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
    256         if $self->{'verbosity'} > 2;
    257     while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {       
    258 
    259         # get rid of the artificial "root" introduced in section id when saving to sql db
    260         #$sid =~ s@^root@@;
    261         $sid = $doc_obj->get_top_section() if ($sid eq "root");
    262         print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
     300   
     301    if($proc_mode eq "all" || $proc_mode eq "text_only") {
     302        # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table
     303       
     304        my $fulltxt_table = $gs_sql->get_fulltext_table_name();
     305       
     306   
     307        my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
     308        print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 2;
     309       
     310        print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
    263311        if $self->{'verbosity'} > 2;
    264 
    265         # TODO - pass by ref?
    266         # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly:
    267         $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text));
    268     }   
    269     print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
    270         if $self->{'verbosity'} > 2;
    271     }
    272 
     312        while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {       
     313       
     314        # get rid of the artificial "root" introduced in section id when saving to sql db
     315        #$sid =~ s@^root@@;
     316        $sid = $doc_obj->get_top_section() if ($sid eq "root");
     317        print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
     318            if $self->{'verbosity'} > 2;
     319       
     320        # TODO - pass by ref?
     321        # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly:
     322        $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text));
     323        }   
     324        print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
     325        if $self->{'verbosity'} > 2;
     326    }
     327   
     328    } # done reading into docobj from SQL db
    273329   
    274330    # don't forget to clean up on close() in superclass
     
    278334
    279335
    280 # TODO: only want to work with sql db if buildcol.pl. Unfortunately, also runs on import.pl.
    281 # During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side
    282 # effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db.
    283 # Lazy connection.
    284 
    285 # Call init() not begin() because there can be multiple plugin passes
    286 # and init() should be called before all passes:
    287 # one for doc level and another for section level indexing
    288 # This way, we can connect to the SQL database once per buildcol run.
    289 #sub init {
    290 #    my ($self) = shift (@_);
    291 #    print STDERR "@@@@@@@@@@ INIT CALLED\n";
    292    
    293 #    $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init().
    294 
    295 
    296 sub lazy_get_gssql {
     336# We want SQLPlugin to connect to db only during buildcol.pl phase, not during import.pl
     337# This works out okay, as close_document() (called by read()) is only invoked during buildcol.pl
     338#
     339# Further, we want a single db connection for the GS SQL Plugin to be used for
     340# the multiple plugin passes: for "dummy" pass, and for doc level and for section level indexing
     341# By calling the lazy loading get_gssql_instance() from close_document(),
     342# we connect to the SQL database once per GSSQLPlugin and only during the buildcol phase.
     343#
     344# get_gssql_instance() is a lazy loading method that returns singleton db connection for a GreenstoneSQLPlugin object. ("Code pattern" get instance vs singleton.)
     345# One instance of db connection that can be used for all the many doc_objects processed by this plugin
     346#
     347# Except in methods get_gssql_instance() and deinit(), don't access self->{'_gs_sql'} directly.
     348# Instead, call method get_gssql_instance() and store return value in a local variable, my $gs_sql
     349#
     350sub get_gssql_instance
     351{   
    297352    my $self = shift(@_);
    298353
     
    301356                                  # if we couldn't succeed connecting on any connection attempt
    302357   
    303     return $self->{'gs_sql'} if($self->{'gs_sql'});
     358    return $self->{'_gs_sql'} if($self->{'_gs_sql'});
    304359
    305360    # assume we'll fail to connect
     
    355410   
    356411    # store db handle now that we're connected
    357     $self->{'gs_sql'} = $gs_sql;
     412    $self->{'_gs_sql'} = $gs_sql;
    358413    return $gs_sql;
    359414   
     
    370425    print STDERR "@@@@@@@@@@ GreenstoneSQLPlugin::DEINIT CALLED\n";
    371426   
    372     if($self->{'gs_sql'}) { # only want to work with sql db if buildcol.pl, gs_sql won't have
     427    if($self->{'_gs_sql'}) { # only want to work with sql db if buildcol.pl, gs_sql won't have
    373428    # a value except during buildcol, so when processor =~ m/buildproc$/.
    374     $self->{'gs_sql'}->disconnect_from_db()
     429    $self->{'_gs_sql'}->disconnect_from_db()
    375430        || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n");
    376431
    377432    # explicitly set to undef so all future use has to make the connection again
    378     undef $self->{'gs_sql'};
     433    undef $self->{'_gs_sql'};
    379434    }
    380435
     
    382437}
    383438
    384 # TODO: This can't work until GSSQLPlugout has implemented build_mode = incremental
    385 # (instead of tossing away db on every build)
    386 # then this method needs to undef $self->docid after deleting, and close_doc() has to
    387 # just return if $self->docid undefined
    388 
    389 sub read {
    390     my $self = shift (@_); 
    391  
    392     my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    393 
    394     my $rv = $self->SUPER::read(@_); # defined in ReadXMLFile inherited by superclass GS XML PLugin
    395 
    396     if(defined $rv) { # undef if !can_proc_this_file, but -1 if failed to parse docsql.xml
    397 
    398     # don't want to do any GreenstoneSQLPlugin DB stuff during import.pl
    399     # only during in buildcol.pl   
    400     return if (ref($processor) !~ m/buildproc$/i);
    401 
    402     # we know we're buildcol, let's proceed:
    403 
    404     # make the connection once for the life of the plugin, not once for every doc
    405     # so that we can disconnect at the very end of the plugin's life: on deinit()
    406     # If we hadn't connected before, connect now
    407     my $gs_sql = $self->{'gs_sql'} || $self->lazy_get_gssql(); # TODO which syntax best?
    408    
    409     my $build_proc_mode = $processor->get_mode(); # can be "text" as per basebuildproc or
    410     # "textdelete" or "textreindex" as per ArchivesInfPlugin
    411     if($build_proc_mode =~ m/\.delete/) {
    412 
    413         # NOTTODO: add current doc OID stored in $self->{'doc_oid'} to list of oids get rid
    414         # of from table(s) entries. We'll do the actual deletion in deinit?? Since that's
    415         # when ArchivesInfPlugin deletes the docsql.xml files
    416        
    417         my $doc_oid = $self->{'doc_oid'};
    418         #my @delete_docids = $self->{'delete_docids'};
    419         #push (@delete_docids, $doc_oid);
    420 
    421         my $proc_mode = $self->{'process_mode'};
    422         if($proc_mode eq "all" || $proc_mode eq "meta_only") {
    423         print STDERR "@@@@@@@@ Deleting $doc_oid from meta table\n";
    424         $gs_sql->delete_recs_from_metatable_with_docid($doc_oid);
    425         }
    426         if($proc_mode eq "all" || $proc_mode eq "text_only") {
    427         print STDERR "@@@@@@@@ Deleting $doc_oid from fulltxt table\n";
    428         $gs_sql->delete_recs_from_texttable_with_docid($doc_oid);
    429         }
    430     }
    431     }
    432 
    433     return $rv;
    434    
    435 }
    436 
     439
     440
     441
  • main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm

    r32540 r32555  
    347347}
    348348
     349# GreenstoneSQLPlugout needs to know whether we're doing removeold or not
     350sub set_incremental_options {
     351    my $self= shift (@_);
     352    my ($removeold, $keepold, $incremental, $incremental_mode) = @_;
     353   
     354    $self->{'removeold'}        = $removeold;
     355    $self->{'keepold'}          = $keepold;
     356    $self->{'incremental'}      = $incremental;
     357    $self->{'incremental_mode'} = $incremental_mode;
     358}
     359
    349360# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
    350361sub set_OIDtype {
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm

    r32543 r32555  
    139139    #print STDERR "@@@@ db_driver: " . $self->{'db_driver'} . "\n";
    140140    #print STDERR "@@@@ proc_mode: " . $self->{'process_mode'} . "\n";
    141    
    142     ########### TODO: deal with build mode #########
    143 
    144     $self->{'build_mode'} = "removeold";
    145141
    146142    ############ LOAD NECESSARY OPTIONS ###########
     
    173169   
    174170    my $db_name = $self->{'site_name'} || "greenstone2"; # one database per GS3 site, for GS2 the db is called greenstone2
    175     my $build_mode = $self->{'build_mode'} || "removeold";
     171    my $build_mode = ($self->{'removeold'}) ? "removeold" : "incremental";
     172    print STDERR "@@@@@@@@@@@@ remove_old: $build_mode\n";
     173   
    176174    if(!$gs_sql->load_db_and_tables($db_name, $build_mode)) {
    177175   
Note: See TracChangeset for help on using the changeset viewer.