Changeset 32563


Ignore:
Timestamp:
2018-11-02T19:07:16+13:00 (3 years ago)
Author:
ak19
Message:
  1. Overhaul of GreenstoneSQLPlugs to handle removeold and incremental delete correctly. The code now also automatically handles 'non-incremental delete' (see mention in ArchivesInfPlugin). The new version no longer does lazy loading for getting the sql db connection in the GS SQL Plugin, as the connection now needs to be active from the start of the plugin in order to run SQL delete statements on remove_old. So the db connection code for the GS SQL plugin has moved back into its init() method. Lots of changes to gssql.pm (and some flow-on effects to the GS SQL Plugout), as the points at which database tables are expected to exist and at which they need to be created have changed. 2. Undoing most of the changes of changeset 32555 since we're doing incremental delete and removeold differently and in the correct way now when using the GreenstoneSQLPlugs.
Location:
main/trunk/greenstone2/perllib
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/gssql.pm

    r32561 r32563  
    4444
    4545# Parameterise (one or more methods may use them):
    46 # - build_mode (like removeold)
    4746# - db_name (which is the GS3 sitename)
    4847
     
    134133    print STDERR "Away to make connection to $db_driver database with:\n";
    135134    print STDERR " - hostname $db_host; username: $db_user";
    136     print STDERR "; and the password provided" if $db_pwd;
     135    print STDERR "; and the provided password" if $db_pwd;
    137136    print STDERR "\nAssuming the mysql server has been started with: --character_set_server=utf8mb4\n" if $db_driver eq "mysql";
    138137    }
     
    188187}
    189188
    190 # will attempt to load the specified db and the <coll>_metadata and <coll>_fulltxt for this
    191 # collection, or create any of these (db, tables) that don't yet exist. At the end
    192 # it will have loaded the requested database (in MySQL: "use <db>;")
    193 sub load_db_and_tables {
    194     my $self= shift (@_);
    195     my ($db_name, $build_mode) = @_;
     189# Load the designated database, i.e. 'use <dbname>;'.
     190# If the database doesn't yet exist, creates it and loads it.
     191# (Don't create the collection's tables yet, though)
     192# At the end it will have loaded the requested database (in MySQL: "use <db>;") on success.
     193# As usual, returns success or failure value that can be evaluated in a boolean context.
     194sub use_db {
     195    my $self= shift (@_);
     196    my ($db_name) = @_;
    196197    my $dbh = $self->{'db_handle'};
    197198    $db_name = $self->sanitize_name($db_name);
     
    215216    # once more attempt to use db, now that it exists
    216217    $dbh->do("use $db_name") || return 0;
    217     #$dbh->do("use localsite") or die "Error (code" . $dbh->err ."): " . $dbh->errstr . "\n";
    218 
    219     # attempt to create tables in current db
    220     $self->create_metadata_table() || return 0;
    221     $self->create_fulltext_table() || return 0;
     218    #$dbh->do("use $db_name") or die "Error (code" . $dbh->err ."): " . $dbh->errstr . "\n";
    222219
    223220    $success = 1;
     
    227224
    228225    print STDERR "@@@ DATABASE $db_name EXISTED\n" if($self->{'verbosity'} > 2);
    229    
    230 
    231     # build_mode can be removeold or incremental. We only do something special on removeold:
    232     # deleting the existing tables for this collection and recreating empty ones
    233     if($build_mode eq "removeold") {
    234         print STDERR "   Building with removeold option set, so deleting current collection's tables if they exist\n" if($self->{'verbosity'});
    235         $self->delete_collection_tables();
    236     }
    237 
    238     # use existing tables if any
    239     # attempt to create tables in current db   
    240     if($build_mode eq "removeold" || !$self->table_exists($self->get_metadata_table_name())) {
    241         $self->create_metadata_table() || return 0;
    242     } else {
    243         print STDERR "@@@ Meta table exists\n" if($self->{'verbosity'} > 2);
    244     }
    245     if($build_mode eq "removeold" || !$self->table_exists($self->get_fulltext_table_name())) {
    246         $self->create_fulltext_table() || return 0;
    247     } else {
    248         print STDERR "@@@ Fulltxt table exists\n" if($self->{'verbosity'} > 2);
    249     }
    250    
    251226    }
    252227   
     
    254229}
    255230
    256 # GreenstoneSQLPlugin calls this method to load an existing db.
    257 # This will terminate if the db does not exist. Unlike load_db_and_tables() above, used by
    258 # GreenstoneSQLPlugout, this method will not attempt to create the requested db (nor its tables)
    259 # TODO: GS SQLPlugin is called before GS SQLPlugout and attempts to use_db() - called in plugin's
    260 # init() method. This will fail if the db does not exist. Ideally want the gssqlplugin only called
    261 # during buildcol.pl
    262 sub use_db {
    263     my $self= shift (@_);
    264     my ($db_name) = @_;
    265     my $dbh = $self->{'db_handle'};
    266     $db_name = $self->sanitize_name($db_name);
    267 
    268    
    269     print STDERR "Loading database $db_name\n" if($self->{'verbosity'} > 1);
    270    
    271     # perl DBI switch database: https://www.perlmonks.org/?node_id=995434
    272     # do() returns undef on error.
    273     # connection succeeded, try to load our database. If that didn't work, attempt to create db
    274     return $dbh->do("use $db_name") || warn();
     231# We should already have done "use <database>;" if this gets called.
     232# Just load this collection's metatable
     233sub ensure_meta_table_exists {
     234    my $self = shift (@_);
     235   
     236    my $tablename = $self->get_metadata_table_name();
     237    if(!$self->table_exists($tablename)) {
     238    #print STDERR "   Creating metadata table $tablename\n" if($self->{'verbosity'} > 1);
     239    $self->create_metadata_table() || return 0;
     240    } else {
     241    print STDERR "@@@ Meta table exists\n" if($self->{'verbosity'} > 2);
     242    }
     243    return 1;
     244}
     245
     246# We should already have done "use <database>;" if this gets called.
     247# Just make sure this collection's fulltxt table exists (create it if it doesn't)
     248sub ensure_fulltxt_table_exists {
     249    my $self = shift (@_);
     250   
     251    my $tablename = $self->get_fulltext_table_name();   
     252    if(!$self->table_exists($tablename)) {
     253    #print STDERR "   Creating fulltxt table $tablename\n" if($self->{'verbosity'} > 1);
     254    $self->create_fulltext_table() || return 0;
     255    } else {
     256    print STDERR "@@@ Fulltxt table exists\n" if($self->{'verbosity'} > 2);
     257    }
     258    return 1;
    275259}
    276260
  • main/trunk/greenstone2/perllib/inexport.pm

    r32555 r32563  
    690690    }
    691691    my $processor = &plugout::load_plugout($plugout);
    692     $processor->set_incremental_options($removeold, $keepold, $incremental, $incremental_mode);
    693692    $processor->setoutputdir ($archivedir);
    694693    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
  • main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm

    r32560 r32563  
    4040# TODO:
    4141# - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge.
    42 # - "Courier" demo documents in lucene-sql collection: character (degree symbol) not preserved in title. Is this because we encode in utf8 when putting into db and reading back in?
    43 # Test doc with meta and text like macron in Maori text.
    4442# - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that
    4543# back in from the sql db while the remainder is to be read back in from the docsql .xml files.
     
    4745# TODO: deal with incremental vs removeold. If docs removed from import folder, then import step
    4846# won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is
    49 
    50 # TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking, basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work.
    51 # Discuss the plugin/plugout parameters.
    52 
    53 # + NOTTODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt?
    54 # NOT A PROBLEM: Tested to find DBI connection attempt fails immediately when MySQL server not
    55 # running. The GLI "paralyzing" incident last time was not because of the gs sql connection code,
    56 # but because my computer was freezing on-and-off.
    5747
    5848# TODO Q: is "reindex" = del from db + add to db?
     
    6353# be reimplemented in GSSQLPlugin to support the adding portion of reindexing.
    6454
    65 
    66 # TODO Q: During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side
    67 # effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db.
    68 
    69 
     55# TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking: basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work.
     56# Discuss the plugin/plugout parameters.
     57
     58
     59# DONE:
    7060# + TODO: Incremental delete can't work until GSSQLPlugout has implemented build_mode = incremental
    7161# (instead of tossing away db on every build)
     
    7969# (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order.
    8070# YES: Otherwise for later db types (drivers), can set order by primary key column and then order by did column
     71# + NOTTODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt?
     72#   NOT A PROBLEM: Tested to find DBI connection attempt fails immediately when MySQL server not
     73# running. The GLI "paralyzing" incident last time was not because of the gs sql connection code,
     74# but because my computer was freezing on-and-off.
     75# + "Courier" demo documents in lucene-sql collection: character (degree symbol) not preserved in title. Is this because we encode in utf8 when putting into db and reading back in?
     76# Test doc with meta and text like macron in Maori text.
     77# + TODO Q: During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side
     78# effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db.
     79#   This got fixed when GSSQLPlugin stopped connecting on init().
    8180
    8281
     
    177176
    178177    # do anything else that needs to be done here when not pluginfo
    179     #$self->{'delete_docids'} = (); # list of doc oids to delete during deinit()
    180178   
    181179    return $self;
    182180}
    183181
    184 sub xml_start_tag {
    185     my $self = shift(@_);
    186     my ($expat, $element) = @_;
    187 
    188     my $outhandle = $self->{'outhandle'};
    189    
    190     $self->{'element'} = $element;
    191     if ($element eq "Archive") { # docsql.xml files contain a OID attribute on Archive element
    192     # the element's attributes are in %_ as per ReadXMLFile::xml_start_tag() (while $_
    193     # contains the tag)
    194 
    195     # Don't access %_{'docoid'} directly: keep getting a warning message to
    196     # use $_{'docoid'} for scalar contexts, but %_ is the element's attr hashmap
    197     # whereas $_ has the tag info. So we don't want to do $_{'docoid'}.
    198     my %attr_hash = %_; # right way, see OAIPlugin.pm
    199     $self->{'doc_oid'} = $attr_hash{'docoid'};
    200     print STDERR "XXXXXXXXXXXXXX in SQLPlugin::xml_start_tag()\n";
    201     print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n"
    202         if $self->{'verbosity'} > 2;
    203 
    204     }
    205     else { # let superclass GreenstoneXMLPlugin continue to process <Section> and <Metadata> elements
    206     $self->SUPER::xml_start_tag(@_);
    207     }
    208 }
    209 
    210 # TODO Q: Why are there 4 passes when we're only indexing at doc and section level (2 passes)? What's the dummy pass, why is there a pass for infodb?
    211 
    212 # At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
    213 # the doc_obj in memory is processed (indexed) and then made undef.
    214 # So we have to work with doc_obj before superclass close_document() is finished.
    215 sub close_document {
    216     my $self = shift(@_);
    217 
    218     print STDERR "XXXXXXXXX in SQLPlugin::close_doc()\n";
    219    
    220     my $gs_sql = $self->get_gssql_instance();
    221    
    222     my $outhandle = $self->{'outhandle'};
    223     my $doc_obj = $self->{'doc_obj'};
    224 
    225     my $build_proc_mode = $self->{'processor'}->get_mode(); # can be "text" as per basebuildproc or infodb
    226     my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag()
     182# This is called once if removeold is set with import.pl. Most plugins will do
     183# nothing but if a plugin does any stuff outside of creating doc obj, then
     184# it may need to clear something.
     185# In the case of GreenstoneSQL plugs: this is the first time we have a chance
     186# to purge the tables of the current collection from the current site's database
     187sub remove_all {
     188    my $self = shift (@_);
     189    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
     190
     191    print STDERR "   Building with removeold option set, so deleting current collection's tables if they exist\n" if($self->{'verbosity'});
     192   
     193    # if we're in here, we'd already have run 'use database <site_name>;' during sub init()
     194    # so we can go ahead and delete the collection's tables
     195    my $gs_sql = $self->{'gs_sql'};
     196    $gs_sql->delete_collection_tables(); # will delete them if they exist
     197
     198    # and recreate tables? No. Tables' existence is ensured in GreenstoneSQLPlugout::begin()
    227199    my $proc_mode = $self->{'process_mode'};
    228    
    229     print $outhandle "++++ OID of document (meta|text) to be del or read in from DB: ".$self->{'doc_oid'}."\n"
    230     if $self->{'verbosity'} > 2;
    231    
    232     # For now, we have access to doc_obj (until just before super::close_document() terminates)
    233    
    234     # no need to call $self->{'doc_obj'}->set_OID($oid);
    235     # because either the OID is stored in the SQL db as meta 'Identifier' alongside other metadata
    236     # or it's stored in the doc.xml as metadata 'Identifier' alongside other metadata
    237     # Either way, Identifier meta will be read into the docobj automatically with other meta.
    238    
    239     if ($self->{'verbosity'} > 2) {
    240     print STDERR "+++++++++++ buildproc_mode: $build_proc_mode\n";
    241     print STDERR "+++++++++++ SQLPlug proc_mode: $proc_mode\n";
    242     }
    243 
    244     # TODO: where does reindexing take place, GreenstoneSQL -Plugout or -Plugin?
    245     #if($build_proc_mode =~ m/(delete|reindex)$/) { # doc denoted by current OID has been marked for deletion or reindexing (=delete + add)
    246     if($build_proc_mode =~ m/(delete)$/) { # doc denoted by current OID has been marked for deletion or reindexing (=delete + add)
    247    
    248           # build_proc_mode could be "(infodb|text)(delete|reindex)"
    249           # "...delete" or "...reindex" as per ArchivesInfPlugin
    250    
    251     print STDERR "@@@@ DELETING DOC FROM SQL DB\n";
    252    
     200    if($proc_mode ne "text_only") {
     201    $gs_sql->ensure_meta_table_exists();
     202    }
     203    if($proc_mode ne "meta_only") {
     204    $gs_sql->ensure_fulltxt_table_exists();
     205    }
     206}
     207
     208# This is called per document for docs that have been deleted from the
     209# collection. Most plugins will do nothing
     210# but if a plugin does any stuff outside of creating doc obj, then it may need
     211# to clear something.
     212# remove the doc(s) denoted by oids from GS SQL db
     213# This takes care of incremental deletes (docs marked D by ArchivesInfPlugin when building
     214# incrementally) as well as cases of "Non-incremental Delete", see ArchivesInfPlugin.pm
     215sub remove_one {
     216    my $self = shift (@_);
     217   
     218    my ($file, $oids, $archivedir) = @_;
     219
     220    print STDERR "@@@ IN SQLPLUG::REMOVE_ONE: $file\n";
     221   
     222    #return undef unless $self->can_process_this_file($file); # NO, DON'T DO THIS:
     223           # WE DON'T CARE IF IT'S AN IMAGE FILE THAT WAS DELETED.
     224           # WE CARE ABOUT REMOVING THE DOCOID OF THAT IMAGE FILE FROM THE DB
     225
     226    print STDERR "*****************************\nAsked to remove_one oid\n***********************\n";
     227   
     228    my $gs_sql = $self->{'gs_sql'} || return 0; # couldn't make the connection or no db etc
     229    if(scalar @$oids > 1) {
     230    print STDERR "TODO: We now have reason to optimise GreenstoneSQLPlugin::remove_one() by using prepare and execute.\n";
     231    }
     232   
     233    my $proc_mode = $self->{'process_mode'};
     234    foreach my $oid (@$oids) { 
    253235    if($proc_mode eq "all" || $proc_mode eq "meta_only") {
    254236        print STDERR "@@@@@@@@ Deleting $oid from meta table\n" if $self->{'verbosity'} > 2;
     
    259241        $gs_sql->delete_recs_from_texttable_with_docid($oid);
    260242    }
    261    
    262     # If we're reindexing the current doc, we will we want to continue: which
    263     # will add this doc ID back into the db with the new meta/full txt values
    264     # But if we're deleting, then we're done processing the document, so set doc_oid to undef
    265     # to prevent adding it back into db
    266     #undef $self->{'doc_oid'} if($build_proc_mode =~ m/delete$/);   
    267    
    268     } # done deleting doc from SQL db   
    269    
    270     else {#if($self->{'doc_oid'}) { # if loading doc from SQL db
    271     print STDERR "@@@@ LOADING DOC FROM SQL DB\n"; 
    272    
    273     if($proc_mode eq "all" || $proc_mode eq "meta_only") {
    274         # read in meta for the collection (i.e. select * from <col>_metadata table
     243    }
     244    return 1;
     245}
     246
     247
     248sub xml_start_tag {
     249    my $self = shift(@_);
     250    my ($expat, $element) = @_;
     251
     252    my $outhandle = $self->{'outhandle'};
     253   
     254    $self->{'element'} = $element;
     255    if ($element eq "Archive") { # docsql.xml files contain a OID attribute on Archive element
     256    # the element's attributes are in %_ as per ReadXMLFile::xml_start_tag() (while $_
     257    # contains the tag)
     258
     259    # Don't access %_{'docoid'} directly: keep getting a warning message to
     260    # use $_{'docoid'} for scalar contexts, but %_ is the element's attr hashmap
     261    # whereas $_ has the tag info. So we don't want to do $_{'docoid'}.
     262    my %attr_hash = %_; # right way, see OAIPlugin.pm
     263    $self->{'doc_oid'} = $attr_hash{'docoid'};
     264    ##print STDERR "XXXXXXXXXXXXXX in SQLPlugin::xml_start_tag()\n";
     265    print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n"
     266        if $self->{'verbosity'} > 2;
     267
     268    }
     269    else { # let superclass GreenstoneXMLPlugin continue to process <Section> and <Metadata> elements
     270    $self->SUPER::xml_start_tag(@_);
     271    }
     272}
     273
     274# TODO Q: Why are there 4 passes when we're only indexing at doc and section level (2 passes)? What's the dummy pass, why is there a pass for infodb?
     275
     276# We should only ever get here during the buildcol.pl phase
     277# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
     278# the doc_obj in memory is processed (indexed) and then made undef.
     279# So we have to work with doc_obj before superclass close_document() is finished.
     280sub close_document {
     281    my $self = shift(@_);
     282
     283    ##print STDERR "XXXXXXXXX in SQLPlugin::close_doc()\n";
     284   
     285    my $gs_sql = $self->{'gs_sql'};
     286   
     287    my $outhandle = $self->{'outhandle'};
     288    my $doc_obj = $self->{'doc_obj'};
     289
     290    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag()
     291    my $proc_mode = $self->{'process_mode'};
     292   
     293    # For now, we have access to doc_obj (until just before super::close_document() terminates)
     294   
     295    # no need to call $self->{'doc_obj'}->set_OID($oid);
     296    # because either the OID is stored in the SQL db as meta 'Identifier' alongside other metadata
     297    # or it's stored in the doc.xml as metadata 'Identifier' alongside other metadata
     298    # Either way, Identifier meta will be read into the docobj automatically with other meta.
     299
     300    print STDERR "   GreenstoneSQLPlugin processing doc $oid (reading into docobj from SQL db)\n"
     301    if $self->{'verbosity'} > 0;
     302   
     303    if($proc_mode eq "all" || $proc_mode eq "meta_only") {
     304    # read in meta for the collection (i.e. select * from <col>_metadata table
     305   
     306    my $sth = $gs_sql->select_from_metatable_matching_docid($oid);
     307    print $outhandle "### SQL select stmt: ".$sth->{'Statement'}."\n"
     308        if $self->{'verbosity'} > 2;
     309   
     310    print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2;
     311    # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
     312    while( my @row = $sth->fetchrow_array() ) {     
     313        #print $outhandle "row: @row\n";
     314        my ($primary_key, $did, $sid, $metaname, $metaval) = @row;
    275315       
    276         my $sth = $gs_sql->select_from_metatable_matching_docid($oid);
    277         print $outhandle "### SQL select stmt: ".$sth->{'Statement'}."\n"
     316        # get rid of the artificial "root" introduced in section id when saving to sql db
     317        $sid =~ s@^root@@;
     318        $sid = $doc_obj->get_top_section() unless $sid;
     319        print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
    278320        if $self->{'verbosity'} > 2;
    279321       
    280         print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2;
    281         # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
    282         while( my @row = $sth->fetchrow_array() ) {     
    283         #print $outhandle "row: @row\n";
    284         my ($primary_key, $did, $sid, $metaname, $metaval) = @row;
     322        # TODO:  we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly:
     323        $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval));
     324    }
     325    print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
     326        if $self->{'verbosity'} > 2;
     327    }
     328   
     329    if($proc_mode eq "all" || $proc_mode eq "text_only") {
     330    # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table
     331   
     332    my $fulltxt_table = $gs_sql->get_fulltext_table_name();
     333   
     334   
     335    my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
     336    print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 2;
     337   
     338    print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
     339        if $self->{'verbosity'} > 2;
     340    while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {       
    285341       
    286         # get rid of the artificial "root" introduced in section id when saving to sql db
    287         $sid =~ s@^root@@;
    288         $sid = $doc_obj->get_top_section() unless $sid;
    289         print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
    290             if $self->{'verbosity'} > 2;
    291        
    292         # TODO:  we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly:
    293         $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval));
    294         }
    295         print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
     342        # get rid of the artificial "root" introduced in section id when saving to sql db
     343        #$sid =~ s@^root@@;
     344        $sid = $doc_obj->get_top_section() if ($sid eq "root");
     345        print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
    296346        if $self->{'verbosity'} > 2;
    297     }
    298    
    299     if($proc_mode eq "all" || $proc_mode eq "text_only") {
    300         # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table
    301347       
    302         my $fulltxt_table = $gs_sql->get_fulltext_table_name();
    303        
    304    
    305         my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
    306         print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 2;
    307        
    308         print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
    309         if $self->{'verbosity'} > 2;
    310         while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {       
    311        
    312         # get rid of the artificial "root" introduced in section id when saving to sql db
    313         #$sid =~ s@^root@@;
    314         $sid = $doc_obj->get_top_section() if ($sid eq "root");
    315         print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
    316             if $self->{'verbosity'} > 2;
    317        
    318         # TODO - pass by ref?
    319         # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly:
    320         $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text));
    321         }   
    322         print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
    323         if $self->{'verbosity'} > 2;
    324     }
    325    
    326     } # done reading into docobj from SQL db
     348        # TODO - pass by ref?
     349        # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly:
     350        $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text));
     351    }   
     352    print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
     353        if $self->{'verbosity'} > 2;
     354    }
     355   
     356    # done reading into docobj from SQL db
    327357   
    328358    # don't forget to clean up on close() in superclass
     
    332362
    333363
    334 # We want SQLPlugin to connect to db only during buildcol.pl phase, not during import.pl
    335 # This works out okay, as close_document() (called by read()) is only invoked during buildcol.pl
    336 #
    337 # Further, we want a single db connection for the GS SQL Plugin to be used for
    338 # the multiple plugin passes: for "dummy" pass, and for doc level and for section level indexing
    339 # By calling the lazy loading get_sql_instance() from close_document(),
    340 # we connect to the SQL database once per GSSQLPlugin and only during the buildcol phase.
    341 #
    342 # get_gssql_instance() is a lazy loading method that returns singleton db connection for a GreenstoneSQLPlugin object. ("Code pattern" get instance vs singleton.)
    343 # One instance of db connection that can be used for all the many doc_objects processed by this plugin
    344 #
    345 # Except in methods get_gssql_instance() and deinit(), don't access self->{'_gs_sql'} directly.
    346 # Instead, call method get_gssql_instance() and store return value in a local variable, my $gs_sql
    347 #
    348 sub get_gssql_instance
    349 {   
    350     my $self = shift(@_);
    351 
    352     # if we failed to successfully connect once before, don't bother attempting to connect again
    353     #return undef if(defined $self->{'failed'}); # plugin/process would have terminated with die()
    354                                   # if we couldn't succeed connecting on any connection attempt
    355    
    356     return $self->{'_gs_sql'} if($self->{'_gs_sql'});
    357 
    358     # assume we'll fail to connect
    359     $self->{'failed'} = 1;
    360 
    361     print STDERR "@@@@@@@@@@ LAZY CONNECT CALLED\n";
    362    
     364# TODO: only want to work with sql db if buildcol.pl. Unfortunately, also runs on import.pl.
     365# During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side
     366# effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db.
     367
     368# GS SQL Plugin::init() (and deinit()) is called by import.pl and also by buildcol.pl
     369# This means it connects and disconnects during import.pl as well. This is okay
     370# as removeold, which should drop the collection tables, happens during the import phase
     371# and therefore also requires a db connection.
     372# TODO: Eventually can try moving get_gssql_instance into gssql.pm? That way both GS SQL Plugin
     373# and Plugout would be using one connection during import.pl phase when both plugs exist.
     374
     375# Call init() not begin() because there can be multiple plugin passes and begin() called for
     376# each pass (one for doc level and another for section level indexing), whereas init() should
     377# be called before any and all passes.
     378# This way, we can connect to the SQL database once per buildcol run.
     379sub init {
     380    my ($self) = shift (@_);
     381    ##print STDERR "@@@@@@@@@@ INIT CALLED\n";
     382   
     383    $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init().
     384
    363385    ####################
    364386#    print "@@@ SITE NAME: ". $self->{'site_name'} . "\n" if defined $self->{'site_name'};
     
    393415   
    394416    my $db_name = $self->{'site_name'} || "greenstone2"; # one database per GS3 site, for GS2 the db is called greenstone2
    395     #my $build_mode = $self->{'build_mode'} || "removeold";
    396 
    397     # the db and its tables should exist. Attempt to use the db:
     417
     418    # Attempt to use the db, create it if it doesn't exist (but don't create the tables yet)
     419    # Bail if we can't use the database
    398420    if(!$gs_sql->use_db($db_name)) {
    399421   
     
    404426    die("Could not use db $db_name. Can't proceed.\n");
    405427    }
    406 
    407     #undef $self->{'failed'};
     428   
    408429   
    409430    # store db handle now that we're connected
    410     $self->{'_gs_sql'} = $gs_sql;
    411     return $gs_sql;
    412    
    413 }
     431    $self->{'gs_sql'} = $gs_sql;   
     432}
     433
    414434
    415435# This method also runs on import.pl if gs_sql has a value. But we just want to run it on buildcol
     
    421441    my ($self) = shift (@_);
    422442   
    423     print STDERR "@@@@@@@@@@ GreenstoneSQLPlugin::DEINIT CALLED\n";
    424    
    425     if($self->{'_gs_sql'}) { # only want to work with sql db if buildcol.pl, gs_sql won't have
     443    ##print STDERR "@@@@@@@@@@ GreenstoneSQLPlugin::DEINIT CALLED\n";
     444   
     445    if($self->{'gs_sql'}) { # only want to work with sql db if buildcol.pl, gs_sql won't have
    426446    # a value except during buildcol, so when processor =~ m/buildproc$/.
    427     $self->{'_gs_sql'}->disconnect_from_db()
     447    $self->{'gs_sql'}->disconnect_from_db()
    428448        || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n");
    429449
    430     # explicitly set to undef so all future use has to make the connection again
    431     undef $self->{'_gs_sql'};
     450    # explicitly delete gs_sql key (setting key to undef has a different meaning from deleting)
     451    # so all future use has to make the connection again
     452    delete $self->{'gs_sql'};
    432453    }
    433454
  • main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm

    r32555 r32563  
    347347}
    348348
    349 # GreenstoneSQLPlugout needs to know whether we're doing removeold or not
    350 sub set_incremental_options {
    351     my $self= shift (@_);
    352     my ($removeold, $keepold, $incremental, $incremental_mode) = @_;
    353    
    354     $self->{'removeold'}        = $removeold;
    355     $self->{'keepold'}          = $keepold;
    356     $self->{'incremental'}      = $incremental;
    357     $self->{'incremental_mode'} = $incremental_mode;
    358 }
    359349
    360350# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned"
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm

    r32560 r32563  
    5353# It's fine: the die() stmts all take place before setting up the super class' begin
    5454
    55 # TODO Q: about build_mode: how to detect removeold
     55# TODO Q: about build_mode: how to detect removeold. Now handled by
     56#   GreenstoneSQLPlugout::remove_all(), which is inherited from a base plugin.
    5657# TODO: deal with -removeold and everything? Or type out instructions for user
    5758
     
    117118        return bless $self, $class;
    118119    }
    119     print STDERR "***** GreenstoneSQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n";
     120    #print STDERR "***** GreenstoneSQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n";
    120121   
    121122    return bless $self, $class;
     
    163164   
    164165    my $db_name = $self->{'site_name'} || "greenstone2"; # one database per GS3 site, for GS2 the db is called greenstone2
    165     my $build_mode = ($self->{'removeold'}) ? "removeold" : "incremental";
    166     print STDERR "@@@@@@@@@@@@ remove_old: $build_mode\n";
    167    
    168     if(!$gs_sql->load_db_and_tables($db_name, $build_mode)) {
    169    
     166    my $proc_mode = $self->{'process_mode'};
     167   
     168
     169    my $success = $gs_sql->use_db($db_name);
     170   
     171    if($success && $proc_mode ne "text_only") {
     172    ##print STDERR "@@@@ Ensuring meta table exists\n";
     173    $success = $gs_sql->ensure_meta_table_exists();
     174    }   
     175    if($success && $proc_mode ne "meta_only") {
     176    ##print STDERR "@@@@ Ensuring fulltxt table exists\n";
     177    $success = $gs_sql->ensure_fulltxt_table_exists();
     178    }
     179    #if(!$gs_sql->load_db_and_tables($db_name, $proc_mode)) {
     180    if(!$success) {
    170181    # This is fatal for the plugout, let's terminate here after disconnecting again
    171182    # PrintError would already have displayed the warning message on load fail
     
    177188    # prepare the shared/common HANDLES to SQL insert statements that contain placeholders
    178189    # and which we will reuse repeatedly when actually executing the insert statements
    179     my $proc_mode = $self->{'process_mode'};
     190   
    180191    if($proc_mode eq "all" || $proc_mode eq "meta_only" ) {
    181192    $self->{'metadata_prepared_insert_statement_handle'} = $gs_sql->prepare_insert_metadata_row_stmthandle();
     
    188199    $self->{'gs_sql'} = $gs_sql;
    189200   
    190     print STDERR "#### Meta stmt: " . $self->{'metadata_prepared_insert_statement_handle'}->{'Statement'} . "\n";
    191     print STDERR "#### Full stmt: " . $self->{'fulltxt_prepared_insert_statement_handle'}->{'Statement'} . "\n";
     201    ##print STDERR "#### Meta stmt: " . $self->{'metadata_prepared_insert_statement_handle'}->{'Statement'} . "\n";
     202    ##print STDERR "#### Full stmt: " . $self->{'fulltxt_prepared_insert_statement_handle'}->{'Statement'} . "\n";
    192203   
    193204    # if setting up to work with sql db failed, we'd have terminated and wouldn't get this far:
Note: See TracChangeset for help on using the changeset viewer.