Changeset 32563

Show
Ignore:
Timestamp:
02.11.2018 19:07:16 (2 weeks ago)
Author:
ak19
Message:

1. Overhaul of GreenstoneSQLPlugs to handle removeold and incremental delete correctly. The code now also automatically handles 'non-incremental delete' (see the mention in ArchivesInfPlugin). The new version no longer lazily loads the SQL db connection in the GS SQL Plugin, because the connection now needs to be active from the start of the plugin in order to run SQL delete statements on remove_old. The db connection code for the GS SQL Plugin has therefore moved back into its init() method. Lots of changes to gssql.pm (with some flow-on effects to the GS SQL Plugout), since the points at which the database tables are expected to exist and at which they get created have changed. 2. Undoing most of the changes of changeset 32555, since incremental delete and removeold are now done differently, and correctly, when using the GreenstoneSQLPlugs.

Location:
main/trunk/greenstone2/perllib
Files:
5 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/gssql.pm

    r32561 r32563  
    4444 
    4545# Parameterise (one or more methods may use them): 
    46 # - build_mode (like removeold) 
    4746# - db_name (which is the GS3 sitename) 
    4847 
     
    134133    print STDERR "Away to make connection to $db_driver database with:\n"; 
    135134    print STDERR " - hostname $db_host; username: $db_user"; 
    136     print STDERR "; and the password provided" if $db_pwd; 
     135    print STDERR "; and the provided password" if $db_pwd; 
    137136    print STDERR "\nAssuming the mysql server has been started with: --character_set_server=utf8mb4\n" if $db_driver eq "mysql"; 
    138137    } 
     
    188187} 
    189188 
    190 # will attempt to load the specified db and the <coll>_metadata and <coll>_fulltxt for this 
    191 # collection, or create any of these (db, tables) that don't yet exist. At the end 
    192 # it will have loaded the requested database (in MySQL: "use <db>;") 
    193 sub load_db_and_tables { 
    194     my $self= shift (@_); 
    195     my ($db_name, $build_mode) = @_; 
     189# Load the designated database, i.e. 'use <dbname>;'. 
     190# If the database doesn't yet exist, creates it and loads it. 
     191# (Don't create the collection's tables yet, though) 
     192# At the end it will have loaded the requested database (in MySQL: "use <db>;") on success. 
     193# As usual, returns success or failure value that can be evaluated in a boolean context. 
     194sub use_db { 
     195    my $self= shift (@_); 
     196    my ($db_name) = @_; 
    196197    my $dbh = $self->{'db_handle'}; 
    197198    $db_name = $self->sanitize_name($db_name); 
     
    215216    # once more attempt to use db, now that it exists 
    216217    $dbh->do("use $db_name") || return 0; 
    217     #$dbh->do("use localsite") or die "Error (code" . $dbh->err ."): " . $dbh->errstr . "\n"; 
    218  
    219     # attempt to create tables in current db 
    220     $self->create_metadata_table() || return 0; 
    221     $self->create_fulltext_table() || return 0;  
     218    #$dbh->do("use $db_name") or die "Error (code" . $dbh->err ."): " . $dbh->errstr . "\n"; 
    222219 
    223220    $success = 1; 
     
    227224 
    228225    print STDERR "@@@ DATABASE $db_name EXISTED\n" if($self->{'verbosity'} > 2); 
    229      
    230  
    231     # build_mode can be removeold or incremental. We only do something special on removeold: 
    232     # deleting the existing tables for this collection and recreating empty ones 
    233     if($build_mode eq "removeold") { 
    234         print STDERR "   Building with removeold option set, so deleting current collection's tables if they exist\n" if($self->{'verbosity'}); 
    235         $self->delete_collection_tables(); 
    236     } 
    237  
    238     # use existing tables if any 
    239     # attempt to create tables in current db     
    240     if($build_mode eq "removeold" || !$self->table_exists($self->get_metadata_table_name())) { 
    241         $self->create_metadata_table() || return 0; 
    242     } else { 
    243         print STDERR "@@@ Meta table exists\n" if($self->{'verbosity'} > 2); 
    244     } 
    245     if($build_mode eq "removeold" || !$self->table_exists($self->get_fulltext_table_name())) { 
    246         $self->create_fulltext_table() || return 0; 
    247     } else { 
    248         print STDERR "@@@ Fulltxt table exists\n" if($self->{'verbosity'} > 2); 
    249     } 
    250      
    251226    } 
    252227     
     
    254229} 
    255230 
    256 # GreenstoneSQLPlugin calls this method to load an existing db. 
    257 # This will terminate if the db does not exist. Unlike load_db_and_tables() above, used by 
    258 # GreenstoneSQLPlugout, this method will not attempt to create the requested db (nor its tables) 
    259 # TODO: GS SQLPlugin is called before GS SQLPlugout and attempts to use_db() - called in plugin's 
    260 # init() method. This will fail if the db does not exist. Ideally want the gssqlplugin only called 
    261 # during buildcol.pl 
    262 sub use_db { 
    263     my $self= shift (@_); 
    264     my ($db_name) = @_; 
    265     my $dbh = $self->{'db_handle'}; 
    266     $db_name = $self->sanitize_name($db_name); 
    267  
    268      
    269     print STDERR "Loading database $db_name\n" if($self->{'verbosity'} > 1); 
    270      
    271     # perl DBI switch database: https://www.perlmonks.org/?node_id=995434 
    272     # do() returns undef on error. 
    273     # connection succeeded, try to load our database. If that didn't work, attempt to create db 
    274     return $dbh->do("use $db_name") || warn(); 
     231# We should already have done "use <database>;" if this gets called. 
     232# Just make sure this collection's metadata table exists, creating it if it doesn't 
     233sub ensure_meta_table_exists { 
     234    my $self = shift (@_); 
     235     
     236    my $tablename = $self->get_metadata_table_name(); 
     237    if(!$self->table_exists($tablename)) { 
     238    #print STDERR "   Creating metadata table $tablename\n" if($self->{'verbosity'} > 1); 
     239    $self->create_metadata_table() || return 0; 
     240    } else { 
     241    print STDERR "@@@ Meta table exists\n" if($self->{'verbosity'} > 2); 
     242    } 
     243    return 1; 
     244} 
     245 
     246# We should already have done "use <database>;" if this gets called. 
     247# Just make sure this collection's fulltxt table exists, creating it if it doesn't 
     248sub ensure_fulltxt_table_exists { 
     249    my $self = shift (@_); 
     250     
     251    my $tablename = $self->get_fulltext_table_name();     
     252    if(!$self->table_exists($tablename)) { 
     253    #print STDERR "   Creating fulltxt table $tablename\n" if($self->{'verbosity'} > 1); 
     254    $self->create_fulltext_table() || return 0; 
     255    } else { 
     256    print STDERR "@@@ Fulltxt table exists\n" if($self->{'verbosity'} > 2); 
     257    } 
     258    return 1; 
    275259} 
    276260 
  • main/trunk/greenstone2/perllib/inexport.pm

    r32555 r32563  
    690690    } 
    691691    my $processor = &plugout::load_plugout($plugout); 
    692     $processor->set_incremental_options($removeold, $keepold, $incremental, $incremental_mode); 
    693692    $processor->setoutputdir ($archivedir); 
    694693    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta; 
  • main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm

    r32560 r32563  
    4040# TODO: 
    4141# - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge. 
    42 # - "Courier" demo documents in lucene-sql collection: character (degree symbol) not preserved in title. Is this because we encode in utf8 when putting into db and reading back in? 
    43 # Test doc with meta and text like macron in Maori text. 
    4442# - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that  
    4543# back in from the sql db while the remainder is to be read back in from the docsql .xml files. 
     
    4745# TODO: deal with incremental vs removeold. If docs removed from import folder, then import step 
    4846# won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is 
    49  
    50 # TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking, basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work. 
    51 # Discuss the plugin/plugout parameters. 
    52  
    53 # + NOTTODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt? 
    54 # NOT A PROBLEM: Tested to find DBI connection attempt fails immediately when MySQL server not  
    55 # running. The GLI "paralyzing" incident last time was not because of the gs sql connection code, 
    56 # but because my computer was freezing on-and-off. 
    5747 
    5848# TODO Q: is "reindex" = del from db + add to db? 
     
    6353# be reimplemented in GSSQLPlugin to support the adding portion of reindexing. 
    6454 
    65  
    66 # TODO Q: During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side 
    67 # effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db. 
    68  
    69  
     55# TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking: basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work. 
     56# Discuss the plugin/plugout parameters. 
     57 
     58 
     59# DONE: 
    7060# + TODO: Incremental delete can't work until GSSQLPlugout has implemented build_mode = incremental 
    7161# (instead of tossing away db on every build) 
     
    7969# (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order. 
    8070# YES: Otherwise for later db types (drivers), can set order by primary key column and then order by did column 
     71# + NOTTODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt? 
     72#   NOT A PROBLEM: Tested to find DBI connection attempt fails immediately when MySQL server not  
     73# running. The GLI "paralyzing" incident last time was not because of the gs sql connection code, 
     74# but because my computer was freezing on-and-off. 
     75# + "Courier" demo documents in lucene-sql collection: character (degree symbol) not preserved in title. Is this because we encode in utf8 when putting into db and reading back in? 
     76# Test doc with meta and text like macron in Maori text. 
     77# + TODO Q: During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side 
     78# effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db. 
     79#   This got fixed when GSSQLPlugin stopped connecting on init(). 
    8180 
    8281 
     
    177176 
    178177    # do anything else that needs to be done here when not pluginfo 
    179     #$self->{'delete_docids'} = (); # list of doc oids to delete during deinit() 
    180178     
    181179    return $self; 
    182180} 
    183181 
    184 sub xml_start_tag { 
    185     my $self = shift(@_); 
    186     my ($expat, $element) = @_; 
    187  
    188     my $outhandle = $self->{'outhandle'}; 
    189      
    190     $self->{'element'} = $element; 
    191     if ($element eq "Archive") { # docsql.xml files contain a OID attribute on Archive element 
    192     # the element's attributes are in %_ as per ReadXMLFile::xml_start_tag() (while $_ 
    193     # contains the tag) 
    194  
    195     # Don't access %_{'docoid'} directly: keep getting a warning message to  
    196     # use $_{'docoid'} for scalar contexts, but %_ is the element's attr hashmap  
    197     # whereas $_ has the tag info. So we don't want to do $_{'docoid'}. 
    198     my %attr_hash = %_; # right way, see OAIPlugin.pm 
    199     $self->{'doc_oid'} = $attr_hash{'docoid'}; 
    200     print STDERR "XXXXXXXXXXXXXX in SQLPlugin::xml_start_tag()\n"; 
    201     print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n" 
    202         if $self->{'verbosity'} > 2; 
    203  
    204     } 
    205     else { # let superclass GreenstoneXMLPlugin continue to process <Section> and <Metadata> elements 
    206     $self->SUPER::xml_start_tag(@_); 
    207     } 
    208 } 
    209  
    210 # TODO Q: Why are there 4 passes when we're only indexing at doc and section level (2 passes)? What's the dummy pass, why is there a pass for infodb? 
    211  
    212 # At the end of superclass GreenstoneXMLPlugin.pm's close_document() method, 
    213 # the doc_obj in memory is processed (indexed) and then made undef. 
    214 # So we have to work with doc_obj before superclass close_document() is finished. 
    215 sub close_document { 
    216     my $self = shift(@_); 
    217  
    218     print STDERR "XXXXXXXXX in SQLPlugin::close_doc()\n"; 
    219      
    220     my $gs_sql = $self->get_gssql_instance();  
    221      
    222     my $outhandle = $self->{'outhandle'}; 
    223     my $doc_obj = $self->{'doc_obj'}; 
    224  
    225     my $build_proc_mode = $self->{'processor'}->get_mode(); # can be "text" as per basebuildproc or infodb 
    226     my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag() 
     182# This is called once if removeold is set with import.pl. Most plugins will do 
     183# nothing but if a plugin does any stuff outside of creating doc obj, then  
     184# it may need to clear something. 
     185# In the case of GreenstoneSQL plugs: this is the first time we have a chance 
     186# to purge the tables of the current collection from the current site's database 
     187sub remove_all { 
     188    my $self = shift (@_); 
     189    my ($pluginfo, $base_dir, $processor, $maxdocs) = @_; 
     190 
     191    print STDERR "   Building with removeold option set, so deleting current collection's tables if they exist\n" if($self->{'verbosity'}); 
     192     
     193    # if we're in here, we'd already have run 'use <site_name>;' during sub init() 
     194    # so we can go ahead and delete the collection's tables 
     195    my $gs_sql = $self->{'gs_sql'}; 
     196    $gs_sql->delete_collection_tables(); # will delete them if they exist 
     197 
     198    # and recreate tables? No. Tables' existence is ensured in GreenstoneSQLPlugout::begin() 
    227199    my $proc_mode = $self->{'process_mode'}; 
    228      
    229     print $outhandle "++++ OID of document (meta|text) to be del or read in from DB: ".$self->{'doc_oid'}."\n" 
    230     if $self->{'verbosity'} > 2; 
    231      
    232     # For now, we have access to doc_obj (until just before super::close_document() terminates) 
    233      
    234     # no need to call $self->{'doc_obj'}->set_OID($oid); 
    235     # because either the OID is stored in the SQL db as meta 'Identifier' alongside other metadata 
    236     # or it's stored in the doc.xml as metadata 'Identifier' alongside other metadata 
    237     # Either way, Identifier meta will be read into the docobj automatically with other meta. 
    238      
    239     if ($self->{'verbosity'} > 2) { 
    240     print STDERR "+++++++++++ buildproc_mode: $build_proc_mode\n"; 
    241     print STDERR "+++++++++++ SQLPlug proc_mode: $proc_mode\n"; 
    242     } 
    243  
    244     # TODO: where does reindexing take place, GreenstoneSQL -Plugout or -Plugin? 
    245     #if($build_proc_mode =~ m/(delete|reindex)$/) { # doc denoted by current OID has been marked for deletion or reindexing (=delete + add) 
    246     if($build_proc_mode =~ m/(delete)$/) { # doc denoted by current OID has been marked for deletion or reindexing (=delete + add) 
    247      
    248           # build_proc_mode could be "(infodb|text)(delete|reindex)" 
    249           # "...delete" or "...reindex" as per ArchivesInfPlugin 
    250      
    251     print STDERR "@@@@ DELETING DOC FROM SQL DB\n"; 
    252      
     200    if($proc_mode ne "text_only") { 
     201    $gs_sql->ensure_meta_table_exists(); 
     202    } 
     203    if($proc_mode ne "meta_only") { 
     204    $gs_sql->ensure_fulltxt_table_exists(); 
     205    } 
     206} 
     207 
     208# This is called per document for docs that have been deleted from the  
     209# collection. Most plugins will do nothing 
     210# but if a plugin does any stuff outside of creating doc obj, then it may need 
     211# to clear something. 
     212# remove the doc(s) denoted by oids from GS SQL db 
     213# This takes care of incremental deletes (docs marked D by ArchivesInfPlugin when building 
     214# incrementally) as well as cases of "Non-incremental Delete", see ArchivesInfPlugin.pm 
     215sub remove_one { 
     216    my $self = shift (@_); 
     217     
     218    my ($file, $oids, $archivedir) = @_; 
     219 
     220    print STDERR "@@@ IN SQLPLUG::REMOVE_ONE: $file\n"; 
     221     
     222    #return undef unless $self->can_process_this_file($file); # NO, DON'T DO THIS:  
     223           # WE DON'T CARE IF IT'S AN IMAGE FILE THAT WAS DELETED. 
     224           # WE CARE ABOUT REMOVING THE DOCOID OF THAT IMAGE FILE FROM THE DB 
     225 
     226    print STDERR "*****************************\nAsked to remove_one oid\n***********************\n"; 
     227     
     228    my $gs_sql = $self->{'gs_sql'} || return 0; # couldn't make the connection or no db etc 
     229    if(scalar @$oids > 1) { 
     230    print STDERR "TODO: We now have reason to optimise GreenstoneSQLPlugin::remove_one() by using prepare and execute.\n"; 
     231    } 
     232     
     233    my $proc_mode = $self->{'process_mode'}; 
     234    foreach my $oid (@$oids) {   
    253235    if($proc_mode eq "all" || $proc_mode eq "meta_only") { 
    254236        print STDERR "@@@@@@@@ Deleting $oid from meta table\n" if $self->{'verbosity'} > 2; 
     
    259241        $gs_sql->delete_recs_from_texttable_with_docid($oid); 
    260242    } 
    261      
    262     # If we're reindexing the current doc, we will we want to continue: which 
    263     # will add this doc ID back into the db with the new meta/full txt values 
    264     # But if we're deleting, then we're done processing the document, so set doc_oid to undef 
    265     # to prevent adding it back into db 
    266     #undef $self->{'doc_oid'} if($build_proc_mode =~ m/delete$/);    
    267      
    268     } # done deleting doc from SQL db     
    269      
    270     else {#if($self->{'doc_oid'}) { # if loading doc from SQL db 
    271     print STDERR "@@@@ LOADING DOC FROM SQL DB\n";   
    272      
    273     if($proc_mode eq "all" || $proc_mode eq "meta_only") { 
    274         # read in meta for the collection (i.e. select * from <col>_metadata table 
     243    } 
     244    return 1; 
     245} 
     246 
     247 
     248sub xml_start_tag { 
     249    my $self = shift(@_); 
     250    my ($expat, $element) = @_; 
     251 
     252    my $outhandle = $self->{'outhandle'}; 
     253     
     254    $self->{'element'} = $element; 
     255    if ($element eq "Archive") { # docsql.xml files contain a OID attribute on Archive element 
     256    # the element's attributes are in %_ as per ReadXMLFile::xml_start_tag() (while $_ 
     257    # contains the tag) 
     258 
     259    # Don't access %_{'docoid'} directly: keep getting a warning message to  
     260    # use $_{'docoid'} for scalar contexts, but %_ is the element's attr hashmap  
     261    # whereas $_ has the tag info. So we don't want to do $_{'docoid'}. 
     262    my %attr_hash = %_; # right way, see OAIPlugin.pm 
     263    $self->{'doc_oid'} = $attr_hash{'docoid'}; 
     264    ##print STDERR "XXXXXXXXXXXXXX in SQLPlugin::xml_start_tag()\n"; 
     265    print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n" 
     266        if $self->{'verbosity'} > 2; 
     267 
     268    } 
     269    else { # let superclass GreenstoneXMLPlugin continue to process <Section> and <Metadata> elements 
     270    $self->SUPER::xml_start_tag(@_); 
     271    } 
     272} 
     273 
     274# TODO Q: Why are there 4 passes when we're only indexing at doc and section level (2 passes)? What's the dummy pass, why is there a pass for infodb? 
     275 
     276# We should only ever get here during the buildcol.pl phase 
     277# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method, 
     278# the doc_obj in memory is processed (indexed) and then made undef. 
     279# So we have to work with doc_obj before superclass close_document() is finished. 
     280sub close_document { 
     281    my $self = shift(@_); 
     282 
     283    ##print STDERR "XXXXXXXXX in SQLPlugin::close_doc()\n"; 
     284     
     285    my $gs_sql = $self->{'gs_sql'}; 
     286     
     287    my $outhandle = $self->{'outhandle'}; 
     288    my $doc_obj = $self->{'doc_obj'}; 
     289 
     290    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag() 
     291    my $proc_mode = $self->{'process_mode'}; 
     292     
     293    # For now, we have access to doc_obj (until just before super::close_document() terminates) 
     294     
     295    # no need to call $self->{'doc_obj'}->set_OID($oid); 
     296    # because either the OID is stored in the SQL db as meta 'Identifier' alongside other metadata 
     297    # or it's stored in the doc.xml as metadata 'Identifier' alongside other metadata 
     298    # Either way, Identifier meta will be read into the docobj automatically with other meta. 
     299 
     300    print STDERR "   GreenstoneSQLPlugin processing doc $oid (reading into docobj from SQL db)\n" 
     301    if $self->{'verbosity'} > 0; 
     302     
     303    if($proc_mode eq "all" || $proc_mode eq "meta_only") { 
     304    # read in meta for the collection (i.e. select * from <col>_metadata table 
     305     
     306    my $sth = $gs_sql->select_from_metatable_matching_docid($oid); 
     307    print $outhandle "### SQL select stmt: ".$sth->{'Statement'}."\n" 
     308        if $self->{'verbosity'} > 2; 
     309     
     310    print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2; 
     311    # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/ 
     312    while( my @row = $sth->fetchrow_array() ) {      
     313        #print $outhandle "row: @row\n"; 
     314        my ($primary_key, $did, $sid, $metaname, $metaval) = @row; 
    275315         
    276         my $sth = $gs_sql->select_from_metatable_matching_docid($oid); 
    277         print $outhandle "### SQL select stmt: ".$sth->{'Statement'}."\n" 
     316        # get rid of the artificial "root" introduced in section id when saving to sql db 
     317        $sid =~ s@^root@@; 
     318        $sid = $doc_obj->get_top_section() unless $sid; 
     319        print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n" 
    278320        if $self->{'verbosity'} > 2; 
    279321         
    280         print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2; 
    281         # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/ 
    282         while( my @row = $sth->fetchrow_array() ) {      
    283         #print $outhandle "row: @row\n"; 
    284         my ($primary_key, $did, $sid, $metaname, $metaval) = @row; 
     322        # TODO:  we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly: 
     323        $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval)); 
     324    } 
     325    print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n" 
     326        if $self->{'verbosity'} > 2; 
     327    } 
     328     
     329    if($proc_mode eq "all" || $proc_mode eq "text_only") { 
     330    # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table 
     331     
     332    my $fulltxt_table = $gs_sql->get_fulltext_table_name(); 
     333     
     334     
     335    my $sth = $gs_sql->select_from_texttable_matching_docid($oid); 
     336    print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 2; 
     337     
     338    print $outhandle "----------\nSQL DB contains txt entries for-----------\n" 
     339        if $self->{'verbosity'} > 2; 
     340    while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {        
    285341         
    286         # get rid of the artificial "root" introduced in section id when saving to sql db 
    287         $sid =~ s@^root@@; 
    288         $sid = $doc_obj->get_top_section() unless $sid; 
    289         print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n" 
    290             if $self->{'verbosity'} > 2; 
    291          
    292         # TODO:  we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly: 
    293         $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval)); 
    294         } 
    295         print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n" 
     342        # get rid of the artificial "root" introduced in section id when saving to sql db 
     343        #$sid =~ s@^root@@; 
     344        $sid = $doc_obj->get_top_section() if ($sid eq "root"); 
     345        print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n" 
    296346        if $self->{'verbosity'} > 2; 
    297     } 
    298      
    299     if($proc_mode eq "all" || $proc_mode eq "text_only") { 
    300         # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table 
    301347         
    302         my $fulltxt_table = $gs_sql->get_fulltext_table_name(); 
    303          
    304      
    305         my $sth = $gs_sql->select_from_texttable_matching_docid($oid); 
    306         print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 2; 
    307          
    308         print $outhandle "----------\nSQL DB contains txt entries for-----------\n" 
    309         if $self->{'verbosity'} > 2; 
    310         while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {        
    311          
    312         # get rid of the artificial "root" introduced in section id when saving to sql db 
    313         #$sid =~ s@^root@@; 
    314         $sid = $doc_obj->get_top_section() if ($sid eq "root"); 
    315         print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n" 
    316             if $self->{'verbosity'} > 2; 
    317          
    318         # TODO - pass by ref? 
    319         # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly: 
    320         $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text)); 
    321         }    
    322         print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n" 
    323         if $self->{'verbosity'} > 2; 
    324     } 
    325      
    326     } # done reading into docobj from SQL db 
     348        # TODO - pass by ref? 
     349        # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly: 
     350        $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text)); 
     351    }    
     352    print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n" 
     353        if $self->{'verbosity'} > 2; 
     354    } 
     355     
     356    # done reading into docobj from SQL db 
    327357     
    328358    # don't forget to clean up on close() in superclass 
     
    332362 
    333363 
    334 # We want SQLPlugin to connect to db only during buildcol.pl phase, not during import.pl 
    335 # This works out okay, as close_document() (called by read()) is only invoked during buildcol.pl 
    336 # 
    337 # Further, we want a single db connection for the GS SQL Plugin to be used for  
    338 # the multiple plugin passes: for "dummy" pass, and for doc level and for section level indexing 
    339 # By calling the lazy loading get_sql_instance() from close_document(), 
    340 # we connect to the SQL database once per GSSQLPlugin and only during the buildcol phase. 
    341 # 
    342 # get_gssql_instance() is a lazy loading method that returns singleton db connection for a GreenstoneSQLPlugin object. ("Code pattern" get instance vs singleton.) 
    343 # One instance of db connection that can be used for all the many doc_objects processed by this plugin 
    344 # 
    345 # Except in methods get_gssql_instance() and deinit(), don't access self->{'_gs_sql'} directly. 
    346 # Instead, call method get_gssql_instance() and store return value in a local variable, my $gs_sql 
    347 # 
    348 sub get_gssql_instance 
    349 {     
    350     my $self = shift(@_); 
    351  
    352     # if we failed to successfully connect once before, don't bother attempting to connect again 
    353     #return undef if(defined $self->{'failed'}); # plugin/process would have terminated with die() 
    354                                   # if we couldn't succeed connecting on any connection attempt 
    355      
    356     return $self->{'_gs_sql'} if($self->{'_gs_sql'}); 
    357  
    358     # assume we'll fail to connect 
    359     $self->{'failed'} = 1; 
    360  
    361     print STDERR "@@@@@@@@@@ LAZY CONNECT CALLED\n"; 
    362      
     364# TODO: only want to work with sql db if buildcol.pl. Unfortunately, also runs on import.pl. 
     365# During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side 
     366# effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db. 
     367 
     368# GS SQL Plugin::init() (and deinit()) is called by import.pl and also by buildcol.pl 
     369# This means it connects and deconnects during import.pl as well. This is okay 
     370# as removeold, which should drop the collection tables, happens during the import phase 
     371# and therefore also requires a db connection. 
     372# TODO: Eventually can try moving get_gssql_instance into gssql.pm? That way both GS SQL Plugin 
     373# and Plugout would be using one connection during import.pl phase when both plugs exist. 
     374 
     375# Call init() not begin() because there can be multiple plugin passes and begin() called for 
     376# each pass (one for doc level and another for section level indexing), whereas init() should 
     377# be called before any and all passes. 
     378# This way, we can connect to the SQL database once per buildcol run. 
     379sub init { 
     380    my ($self) = shift (@_); 
     381    ##print STDERR "@@@@@@@@@@ INIT CALLED\n"; 
     382     
     383    $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init(). 
     384 
    363385    #################### 
    364386#    print "@@@ SITE NAME: ". $self->{'site_name'} . "\n" if defined $self->{'site_name'}; 
     
    393415     
    394416    my $db_name = $self->{'site_name'} || "greenstone2"; # one database per GS3 site, for GS2 the db is called greenstone2 
    395     #my $build_mode = $self->{'build_mode'} || "removeold"; 
    396  
    397     # the db and its tables should exist. Attempt to use the db: 
     417 
     418    # Attempt to use the db, create it if it doesn't exist (but don't create the tables yet) 
     419    # Bail if we can't use the database 
    398420    if(!$gs_sql->use_db($db_name)) { 
    399421     
     
    404426    die("Could not use db $db_name. Can't proceed.\n"); 
    405427    } 
    406  
    407     #undef $self->{'failed'}; 
     428     
    408429     
    409430    # store db handle now that we're connected 
    410     $self->{'_gs_sql'} = $gs_sql; 
    411     return $gs_sql; 
    412      
    413 } 
     431    $self->{'gs_sql'} = $gs_sql;     
     432} 
     433 
    414434 
    415435# This method also runs on import.pl if gs_sql has a value. But we just want to run it on buildcol 
     
    421441    my ($self) = shift (@_); 
    422442     
    423     print STDERR "@@@@@@@@@@ GreenstoneSQLPlugin::DEINIT CALLED\n"; 
    424      
    425     if($self->{'_gs_sql'}) { # only want to work with sql db if buildcol.pl, gs_sql won't have 
     443    ##print STDERR "@@@@@@@@@@ GreenstoneSQLPlugin::DEINIT CALLED\n"; 
     444     
     445    if($self->{'gs_sql'}) { # only want to work with sql db if buildcol.pl, gs_sql won't have 
    426446    # a value except during buildcol, so when processor =~ m/buildproc$/. 
    427     $self->{'_gs_sql'}->disconnect_from_db() 
     447    $self->{'gs_sql'}->disconnect_from_db() 
    428448        || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n"); 
    429449 
    430     # explicitly set to undef so all future use has to make the connection again 
    431     undef $self->{'_gs_sql'}; 
     450    # explicitly delete gs_sql key (setting key to undef has a different meaning from deleting) 
     451    # so all future use has to make the connection again 
     452    delete $self->{'gs_sql'}; 
    432453    } 
    433454 
  • main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm

    r32555 r32563  
    347347} 
    348348 
    349 # GreenstoneSQLPlugout needs to know whether we're doing removeold or not 
    350 sub set_incremental_options { 
    351     my $self= shift (@_); 
    352     my ($removeold, $keepold, $incremental, $incremental_mode) = @_; 
    353      
    354     $self->{'removeold'}        = $removeold; 
    355     $self->{'keepold'}          = $keepold; 
    356     $self->{'incremental'}      = $incremental; 
    357     $self->{'incremental_mode'} = $incremental_mode; 
    358 } 
    359349 
    360350# OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned" 
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm

    r32560 r32563  
    5353# It's fine: the die() stmts all take place before setting up the super class' begin 
    5454 
    55 # TODO Q: about build_mode: how to detect removeold 
     55# TODO Q: about build_mode: how to detect removeold. Now handled by  
     56#   GreenstoneSQLPlugout::remove_all(), which is inherited from a base plugin. 
    5657# TODO: deal with -removeold and everything? Or type out instructions for user 
    5758 
     
    117118        return bless $self, $class; 
    118119    } 
    119     print STDERR "***** GreenstoneSQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n"; 
     120    #print STDERR "***** GreenstoneSQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n"; 
    120121     
    121122    return bless $self, $class; 
     
    163164     
    164165    my $db_name = $self->{'site_name'} || "greenstone2"; # one database per GS3 site, for GS2 the db is called greenstone2 
    165     my $build_mode = ($self->{'removeold'}) ? "removeold" : "incremental"; 
    166     print STDERR "@@@@@@@@@@@@ remove_old: $build_mode\n"; 
    167      
    168     if(!$gs_sql->load_db_and_tables($db_name, $build_mode)) { 
    169      
     166    my $proc_mode = $self->{'process_mode'}; 
     167     
     168 
     169    my $success = $gs_sql->use_db($db_name); 
     170     
     171    if($success && $proc_mode ne "text_only") { 
     172    ##print STDERR "@@@@ Ensuring meta table exists\n"; 
     173    $success = $gs_sql->ensure_meta_table_exists(); 
     174    }     
     175    if($success && $proc_mode ne "meta_only") { 
     176    ##print STDERR "@@@@ Ensuring fulltxt table exists\n"; 
     177    $success = $gs_sql->ensure_fulltxt_table_exists(); 
     178    } 
     179    #if(!$gs_sql->load_db_and_tables($db_name, $proc_mode)) { 
     180    if(!$success) {  
    170181    # This is fatal for the plugout, let's terminate here after disconnecting again 
    171182    # PrintError would already have displayed the warning message on load fail 
     
    177188    # prepare the shared/common HANDLES to SQL insert statements that contain placeholders 
    178189    # and which we will reuse repeatedly when actually executing the insert statements 
    179     my $proc_mode = $self->{'process_mode'}; 
     190     
    180191    if($proc_mode eq "all" || $proc_mode eq "meta_only" ) { 
    181192    $self->{'metadata_prepared_insert_statement_handle'} = $gs_sql->prepare_insert_metadata_row_stmthandle(); 
     
    188199    $self->{'gs_sql'} = $gs_sql; 
    189200     
    190     print STDERR "#### Meta stmt: " . $self->{'metadata_prepared_insert_statement_handle'}->{'Statement'} . "\n"; 
    191     print STDERR "#### Full stmt: " . $self->{'fulltxt_prepared_insert_statement_handle'}->{'Statement'} . "\n"; 
     201    ##print STDERR "#### Meta stmt: " . $self->{'metadata_prepared_insert_statement_handle'}->{'Statement'} . "\n"; 
     202    ##print STDERR "#### Full stmt: " . $self->{'fulltxt_prepared_insert_statement_handle'}->{'Statement'} . "\n"; 
    192203     
    193204    # if setting up to work with sql db failed, we'd have terminated and wouldn't come up to here: