- Timestamp:
- 2018-10-30T19:29:56+13:00 (5 years ago)
- Location:
- main/trunk/greenstone2/perllib
- Files:
-
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/gssql.pm
r32544 r32555 186 186 187 187 188 # build_mode can be removeold or incremental. We only do something special on removeold: 189 # deleting the existing tables for this collection and recreating empty ones 188 190 if($build_mode eq "removeold") { 189 191 $self->delete_collection_tables(); -
main/trunk/greenstone2/perllib/inexport.pm
r32540 r32555 690 690 } 691 691 my $processor = &plugout::load_plugout($plugout); 692 $processor->set_incremental_options($removeold, $keepold, $incremental, $incremental_mode); 692 693 $processor->setoutputdir ($archivedir); 693 694 $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta; -
main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm
r32544 r32555 40 40 # TODO: 41 41 # - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge. 42 # - "Courier" demo documents in lucene-sql collection: character (degree symbol) not preserved in title. Is this because we encode in utf8 when putting into db and reading back in? 43 # Test doc with meta and text like macron in Maori text. 44 # - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that 45 # back in from the sql db while the remainder is to be read back in from the docsql .xml files. 46 47 # TODO: deal with incremental vs removeold. If docs removed from import folder, then import step 48 # won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is 49 50 # TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking, basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work. 51 # Discuss the plugin/plugout parameters. 52 53 # TODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt? 54 55 # TODO Q: is "reindex" = del from db + add to db? 56 # - is this okay for reindexing, or will it need to modify existing values (update table) 57 # - if it's okay, what does reindex need to accomplish (and how) if the OID changes because hash id produced is different? 58 # - delete is accomplished in GS SQL Plugin, during buildcol.pl. When should reindexing take place? 59 # during SQL plugout/import.pl or during plugin? If adding is done by GSSQLPlugout, does it need to 60 # be reimplemented in GSSQLPlugin to support the adding portion of reindexing. 61 62 63 # TODO Q: During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side 64 # effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db. 65 66 67 # + TODO: Incremental delete can't work until GSSQLPlugout has implemented build_mode = incremental 68 # (instead of tossing away db on every build) 42 69 # + Ask about docsql naming convention adopted to identify OID. Better way? 43 70 # collection names -> table names: it seems hyphens not allowed. Changed to underscores. 44 71 # + Startup parameters (except removeold/build_mode) 45 # -how do we detect we're to do removeold during plugout in import.pl phase46 # -incremental building: where do we need to add code to delete rows from our sql table after72 # + how do we detect we're to do removeold during plugout in import.pl phase 73 # + incremental building: where do we need to add code to delete rows from our sql table after 47 74 # incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta? 48 # - "Courier" demo documents in lucene-sql collection: character (degree symbol) not preserved in title. Is this because we encode in utf8 when putting into db and reading back in? 49 # - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that 50 # back in from the sql db while the remainder is to be read back in from the docsql .xml files. 51 # - Ask if I can assume that all SQL dbs (not just MySQL) will preserve the order of inserted nodes 52 # (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order 53 54 # TODO: deal with incremental vs removeold. If docs removed from import folder, then import step 55 # won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is 56 57 # TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking, basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work. 58 # Discuss the plugin/plugout parameters. 59 60 # TODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt? 75 # + Ask if I can assume that all SQL dbs (not just MySQL) will preserve the order of inserted nodes 76 # (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order. 77 # YES: Otherwise for later db types (drivers), can set order by primary key column and then order by did column 78 79 80 ######################################################################################## 61 81 62 82 # GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that it if meta or fulltext … … 179 199 # whereas $_ has the tag info. So we don't want to do $_{'docoid'}. 180 200 my %attr_hash = %_; # right way, see OAIPlugin.pm 181 $self->{'doc_oid'} = $attr_hash{'docoid'}; 201 $self->{'doc_oid'} = $attr_hash{'docoid'}; 202 print STDERR "XXXXXXXXXXXXXX in SQLPlugin::xml_start_tag()\n"; 182 203 print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n" 183 204 if $self->{'verbosity'} > 2; … … 189 210 } 190 211 191 # TODO Q: Why are there 3 passes when we're only indexing at doc and section level (2 passes)?212 # TODO Q: Why are there 4 passes when we're only indexing at doc and section level (2 passes)? What's the dummy pass, why is there a pass for infodb? 192 213 193 214 # At the end of superclass GreenstoneXMLPlugin.pm's close_document() method, … … 196 217 sub close_document { 197 218 my $self = shift(@_); 219 220 print STDERR "XXXXXXXXX in SQLPlugin::close_doc()\n"; 221 222 my $gs_sql = $self->get_gssql_instance(); 198 223 199 224 my $outhandle = $self->{'outhandle'}; 200 225 my $doc_obj = $self->{'doc_obj'}; 201 # sub read() will make the db connection setting $self->{'gs_sql'} once: the first time read() 202 # is called on the GS SQLPlugin instance. 203 my $gs_sql = $self->{'gs_sql'} || return; # $self->lazy_get_gssql(); # won't want to call lazy_get_gssql() if close_doc called during (incr-)import.pl, only during buildcol.pl 204 205 # TODO: return statement skips "dummy" pass. Should we skip it or not? 206 # If we don't return, gs_sql is not set for dummy pass... 207 226 227 my $build_proc_mode = $self->{'processor'}->get_mode(); # can be "text" as per basebuildproc or infodb 208 228 my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag() 209 print $outhandle "++++ OID of document (meta|text) to be read in from DB: $oid\n" 229 my $proc_mode = $self->{'process_mode'}; 230 231 print $outhandle "++++ OID of document (meta|text) to be del or read in from DB: ".$self->{'doc_oid'}."\n" 210 232 if $self->{'verbosity'} > 2; 211 233 212 234 # For now, we have access to doc_obj (until just before super::close_document() terminates) 213 235 214 236 # no need to call $self->{'doc_obj'}->set_OID($oid); 215 237 # because either the OID is stored in the SQL db as meta 'Identifier' alongside other metadata 216 238 # or it's stored in the doc.xml as metadata 'Identifier' alongside other metadata 217 239 # Either way, Identifier meta will be read into the docobj automatically with other meta. 218 219 my $proc_mode = $self->{'process_mode'}; 220 if($proc_mode eq "all" || $proc_mode eq "meta_only") { 221 # read in meta for the collection (i.e. select * from <col>_metadata table 222 223 my $sth = $gs_sql->select_from_metatable_matching_docid($oid); 224 print $outhandle "### SQL select stmt: ".$sth->{'Statement'}."\n" 225 if $self->{'verbosity'} > 2; 226 227 print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2; 228 # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/ 229 while( my @row = $sth->fetchrow_array() ) { 230 #print $outhandle "row: @row\n"; 231 my ($primary_key, $did, $sid, $metaname, $metaval) = @row; 232 233 # get rid of the artificial "root" introduced in section id when saving to sql db 234 $sid =~ s@^root@@; 235 $sid = $doc_obj->get_top_section() unless $sid; 236 print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n" 240 241 if ($self->{'verbosity'} > 2) { 242 print STDERR "+++++++++++ buildproc_mode: $build_proc_mode\n"; 243 print STDERR "+++++++++++ SQLPlug proc_mode: $proc_mode\n"; 244 } 245 246 # TODO: where does reindexing take place, GreenstoneSQL -Plugout or -Plugin? 247 #if($build_proc_mode =~ m/(delete|reindex)$/) { # doc denoted by current OID has been marked for deletion or reindexing (=delete + add) 248 if($build_proc_mode =~ m/(delete)$/) { # doc denoted by current OID has been marked for deletion or reindexing (=delete + add) 249 250 # build_proc_mode could be "(infodb|text)(delete|reindex)" 251 # "...delete" or "...reindex" as per ArchivesInfPlugin 252 253 print STDERR "@@@@ DELETING DOC FROM SQL DB\n"; 254 255 if($proc_mode eq "all" || $proc_mode eq "meta_only") { 256 print STDERR "@@@@@@@@ Deleting $oid from meta table\n" if $self->{'verbosity'} > 2; 257 $gs_sql->delete_recs_from_metatable_with_docid($oid); 258 } 259 if($proc_mode eq "all" || $proc_mode eq "text_only") { 260 print STDERR "@@@@@@@@ Deleting $oid from fulltxt table\n" if $self->{'verbosity'} > 2; 261 $gs_sql->delete_recs_from_texttable_with_docid($oid); 262 } 263 264 # If we're reindexing the current doc, we will we want to continue: which 265 # will add this doc ID back into the db with the new meta/full txt values 266 # But if we're deleting, then we're done processing the document, so set doc_oid to undef 267 # to prevent adding it back into db 268 #undef $self->{'doc_oid'} if($build_proc_mode =~ m/delete$/); 269 270 } # done deleting doc from SQL db 271 272 else {#if($self->{'doc_oid'}) { # if loading doc from SQL db 273 print STDERR "@@@@ LOADING DOC FROM SQL DB\n"; 274 275 if($proc_mode eq "all" || $proc_mode eq "meta_only") { 276 # read in meta for the collection (i.e. select * from <col>_metadata table 277 278 my $sth = $gs_sql->select_from_metatable_matching_docid($oid); 279 print $outhandle "### SQL select stmt: ".$sth->{'Statement'}."\n" 237 280 if $self->{'verbosity'} > 2; 238 281 239 # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly: 240 $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval)); 282 print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2; 283 # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/ 284 while( my @row = $sth->fetchrow_array() ) { 285 #print $outhandle "row: @row\n"; 286 my ($primary_key, $did, $sid, $metaname, $metaval) = @row; 287 288 # get rid of the artificial "root" introduced in section id when saving to sql db 289 $sid =~ s@^root@@; 290 $sid = $doc_obj->get_top_section() unless $sid; 291 print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n" 292 if $self->{'verbosity'} > 2; 293 294 # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly: 295 $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval)); 296 } 297 print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n" 298 if $self->{'verbosity'} > 2; 241 299 } 242 print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n" 243 if $self->{'verbosity'} > 2; 244 } 245 246 if($proc_mode eq "all" || $proc_mode eq "text_only") { 247 # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table 248 249 my $fulltxt_table = $gs_sql->get_fulltext_table_name(); 250 251 252 my $sth = $gs_sql->select_from_texttable_matching_docid($oid); 253 print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 2; 254 255 print $outhandle "----------\nSQL DB contains txt entries for-----------\n" 256 if $self->{'verbosity'} > 2; 257 while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) { 258 259 # get rid of the artificial "root" introduced in section id when saving to sql db 260 #$sid =~ s@^root@@; 261 $sid = $doc_obj->get_top_section() if ($sid eq "root"); 262 print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n" 300 301 if($proc_mode eq "all" || $proc_mode eq "text_only") { 302 # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table 303 304 my $fulltxt_table = $gs_sql->get_fulltext_table_name(); 305 306 307 my $sth = $gs_sql->select_from_texttable_matching_docid($oid); 308 print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 2; 309 310 print $outhandle "----------\nSQL DB contains txt entries for-----------\n" 263 311 if $self->{'verbosity'} > 2; 264 265 # TODO - pass by ref? 266 # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly: 267 $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text)); 268 } 269 print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n" 270 if $self->{'verbosity'} > 2; 271 } 272 312 while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) { 313 314 # get rid of the artificial "root" introduced in section id when saving to sql db 315 #$sid =~ s@^root@@; 316 $sid = $doc_obj->get_top_section() if ($sid eq "root"); 317 print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n" 318 if $self->{'verbosity'} > 2; 319 320 # TODO - pass by ref? 321 # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly: 322 $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text)); 323 } 324 print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n" 325 if $self->{'verbosity'} > 2; 326 } 327 328 } # done reading into docobj from SQL db 273 329 274 330 # don't forget to clean up on close() in superclass … … 278 334 279 335 280 # TODO: only want to work with sql db if buildcol.pl. Unfortunately, also runs on import.pl. 281 # During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side 282 # effect that if the db doesn't exist, gssql::use_db() fails, as it won't create db. 283 # Lazy connection. 284 285 # Call init() not begin() because there can be multiple plugin passes 286 # and init() should be called before all passes: 287 # one for doc level and another for section level indexing 288 # This way, we can connect to the SQL database once per buildcol run. 289 #sub init { 290 # my ($self) = shift (@_); 291 # print STDERR "@@@@@@@@@@ INIT CALLED\n"; 292 293 # $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init(). 294 295 296 sub lazy_get_gssql { 336 # We want SQLPlugin to connect to db only during buildcol.pl phase, not during import.pl 337 # This works out okay, as close_document() (called by read()) is only invoked during buildcol.pl 338 # 339 # Further, we want a single db connection for the GS SQL Plugin to be used for 340 # the multiple plugin passes: for "dummy" pass, and for doc level and for section level indexing 341 # By calling the lazy loading get_sql_instance() from close_document(), 342 # we connect to the SQL database once per GSSQLPlugin and only during the buildcol phase. 343 # 344 # get_gssql_instance() is a lazy loading method that returns singleton db connection for a GreenstoneSQLPlugin object. ("Code pattern" get instance vs singleton.) 345 # One instance of db connection that can be used for all the many doc_objects processed by this plugin 346 # 347 # Except in methods get_gssql_instance() and deinit(), don't access self->{'_gs_sql'} directly. 348 # Instead, call method get_gssql_instance() and store return value in a local variable, my $gs_sql 349 # 350 sub get_gssql_instance 351 { 297 352 my $self = shift(@_); 298 353 … … 301 356 # if we couldn't succeed connecting on any connection attempt 302 357 303 return $self->{' gs_sql'} if($self->{'gs_sql'});358 return $self->{'_gs_sql'} if($self->{'_gs_sql'}); 304 359 305 360 # assume we'll fail to connect … … 355 410 356 411 # store db handle now that we're connected 357 $self->{' gs_sql'} = $gs_sql;412 $self->{'_gs_sql'} = $gs_sql; 358 413 return $gs_sql; 359 414 … … 370 425 print STDERR "@@@@@@@@@@ GreenstoneSQLPlugin::DEINIT CALLED\n"; 371 426 372 if($self->{' gs_sql'}) { # only want to work with sql db if buildcol.pl, gs_sql won't have427 if($self->{'_gs_sql'}) { # only want to work with sql db if buildcol.pl, gs_sql won't have 373 428 # a value except during buildcol, so when processor =~ m/buildproc$/. 374 $self->{' gs_sql'}->disconnect_from_db()429 $self->{'_gs_sql'}->disconnect_from_db() 375 430 || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n"); 376 431 377 432 # explicitly set to undef so all future use has to make the connection again 378 undef $self->{' gs_sql'};433 undef $self->{'_gs_sql'}; 379 434 } 380 435 … … 382 437 } 383 438 384 # TODO: This can't work until GSSQLPlugout has implemented build_mode = incremental 385 # (instead of tossing away db on every build) 386 # then this method needs to undef $self->docid after deleting, and close_doc() has to 387 # just return if $self->docid undefined 388 389 sub read { 390 my $self = shift (@_); 391 392 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 393 394 my $rv = $self->SUPER::read(@_); # defined in ReadXMLFile inherited by superclass GS XML PLugin 395 396 if(defined $rv) { # undef if !can_proc_this_file, but -1 if failed to parse docsql.xml 397 398 # don't want to do any GreenstoneSQLPlugin DB stuff during import.pl 399 # only during in buildcol.pl 400 return if (ref($processor) !~ m/buildproc$/i); 401 402 # we know we're buildcol, let's proceed: 403 404 # make the connection once for the life of the plugin, not once for every doc 405 # so that we can disconnect at the very end of the plugin's life: on deinit() 406 # If we hadn't connected before, connect now 407 my $gs_sql = $self->{'gs_sql'} || $self->lazy_get_gssql(); # TODO which syntax best? 408 409 my $build_proc_mode = $processor->get_mode(); # can be "text" as per basebuildproc or 410 # "textdelete" or "textreindex" as per ArchivesInfPlugin 411 if($build_proc_mode =~ m/\.delete/) { 412 413 # NOTTODO: add current doc OID stored in $self->{'doc_oid'} to list of oids get rid 414 # of from table(s) entries. We'll do the actual deletion in deinit?? Since that's 415 # when ArchivesInfPlugin deletes the docsql.xml files 416 417 my $doc_oid = $self->{'doc_oid'}; 418 #my @delete_docids = $self->{'delete_docids'}; 419 #push (@delete_docids, $doc_oid); 420 421 my $proc_mode = $self->{'process_mode'}; 422 if($proc_mode eq "all" || $proc_mode eq "meta_only") { 423 print STDERR "@@@@@@@@ Deleting $doc_oid from meta table\n"; 424 $gs_sql->delete_recs_from_metatable_with_docid($doc_oid); 425 } 426 if($proc_mode eq "all" || $proc_mode eq "text_only") { 427 print STDERR "@@@@@@@@ Deleting $doc_oid from fulltxt table\n"; 428 $gs_sql->delete_recs_from_texttable_with_docid($doc_oid); 429 } 430 } 431 } 432 433 return $rv; 434 435 } 436 439 440 441 -
main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm
r32540 r32555 347 347 } 348 348 349 # GreenstoneSQLPlugout needs to know whether we're doing removeold or not 350 sub set_incremental_options { 351 my $self= shift (@_); 352 my ($removeold, $keepold, $incremental, $incremental_mode) = @_; 353 354 $self->{'removeold'} = $removeold; 355 $self->{'keepold'} = $keepold; 356 $self->{'incremental'} = $incremental; 357 $self->{'incremental_mode'} = $incremental_mode; 358 } 359 349 360 # OIDtype may be "hash" or "hash_on_full_filename" or "incremental" or "filename" or "dirname" or "full_filename" or "assigned" 350 361 sub set_OIDtype { -
main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm
r32543 r32555 139 139 #print STDERR "@@@@ db_driver: " . $self->{'db_driver'} . "\n"; 140 140 #print STDERR "@@@@ proc_mode: " . $self->{'process_mode'} . "\n"; 141 142 ########### TODO: deal with build mode #########143 144 $self->{'build_mode'} = "removeold";145 141 146 142 ############ LOAD NECESSARY OPTIONS ########### … … 173 169 174 170 my $db_name = $self->{'site_name'} || "greenstone2"; # one database per GS3 site, for GS2 the db is called greenstone2 175 my $build_mode = $self->{'build_mode'} || "removeold"; 171 my $build_mode = ($self->{'removeold'}) ? "removeold" : "incremental"; 172 print STDERR "@@@@@@@@@@@@ remove_old: $build_mode\n"; 173 176 174 if(!$gs_sql->load_db_and_tables($db_name, $build_mode)) { 177 175
Note:
See TracChangeset
for help on using the changeset viewer.