Changeset 32595

Show
Ignore:
Timestamp:
09.11.2018 22:33:51 (5 weeks ago)
Author:
ak19
Message:

Major tidying up: last remaining debug statements, lots of comments, removed TODO lists.

Location:
main/trunk/greenstone2/perllib
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/gsmysql.pm

    r32594 r32595  
    4141################# 
    4242 
    43 ############################## 
    44  
    45 # TODO Q: If disconnect is automatically called when object destroyed, what does that mean 
    46 # for our file-global handle object, is disconnect only called at end of perl process? 
    47 # Does that mean we don't need to explicitly call disconnect in gsmysql object's destroy during 
    48 # the GLOBAL destruction phase? 
    49 # https://perldoc.perl.org/perlobj.html#Destructors 
    50  
    51 #+ TODO: add infrastructure for db_port, AutoCommit etc 
    52 # For port, see https://stackoverflow.com/questions/2248665/perl-script-to-connect-to-mysql-server-port-3307 
    53  
    54 # + TODO: remove unnecessary warn() since PrintError is active 
    55  
    56 # + TODO: drop table if exists and create table if exists are available in MySQL. Use those cmds 
    57 # instead of always first checking for existence ourselves? Only when subclassing to specific 
    58 # mysql class? 
    59  
    60  
    61 # + TODO Q: What on cancelling a build: delete table? But what if it was a rebuild and the rebuild is cancelled (not the original build)? 
    62 # Do we create a copy of the orig database as backup, then start populating current db, and if cancelled, delete current db and RENAME backup table to current?  
    63 # https://stackoverflow.com/questions/3280006/duplicating-a-mysql-table-indexes-and-data 
    64 # BUT what if the table is HUGE? (Think of a collection with millions of docs.) Huge overhead in copying? 
    65 # The alternative is we just quit on cancel, but then: cancel could leave the table in a partial committed state, with no way of rolling back. 
    66 # Unless they do a full rebuild, which will recreate the table from scratch? 
    67 # SOLUTION-> rollback transaction on error, see https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/ 
    68 # In that case, should set AutoCommit to off on connection, and remember to commit at end. 
    69  
    70 # + TODO: Consider AutoCommit status (and Autocommit off allowing commit or rollback for GS coll build cancel) later 
    71  
    72  
    73 ############################## 
    7443 
    7544# singleton connection 
     
    7746my $ref_count = 0; 
    7847 
    79 # Need params_map keys: 
    80 # - collection_name 
    81 # - db_encoding (db content encoding) - MySQL can set this at server, db, table levels. For MySQL 
    82 # we set the enc during connect at server level. Not sure whether other DB's support it at the 
    83 # same levels. 
    84  
    85 # For connection to MySQL, need: 
    86 #  - db_driver, db_client_user, db_client_pwd, db_host, (db_port not used at present) 
    87 # So these will be parameterised, but in a hashmap, for just the connect method. 
    88  
    89 # Parameterise (one or more methods may use them): 
    90 # - db_name (which is the GS3 sitename, or "greenstone2" for GS2) 
    91  
    92 # Database access related functions 
    93 # http://g2pc1.bu.edu/~qzpeng/manual/MySQL%20Commands.htm 
    94 # https://www.guru99.com/insert-into.html 
    9548 
    9649# Add signal handlers to cleanup and disconnect from db on sudden termination, incl cancel build 
     
    11669        &gsprintf::gsprintf(STDERR, "{gsmysql.restore_backups_on_build_cancel_msg}\n"); 
    11770        print STDERR "****************************\n"; 
     71 
     72        # TODO? 
     73        # Since we'll be disconnecting (cancel -> die() -> dbi::DESTROY() -> dbi::disconnect()), 
     74        # ensure $sth->finish() called on statement handles if any fetch remnants remain 
    11875    } 
    11976    } 
     
    12279} 
    12380 
     81 
     82# Need to pass in to constructor for creating member variables: 
     83# - collection_name 
     84# - verbosity 
     85 
     86# For connection to MySQL, need: 
     87#  - db_driver, db_client_user, db_client_pwd, db_host, (db_port not used at present) 
     88# So these will be parameterised, but in a hashmap, for just the connect() method. 
     89 
     90# Parameterise (one or more methods may use them): 
     91# - db_name (which is the GS3 sitename, or "greenstone2" for GS2) 
     92 
     93# Don't need to parameterise the db_encoding (db content encoding) 
     94# This is for now an internal variable, as the Greenstone db contents are always going to be utf8 
     95# reflecting how their doc.xml counterparts should only contain utf8. 
     96#  
     97# - MySQL can set the desired db_encoding at server, db, table levels. 
     98# Not sure whether other DBs support it at the same levels. 
     99# For MySQL we set the enc during connect at server level. 
     100# 
    124101sub new 
    125102 
     
    128105    my ($params_map) = @_; 
    129106     
    130     # library_url: to be specified on the cmdline if not using a GS-included web server 
    131     # the GSDL_LIBRARY_URL env var is useful when running cmdline buildcol.pl in the linux package manager versions of GS3 
    132107     
    133108    # https://stackoverflow.com/questions/7083453/copying-a-hashref-in-perl  
     
    162137# For more on when destroy is called, see https://www.perlmonks.org/?node_id=1020920 
    163138# 
     139# However, the database is automatically disconnected by DBI's DESTROY method, which perl 
     140# calls on a perl process' termination: 
     141# 
    164142# https://metacpan.org/pod/release/TIMB/DBI-1.634_50/DBI.pm#disconnect 
    165 # "Disconnects the database from the database handle. disconnect is typically only used before exiting the program. The handle is of little use after disconnecting. 
     143# 'Disconnects the database from the database handle. disconnect is typically only used before exiting the program. The handle is of little use after disconnecting. 
    166144# 
    167145# The transaction behaviour of the disconnect method is, sadly, undefined. Some database systems (such as Oracle and Ingres) will automatically commit any outstanding changes, but others (such as Informix) will rollback any outstanding changes. Applications not using AutoCommit should explicitly call commit or rollback before calling disconnect. 
     
    171149# Generally, if you want your changes to be committed or rolled back when you disconnect, then you should explicitly call "commit" or "rollback" before disconnecting. 
    172150# 
    173 # If you disconnect from a database while you still have active statement handles (e.g., SELECT statement handles that may have more data to fetch), you will get a warning. The warning may indicate that a fetch loop terminated early, perhaps due to an uncaught error. To avoid the warning call the finish method on the active handles." 
     151# If you disconnect from a database while you still have active statement handles (e.g., SELECT statement handles that may have more data to fetch), you will get a warning. The warning may indicate that a fetch loop terminated early, perhaps due to an uncaught error. To avoid the warning call the finish method on the active handles.' 
    174152# 
    175153# 
     
    193171        # will ensure disconnection still happens. It happens, but silently. 
    194172        print STDERR "   Global Destruct Phase: DBI's own destructor will disconnect database\n"; 
     173 
     174        # When we did the disconnection manually on premature termination: 
    195175        #$_dbh_instance->disconnect or warn $_dbh_instance->errstr; 
    196176        #$_dbh_instance = undef; 
     
    211191################### BASIC DB OPERATIONS ################## 
    212192 
    213 # THE NEW DB FUNCTIONS 
    214193# NOTE: FULLTEXT is a reserved keyword in (My)SQL. So we can't name a table or any of its columns "fulltext". 
    215194# https://dev.mysql.com/doc/refman/5.5/en/keywords.html 
     
    239218 
    240219# SINGLETON METHOD # 
    241 # TODO: where should the defaults for these params be, here or in GS-SQLPlugin/Plugout? 
    242220sub _get_connection_instance 
    243221{ 
     
    247225     
    248226    return $_dbh_instance if($_dbh_instance); 
    249     # or make the connection 
     227    # or else make the connection, as happens below 
    250228 
    251229 
     
    300278    # More: https://www.oreilly.com/library/view/programming-the-perl/1565926994/re44.html 
    301279    my $autocommit = (defined $params_map->{'autocommit'}) ? $params_map->{'autocommit'} : 1; 
     280 
     281    # Useful: https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/ 
    302282     
    303283    my $dbh = DBI->connect("$connect_str", $db_user, $db_pwd, 
     
    307287                   RaiseError => 0, # off by default, but being explicit 
    308288                   AutoCommit => $autocommit, 
    309                    mysql_enable_utf8mb4 => 1 # tells MySQL to use UTF-8 for communication and tells DBD::mysql to decode the data, see https://stackoverflow.com/questions/46727362/perl-mysql-utf8mb4-issue-possible-bug  
     289                   mysql_enable_utf8mb4 => 1 # tells MySQL to use (4 byte) UTF-8 for 
     290                   # communication and tells DBD::mysql to use it to decode the data,  
     291                   # see https://stackoverflow.com/questions/46727362/perl-mysql-utf8mb4-issue-possible-bug  
    310292               }); 
    311293 
     
    328310    # To set up the db for utf8mb4, therefore,  
    329311    # the MySQL server needs to be configured for that char encoding by running the server as: 
    330     # mysql-5.7.23-linux-glibc2.12-x86_64/bin>./mysqld_safe --datadir=/Scratch/ak19/mysql/data --character_set_server=utf8mb4 
     312    # mysql/mysql-5.7.23-linux-glibc2.12-x86_64/bin>./mysqld_safe --datadir=/PATHTO/mysql/data --character_set_server=utf8mb4 
    331313    # AND when connecting to the server, we can either set mysql_enable_utf8mb4 => 1 
    332314    # as a connection option 
     
    337319    # is no more objectionable. It has the advantage of cutting out the 2 extra lines of doing 
    338320    # set NAMES '<enc>' and $dbh->{mysql_enable_utf8mb4} = 1 here. 
    339     # These lines may be preferred if more db_driver options are to be supported in future: 
    340     # then a separate method called set_db_encoding($enc) can work out what db_driver we're using 
    341     # and if mysql and enc=utfy, then it can do the following whereas it will issue other do stmts 
    342     # for other db_drivers, see https://www.perlmonks.org/?node_id=259456: 
     321    # These lines may be preferred if more db_driver options are to be supported in future? 
     322    # (see https://www.perlmonks.org/?node_id=259456) 
    343323     
    344324    #my $stmt = "set NAMES '" . $db_enc . "'"; 
     
    364344    if($ref_count == 0) { # Only commit transaction when we're about to actually disconnect, not before 
    365345     
    366     # + TODO: If AutoCommit was off, meaning transactions were on/enabled, 
     346    # If AutoCommit was off, meaning transactions were on/enabled, 
    367347    # then here is where we commit our one long transaction. 
    368348    # https://metacpan.org/pod/release/TIMB/DBI-1.634_50/DBI.pm#commit 
     
    383363# Call this method on die(), so that you're sure the perl process has disconnected from SQL db 
    384364# Disconnect from db - https://metacpan.org/pod/DBI#disconnect 
    385 # TODO: make sure to have committed or rolled back before disconnect 
    386 # and that you've call finish() on statement handles if any fetch remnants remain 
     365# Make sure to have committed or rolled back before disconnect 
    387366sub _force_disconnect_from_db { 
    388367    my $self= shift (@_); 
     
    403382 
    404383 
    405 # Load the designated database, i.e. 'use <dbname>;'. 
     384# Loads the designated database, i.e. 'use <dbname>;'. 
    406385# If the database doesn't yet exist, creates it and loads it. 
    407386# (Don't create the collection's tables yet, though) 
     
    590569# 'The q{...} style quoting used in this example avoids clashing with quotes that may be used in the SQL statement. Use the double-quote like qq{...} operator if you want to interpolate variables into the string. See "Quote and Quote-like Operators" in perlop for more details.' 
    591570# 
    592 # This method uses lazy loading to prepare the SQL insert stmt once for a table and store it, 
    593 # then execute the (stored) statement each time it's needed for that table. 
     571# Each insert method uses lazy loading to prepare its SQL insert stmt once for a table and 
     572# store it, then executes the (stored) statement each time it's needed for that table. 
    594573sub insert_row_into_metadata_table { 
    595574    my $self = shift (@_); 
     
    599578     
    600579    my $tablename = $self->get_metadata_table_name(); 
    601     my $sth = $dbh->prepare_cached(qq{INSERT INTO $tablename (did, sid, metaname, metavalue) VALUES (?, ?, ?, ?)});# || warn("Could not prepare insert statement for metadata table\n"); 
     580    my $sth = $dbh->prepare_cached(qq{INSERT INTO $tablename (did, sid, metaname, metavalue) VALUES (?, ?, ?, ?)}); 
    602581 
    603582    # Now we're ready to execute the command, unless we're only debugging 
     
    626605     
    627606    my $tablename = $self->get_fulltext_table_name(); 
    628     my $sth = $dbh->prepare_cached(qq{INSERT INTO $tablename (did, sid, fulltxt) VALUES (?, ?, ?)});# || warn("Could not prepare insert statement for fulltxt table\n"); 
     607    my $sth = $dbh->prepare_cached(qq{INSERT INTO $tablename (did, sid, fulltxt) VALUES (?, ?, ?)}); 
    629608     
    630609    # Now we're ready to execute the command, unless we're only debugging 
     
    651630#   https://metacpan.org/pod/release/TIMB/DBI-1.634_50/DBI.pm#selectall_arrayref 
    652631# 
    653 # Returns the statement handle that prepared and executed 
     632# Returns the resulting records of preparing and executing 
    654633# a "SELECT * FROM <COLL>_metadata WHERE did = $oid" SQL statement. 
    655 # Caller can call fetchrow_array() on returned statement handle, $sth 
    656634# Have to use prepare() and execute() instead of do() since do() does 
    657635# not allow for fetching result set thereafter: 
     
    682660 
    683661# See select_from_metatable_matching_docid() above. 
    684 # Returns the statement handle that prepared and executed 
    685 # a "SELECT * FROM <COLL>_metadata WHERE did = $oid" SQL statement. 
    686 # Caller can call fetchrow_array() on returned statement handle, $sth 
     662# Returns the resulting records from preparing and executing 
     663# a "SELECT * FROM <COLL>_fulltxt WHERE did = $oid" SQL statement. 
    687664sub select_from_texttable_matching_docid { 
    688665    my $self= shift (@_); 
     
    720697} 
    721698 
    722 # delete all records in metatable with specified docid 
     699# delete all records in fulltxt table with the specified docid 
    723700sub delete_recs_from_texttable_with_docid { 
    724701    my $self= shift (@_); 
     
    758735 
    759736# Attempt to make sure the name parameter (for db or table name) is acceptable syntax 
    760 # for the db in question, e.g. for mysql. For example, (My)SQL doesn't like tables or 
     737# for the db in question, e.g. for mysql. For example, MySQL doesn't like tables or 
    761738# databases with '-' (hyphens) in their names 
    762739sub sanitize_name { 
     
    768745 
    769746 
     747# MySQL has the non-standard commands CREATE TABLE IF NOT EXISTS and DROP TABLE IF EXISTS, so we use those. 
     748# See https://www.perlmonks.org/bare/?node=DBI%20Recipes 
     749#    The page further has a table_exists function that could work with proper comparison 
     750# Couldn't get the first solution at https://www.perlmonks.org/bare/?node_id=500050 to work though 
    770751# I can get my version of table_exists to work, but it's not so ideal 
    771 # Interesting that MySQL has non-standard command to CREATE TABLE IF NOT EXISTS and DROP TABLE IF EXISTS,  
    772 # see https://www.perlmonks.org/bare/?node=DBI%20Recipes 
    773 #    The page further has a table_exists function that could work with proper comparison 
    774 # TODO Q: Couldn't get the first solution at https://www.perlmonks.org/bare/?node_id=500050 to work though 
    775752sub table_exists { 
    776753    my $self = shift (@_); 
  • main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm

    r32592 r32595  
    3838 
    3939 
    40 # TODO: 
    41 # - Run TODOs here, in Plugout and in gsmysql.pm by Dr Bainbridge. 
    42 # - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that  
    43 # back in from the sql db while the remainder is to be read back in from the docsql .xml files. 
    44  
    45 # + TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking: basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work. 
    46 # Discuss the plugin/plugout parameters. 
    47  
    48 # TODO, test on windows and mac. 
    49 # Note: if parsing fails (e.g. using wrong plugout like GS XML plugout, which chokes on args intended for SQL plugout) then SQL plugin init would have already been called and done connection, but disconnect would not have been done because SQL plugin disconnect would not have been called upon parse failure. 
    50  
    51 # DONE: 
    52 # + TODO: For on cancel, add a SIGTERM handler or so to call end() 
    53 # or to explicitly call gs_sql->close_connection if $gs_sql def 
    54 # 
    55 # + TODO: Incremental delete can't work until GSSQLPlugout has implemented build_mode = incremental 
    56 # (instead of tossing away db on every build) 
    57 # + Ask about docsql naming convention adopted to identify OID. Better way? 
    58 # collection names -> table names: it seems hyphens not allowed. Changed to underscores. 
    59 # + Startup parameters (except removeold/build_mode) 
    60 # + how do we detect we're to do removeold during plugout in import.pl phase 
    61 # + incremental building: where do we need to add code to delete rows from our sql table after 
    62 # incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta? 
    63 # + Ask if I can assume that all SQL dbs (not just MySQL) will preserve the order of inserted nodes 
    64 # (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order. 
    65 # YES: Otherwise for later db types (drivers), can set order by primary key column and then order by did column 
    66 # + NOTTODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt? 
    67 #   NOT A PROBLEM: Tested to find DBI connection attempt fails immediately when MySQL server not  
    68 # running. The GLI "paralyzing" incident last time was not because of the gs sql connection code, 
    69 # but because my computer was freezing on-and-off. 
    70 # + "Courier" demo documents in lucene-sql collection: character (degree symbol) not preserved in title. Is this because we encode in utf8 when putting into db and reading back in? 
    71 # Test doc with meta and text like macron in Maori text. 
    72 # + TODO Q: During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side 
    73 # effect that if the db doesn't exist, gsmysql::use_db() fails, as it won't create db. 
    74 #   This got fixed when GSSQLPlugin stopped connecting on init(). 
    75 # 
    76 # 
    77 #+ TODO: deal with incremental vs removeold. If docs removed from import folder, then import step 
    78 # won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is. 
    79 # 
    80 # + TODO Q: is "reindex" = del from db + add to db? 
    81 # - is this okay for reindexing, or will it need to modify existing values (update table) 
    82 # - if it's okay, what does reindex need to accomplish (and how) if the OID changes because hash id produced is different? 
    83 # - delete is accomplished in GS SQL Plugin, during buildcol.pl. When should reindexing take place? 
    84 # during SQL plugout/import.pl or during plugin? If adding is done by GSSQLPlugout, does it need to 
    85 # be reimplemented in GSSQLPlugin to support the adding portion of reindexing. 
    86 # 
    87 # INCREMENTAL REBUILDING IMPLEMENTED CORRECTLY AND WORKS: 
    88 # Overriding plugins' remove_all() method covered removeold. 
    89 # Overriding plugins' remove_one() method is all I needed to do for reindex and deletion 
    90 # (incremental and non-incremental) to work. 
    91 # but doing all this needed an overhaul of gsmysql.pm and its use by the GS SQL plugin and plugout. 
    92 # - needed to correct plugin.pm::remove_some() to process all files 
    93 # - and needed to correct GreenstoneSQLPlugin::close_document() to setOID() after all 
    94 # All incremental import and buildcol worked after that: 
    95 # - deleting files and running incr-import and incr-buildcol (= "incr delete"), 
    96 # - deleting files and running incr-import and buildcol (="non-incr delete") 
    97 # - modifying meta and doing an incr rebuild 
    98 # - modifying fulltext and doing an incr rebuild 
    99 # - renaming a file forces a reindex: doc is removed from db and added back in, due to remove_one() 
    100 # - tested CSV file: adding some records, changing some records 
    101 #    + CSVPlugin test (collection csvsql) 
    102 #    + MetadataCSVPlugin test (modified collection sqltest to have metadata.csv refer to the 
    103 #      filenames of sqltest's documents) 
    104 #    + shared image test (collection shareimg): if 2 html files reference the same image, the docs 
    105 #      are indeed both reindexed if the image is modified (e.g. I replaced the image with another 
    106 #      of the same name) which in the GS SQL plugin/plugout case is that the 2 docs are deleted 
    107 #      and added in again. 
    10840 
    10941######################################################################################## 
    11042 
    111 # GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that it if meta or fulltext 
     43# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or fulltext 
    11244# is still written out to doc.xml (docsql .xml), that will be processed as usual, 
    11345# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever 
     
    14577        'desc' => "{GreenstoneSQLPlug.rollbacl_on_cancel}" } ]; 
    14678 
    147 # TODO: If subclassing gsmysql for other supporting databases and if they have different required 
     79# NOTE: If subclassing gsmysql for other supporting databases and if they have different required 
    14880# connection parameters, we can check how WordPlugin, upon detecting Word is installed, 
    14981# dynamically loads Word specific configuration options. 
     
    225157} 
    226158 
    227 # GS SQL Plugin::init() (and deinit()) is called by import.pl and also by buildcol.pl 
    228 # This means it connects and deconnects during import.pl as well. This is okay 
    229 # as removeold, which should drop the collection tables, happens during the import phase, 
    230 # calling GreenstoneSQLPlugin::and therefore also requires a db connection. 
    231 # + TODO: Eventually can try moving get_gssql_instance into gsmysql.pm? That way both GS SQL Plugin 
    232 # and Plugout would be using one connection during import.pl phase when both plugs exist. 
    233  
    234159# Call init() not begin() because there can be multiple plugin passes and begin() called for 
    235160# each pass (one for doc level and another for section level indexing), whereas init() should 
    236161# be called before any and all passes. 
    237162# This way, we can connect to the SQL database once per buildcol run. 
     163# Although now it doesn't matter, since gsmysql.pm uses the get_instance pattern to return a 
     164# singleton db connection, regardless of the number of gsmysql objects instantiated and 
     165# the number of connect() calls made on them. 
    238166sub init { 
    239167    my ($self) = shift (@_); 
    240     ##print STDERR "@@@@@@@@@@ INIT CALLED\n"; 
    241168     
    242169    $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init(). 
    243170 
    244     #################### 
    245 #    print "@@@ SITE NAME: ". $self->{'site'} . "\n" if defined $self->{'site'}; 
    246 #    print "@@@ COLL NAME: ". $ENV{'GSDLCOLLECTION'} . "\n"; 
    247  
    248 #    print STDERR "@@@@ db_pwd: " . $self->{'db_client_pwd'} . "\n"; 
    249 #    print STDERR "@@@@ user: " . $self->{'db_client_user'} . "\n"; 
    250 #    print STDERR "@@@@ db_host: " . $self->{'db_host'} . "\n"; 
    251 #    print STDERR "@@@@ db_driver: " . $self->{'db_driver'} . "\n"; 
    252     #################### 
    253171 
    254172    # create gsmysql object. 
     
    287205    # PrintError would already have displayed the warning message on load fail 
    288206    # And on die() perl will call gsmysql destroy which will ensure a disconnect() from db 
    289     #$gs_sql->force_disconnect_from_db(); 
    290207    die("Could not use db $db_name. Can't proceed.\n"); 
    291208    } 
     
    297214 
    298215 
    299 # This method also runs on import.pl if gs_sql has a value. But we just want to run it on buildcol 
     216# This method also runs on import.pl if gs_sql has a value. 
    300217# Call deinit() not end() because there can be multiple plugin passes: 
    301218# one for doc level and another for section level indexing 
    302219# and deinit() should be called after all passes 
    303220# This way, we can close the SQL database once per buildcol run. 
     221# Again, this doesn't matter because gsmysql ensures the connection 
     222# is a singleton connection instance, which connects once and disconnects once per perl process. 
    304223sub deinit { 
    305224    my ($self) = shift (@_); 
    306      
    307     ##print STDERR "@@@@@@@@@@ GreenstoneSQLPlugin::DEINIT CALLED\n"; 
    308225     
    309226    if($self->{'gs_sql'}) { 
     
    360277# but if a plugin does any stuff outside of creating doc obj, then it may need 
    361278# to clear something. 
    362 # remove the doc(s) denoted by oids from GS SQL db 
     279# In the case of GreenstoneSQL plugs: Remove the doc(s) denoted by oids from GS SQL db. 
    363280# This takes care of incremental deletes (docs marked D by ArchivesInfPlugin when building 
    364 # incrementally) as well as cases of "Non-icremental Delete", see ArchivesInfPlugin.pm 
     281# incrementally) as well as cases of "Non-incremental Delete", see ArchivesInfPlugin.pm. 
     282# As well as cases involving reindexing, which are implemented here as delete followed by add. 
    365283sub remove_one { 
    366284    my $self = shift (@_); 
     
    379297     
    380298    my $gs_sql = $self->{'gs_sql'} || return 0; # couldn't make the connection or no db etc 
    381  
    382     print STDERR "*****************************\nAsked to remove_one oid\n***********************\n"; 
    383     print STDERR "Num oids: " . scalar (@$oids) . "\n"; 
    384299     
    385300    my $proc_mode = $self->{'process_mode'}; 
     
    431346# Since all passes need the doc_obj, all are read in from docsql + SQL db into the docobj in memory 
    432347 
    433 # We should only ever get here during the buildcol.pl phase 
     348# We only ever get here or do any parsing of the docsql.xml file during the buildcol.pl phase. 
    434349# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method, 
    435350# the doc_obj in memory is processed (indexed) and then made undef. 
     
    438353    my $self = shift(@_); 
    439354 
    440     ##print STDERR "XXXXXXXXX in SQLPlugin::close_doc()\n"; 
    441      
    442355    my $gs_sql = $self->{'gs_sql'}; 
    443356     
     
    468381     
    469382    print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2; 
    470     # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/ 
    471383 
    472384    foreach my $row (@$records) { 
    473         #print $outhandle "row: @$row\n"; 
    474385        my ($primary_key, $did, $sid, $metaname, $metaval) = @$row; 
    475386         
     
    480391        if $self->{'verbosity'} > 2; 
    481392         
    482         # + TODO:  we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly: 
    483         #$doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval)); 
    484          
    485         # data stored unescaped in db: escaping only for html/xml files, not for txt files or db 
     393        # We're only dealing with utf8 data where docobj is concerned 
     394        # Data stored unescaped in db: escaping only for html/xml files, not for txt files or db 
    486395        $doc_obj->add_utf8_metadata($sid, $metaname, $metaval); 
    487396    } 
     
    510419        print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n" 
    511420        if $self->{'verbosity'} > 2; 
    512          
    513         # TODO - pass by ref? 
    514         # + TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly: 
    515         # data stored unescaped in db: escaping is only for html/xml files, not for txt files or db 
    516         #my $textref = &docprint::unescape_textref(\$text); 
     421 
     422        # We're only dealing with utf8 data where docobj is concerned 
     423        # Data stored unescaped in db: escaping is only for html/xml files, not for txt files or db 
    517424        $doc_obj->add_utf8_textref($sid, \$text); 
    518425    }    
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm

    r32592 r32595  
    3939 
    4040 
    41 # + TODO: SIGTERM rollback and disconnect? 
    42 # + TODO Q: what about verbosity for debugging, instead of current situation of printing out upon debug set at the expense of writing to db 
    43 #+ TODO Q: introduced site param to plugins and plugouts. Did I do it right? And should they have hiddengli = "yes". No longer a param 
    44 # !!!! Did I do the pass by ref in docprint's escape and unescape textref functions correctly, and how they're called here? 
    45 #   Any more optimisation I can do around this? 
    46  
    47 # this plugout does not output the metadata and/or fulltxt xml to a file, 
    48 # but outputs rows into a mysql table for metadata and/or a table for fulltxt 
     41# This plugout does not output the metadata and/or fulltxt xml to a file, 
     42# but outputs rows into a MySQL db table for metadata and/or a db table for fulltxt 
     43 
    4944sub BEGIN { 
    5045    @GreenstoneSQLPlugout::ISA = ('GreenstoneXMLPlugout'); 
    5146} 
    5247 
    53 # + NOTTODO: die() statements need to be replaced with premature_termination 
    54 # which should ensure the GreenstoneXMLPlugin (group)'s stuff is closed and cleaned up SOMEHOW 
    55 # It's fine: the die() stmts all take place before setting up the super class' begin 
    56  
    57 # + TODO Q: about build_mode: how to detect removeold. Now handled by  
    58 #   GreenstoneSQLPlugout::remove_all(), which is inherited from a base plugin. 
    59 # + TODO: deal with -removeold and everything? Or type out instructions for user 
    60  
    61 # + TODO Q: what is "group" in GreenstoneXMLPlugout? 
    6248 
    6349my $process_mode_list = 
     
    141127        return bless $self, $class; 
    142128    } 
    143     #print STDERR "***** GreenstoneSQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n"; 
    144129     
    145130    return bless $self, $class; 
     
    151136    my $self= shift (@_); 
    152137     
    153     # The saveas.options 
    154     #print STDERR "@@@@ PLUGOUT db_pwd: " . $self->{'db_client_pwd'} . "\n"; 
    155     #print STDERR "@@@@ user: " . $self->{'db_client_user'} . "\n"; 
    156     #print STDERR "@@@@ db_host: " . $self->{'db_host'} . "\n"; 
    157     #print STDERR "@@@@ db_driver: " . $self->{'db_driver'} . "\n"; 
    158     #print STDERR "@@@@ proc_mode: " . $self->{'process_mode'} . "\n"; 
    159  
    160     ############ LOAD NECESSARY OPTIONS ########### 
    161     #print "@@@ plugout SITE NAME: ". $self->{'site'} . "\n" if defined $self->{'site'}; 
    162     #print STDERR "########## COLLECTION: ". $ENV{'GSDLCOLLECTION'}."\n"; 
    163  
    164     print STDERR "***** GreenstoneSQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n"; 
    165  
    166138    my $db_params = { 
    167139    'collection_name' => $ENV{'GSDLCOLLECTION'}, 
     
    192164    } 
    193165 
    194     #die("@@@@ TEST. Connected successfully. Testing gsmysql::destructor.\n"); # WORKS 
    195      
    196166    my $db_name = $self->{'site'} || "greenstone2"; # one database per GS3 site, for GS2 the db is called greenstone2 
    197167    my $proc_mode = $self->{'process_mode'}; 
     
    201171     
    202172    if($success && $proc_mode ne "text_only") { 
    203     ##print STDERR "@@@@ Ensuring meta table exists\n"; 
    204173    $success = $gs_sql->ensure_meta_table_exists(); 
    205174    }     
    206175    if($success && $proc_mode ne "meta_only") { 
    207     ##print STDERR "@@@@ Ensuring fulltxt table exists\n"; 
    208176    $success = $gs_sql->ensure_fulltxt_table_exists(); 
    209177    } 
     
    213181    # PrintError would already have displayed the warning message if loading failed.
    214182    # And on die(), perl will call gsmysql's destructor, which will ensure a disconnect() from the db
    215     #$gs_sql->force_disconnect_from_db(); # disconnect_from_db() will issue a warning on error 
    216183    die("Could not use db $db_name and/or prepare its tables. Can't proceed.\n"); 
    217184    } 
     
    221188     
    222189     
    223     # if setting up to work with sql db failed, we'd have terminated and wouldn't come up to here: 
    224     # won't bother preparing GreenstoneXMLPlugout by calling superclass' begin() 
    225     # finally, can call begin on super - important as doc.xml is opened as a group etc 
     190    # If setting up the connection to the SQL db had failed, we would have terminated before reaching here,
     191    # and so would not have bothered preparing GreenstoneXMLPlugout by calling the superclass' begin().
     192    # Now we can finally call begin() on super - important as doc.xml is opened as a group etc
    226193     
    227194    $self->SUPER::begin(@_); 
     
    261228} 
    262229  
    263 # + X TODO: check arc-inf.db for whether each entry is to be deleted/indexed/reindexed/been indexed? 
    264 # That's only for indexing, not for this step which only generates the content in archives dir 
     230# saveas() only generates the content in archives dir and in the SQL database 
    265231sub saveas { 
    266232    my $self = shift (@_); 
     
    276242    $self->{'debug_outhandle'} = $docxml_outhandler if ($self->{'debug'}); # STDOUT if debug 
    277243 
    278     # TODO: also set debugging in begin()? Then stmts creating db and tables also sent to debug out and not executed 
    279     
    280      
    281244    # 2. overriding saving behaviour to do what the superclass does (writing out doc.xml files, 
    282245    # under new name of docsql.xml, with breadcrumbs pointing to sql db) PLUS saving to sql db     
     
    308271     
    309272     
    310     # database connection is closed in end() method 
    311     # so we don't open and close over and over for each doc during a single build 
     273    # The database connection is closed once, in the end() method.
     274    # We're not opening and closing it over and over for each doc during a single build
    312275} 
    313276 
     
    324287} 
    325288 
    326 # Perl: Reading or Writing to Another Program 
    327 # https://nnc3.com/mags/Perl3/cookbook/ch16_05.htm 
    328289sub recursive_write_meta_and_text { 
    329290    my $self = shift (@_); 
     
    345306    foreach my $data (@{$section_ptr->{'metadata'}}) { 
    346307        my $meta_name = $data->[0]; 
    347         # TODO: does it need to be stored escaped, as it requires unescaping when read back in 
    348         # from db (unlike for reading back in from doc.xml) 
    349308 
    350309        # Treat db like a text file instead of an html/xml file: don't need to escape text 
    351         # going into it 
    352         #my $escaped_meta_value = &docprint::escape_text($data->[1]); 
     310        # going into it, unlike with doc(sql).xml 
    353311        my $meta_value = $data->[1]; 
    354312         
     
    359317        # filling in the values 
    360318        # OR if debugging, then it will print the SQL insert statement but not execute it 
     319        # (a behaviour following what the GS XML Plugout superclass does on debug) 
    361320         
    362321        $gs_sql->insert_row_into_metadata_table($doc_oid, $section_name, $meta_name, $meta_value, $self->{'debug'}); 
     
    368327 
    369328    # See above, no need to html-escape for db 
    370     my $section_text = $section_ptr->{'text'}; #&docprint::escape_textref(\$section_ptr->{'text'}); 
     329    my $section_text = $section_ptr->{'text'}; 
    371330     
    372     # fulltxt column can be SQL NULL. undef value gets written out as NULL: 
     331    # fulltxt column can be SQL NULL. undef value for $section_text gets written out as NULL: 
    373332    # https://stackoverflow.com/questions/12708633/which-one-represents-null-undef-or-empty-string 
    374333    # The following will do the SQL insertion