Changeset 32595


Ignore:
Timestamp:
2018-11-09T22:33:51+13:00 (3 years ago)
Author:
ak19
Message:

Major tidying up: last remaining debug statements, lots of comments, removed TODO lists.

Location:
main/trunk/greenstone2/perllib
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/gsmysql.pm

    r32594 r32595  
    4141#################
    4242
    43 ##############################
    44 
    45 # TODO Q: If disconnect is automatically called when object destroyed, what does that mean
    46 # for our file-global handle object, is disconnect only called at end of perl process?
    47 # Does that mean we don't need to explicitly call disconnect in gsmysql object's destroy during
    48 # the GLOBAL destruction phase?
    49 # https://perldoc.perl.org/perlobj.html#Destructors
    50 
    51 #+ TODO: add infrastructure for db_port, AutoCommit etc
    52 # For port, see https://stackoverflow.com/questions/2248665/perl-script-to-connect-to-mysql-server-port-3307
    53 
    54 # + TODO: remove unnecessary warn() since PrintError is active
    55 
    56 # + TODO: drop table if exists and create table if exists are available in MySQL. Use those cmds
    57 # instead of always first checking for existence ourselves? Only when subclassing to specific
    58 # mysql class?
    59 
    60 
    61 # + TODO Q: What on cancelling a build: delete table? But what if it was a rebuild and the rebuild is cancelled (not the original build)?
    62 # Do we create a copy of the orig database as backup, then start populating current db, and if cancelled, delete current db and RENAME backup table to current?
    63 # https://stackoverflow.com/questions/3280006/duplicating-a-mysql-table-indexes-and-data
    64 # BUT what if the table is HUGE? (Think of a collection with millions of docs.) Huge overhead in copying?
    65 # The alternative is we just quit on cancel, but then: cancel could leave the table in a partial committed state, with no way of rolling back.
    66 # Unless they do a full rebuild, which will recreate the table from scratch?
    67 # SOLUTION-> rollback transaction on error, see https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
    68 # In that case, should set AutoCommit to off on connection, and remember to commit at end.
    69 
    70 # + TODO: Consider AutoCommit status (and Autocommit off allowing commit or rollback for GS coll build cancel) later
    71 
    72 
    73 ##############################
    7443
    7544# singleton connection
     
    7746my $ref_count = 0;
    7847
    79 # Need params_map keys:
    80 # - collection_name
    81 # - db_encoding (db content encoding) - MySQL can set this at server, db, table levels. For MySQL
    82 # we set the enc during connect at server level. Not sure whether other DB's support it at the
    83 # same levels.
    84 
    85 # For connection to MySQL, need:
    86 #  - db_driver, db_client_user, db_client_pwd, db_host, (db_port not used at present)
    87 # So these will be parameterised, but in a hashmap, for just the connect method.
    88 
    89 # Parameterise (one or more methods may use them):
    90 # - db_name (which is the GS3 sitename, or "greenstone2" for GS2)
    91 
    92 # Database access related functions
    93 # http://g2pc1.bu.edu/~qzpeng/manual/MySQL%20Commands.htm
    94 # https://www.guru99.com/insert-into.html
    9548
    9649# Add signal handlers to cleanup and disconnect from db on sudden termination, incl cancel build
     
    11669        &gsprintf::gsprintf(STDERR, "{gsmysql.restore_backups_on_build_cancel_msg}\n");
    11770        print STDERR "****************************\n";
     71
     72        # TODO?
     73        # Since we'll be disconnecting (cancel -> die() -> dbi::DESTROY() -> dbi::disconnect()),
     74        # ensure $sth->finish() called on statement handles if any fetch remnants remain
    11875    }
    11976    }
     
    12279}
    12380
     81
     82# Need to pass in to constructor for creating member variables:
     83# - collection_name
     84# - verbosity
     85
     86# For connection to MySQL, need:
     87#  - db_driver, db_client_user, db_client_pwd, db_host, (db_port not used at present)
     88# So these will be parameterised, but in a hashmap, for just the connect() method.
     89
     90# Parameterise (one or more methods may use them):
     91# - db_name (which is the GS3 sitename, or "greenstone2" for GS2)
     92
     93# Don't need to parameterise the db_encoding (db content encoding)
     94# This is for now an internal variable, as the Greenstone db contents are always going to be utf8
     95# reflecting how their doc.xml counterparts should only contain utf8.
     96#
     97# - MySQL can set the desired db_encoding at server, db, table levels.
     98# Not sure whether other DBs support it at the same levels.
     99# For MySQL we set the enc during connect at server level.
     100#
    124101sub new
    125102
     
    128105    my ($params_map) = @_;
    129106   
    130     # library_url: to be specified on the cmdline if not using a GS-included web server
    131     # the GSDL_LIBRARY_URL env var is useful when running cmdline buildcol.pl in the linux package manager versions of GS3
    132107   
    133108    # https://stackoverflow.com/questions/7083453/copying-a-hashref-in-perl
     
    162137# For more on when destroy is called, see https://www.perlmonks.org/?node_id=1020920
    163138#
     139# However, database is automatically disconnected on DBI DESTROY method called by perl on
     140# a perl process' termination:
     141#
    164142# https://metacpan.org/pod/release/TIMB/DBI-1.634_50/DBI.pm#disconnect
    165 # "Disconnects the database from the database handle. disconnect is typically only used before exiting the program. The handle is of little use after disconnecting.
     143# 'Disconnects the database from the database handle. disconnect is typically only used before exiting the program. The handle is of little use after disconnecting.
    166144#
    167145# The transaction behaviour of the disconnect method is, sadly, undefined. Some database systems (such as Oracle and Ingres) will automatically commit any outstanding changes, but others (such as Informix) will rollback any outstanding changes. Applications not using AutoCommit should explicitly call commit or rollback before calling disconnect.
     
    171149# Generally, if you want your changes to be committed or rolled back when you disconnect, then you should explicitly call "commit" or "rollback" before disconnecting.
    172150#
    173 # If you disconnect from a database while you still have active statement handles (e.g., SELECT statement handles that may have more data to fetch), you will get a warning. The warning may indicate that a fetch loop terminated early, perhaps due to an uncaught error. To avoid the warning call the finish method on the active handles."
     151# If you disconnect from a database while you still have active statement handles (e.g., SELECT statement handles that may have more data to fetch), you will get a warning. The warning may indicate that a fetch loop terminated early, perhaps due to an uncaught error. To avoid the warning call the finish method on the active handles.'
    174152#
    175153#
     
    193171        # will ensure disconnection still happens. It happens, but silently.
    194172        print STDERR "   Global Destruct Phase: DBI's own destructor will disconnect database\n";
     173
     174        # When we did the disconnection manually on premature termination:
    195175        #$_dbh_instance->disconnect or warn $_dbh_instance->errstr;
    196176        #$_dbh_instance = undef;
     
    211191################### BASIC DB OPERATIONS ##################
    212192
    213 # THE NEW DB FUNCTIONS
    214193# NOTE: FULLTEXT is a reserved keyword in (My)SQL. So we can't name a table or any of its columns "fulltext".
    215194# https://dev.mysql.com/doc/refman/5.5/en/keywords.html
     
    239218
    240219# SINGLETON METHOD #
    241 # TODO: where should the defaults for these params be, here or in GS-SQLPlugin/Plugout?
    242220sub _get_connection_instance
    243221{
     
    247225   
    248226    return $_dbh_instance if($_dbh_instance);
    249     # or make the connection
     227    # or else make the connection, as happens below
    250228
    251229
     
    300278    # More: https://www.oreilly.com/library/view/programming-the-perl/1565926994/re44.html
    301279    my $autocommit = (defined $params_map->{'autocommit'}) ? $params_map->{'autocommit'} : 1;
     280
     281    # Useful: https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
    302282   
    303283    my $dbh = DBI->connect("$connect_str", $db_user, $db_pwd,
     
    307287                   RaiseError => 0, # off by default, but being explicit
    308288                   AutoCommit => $autocommit,
    309                    mysql_enable_utf8mb4 => 1 # tells MySQL to use UTF-8 for communication and tells DBD::mysql to decode the data, see https://stackoverflow.com/questions/46727362/perl-mysql-utf8mb4-issue-possible-bug
     289                   mysql_enable_utf8mb4 => 1 # tells MySQL to use (4 byte) UTF-8 for
     290                   # communication and tells DBD::mysql to use it to decode the data,
     291                   # see https://stackoverflow.com/questions/46727362/perl-mysql-utf8mb4-issue-possible-bug
    310292               });
    311293
     
    328310    # To set up the db for utf8mb4, therefore,
    329311    # the MySQL server needs to be configured for that char encoding by running the server as:
    330     # mysql-5.7.23-linux-glibc2.12-x86_64/bin>./mysqld_safe --datadir=/Scratch/ak19/mysql/data --character_set_server=utf8mb4
     312    # mysql/mysql-5.7.23-linux-glibc2.12-x86_64/bin>./mysqld_safe --datadir=/PATHTO/mysql/data --character_set_server=utf8mb4
    331313    # AND when connecting to the server, we can either set mysql_enable_utf8mb4 => 1
    332314    # as a connection option
     
    337319    # is no more objectionable. It has the advantage of cutting out the 2 extra lines of doing
    338320    # set NAMES '<enc>' and $dbh->{mysql_enable_utf8mb4} = 1 here.
    339     # These lines may be preferred if more db_driver options are to be supported in future:
    340     # then a separate method called set_db_encoding($enc) can work out what db_driver we're using
    341     # and if mysql and enc=utfy, then it can do the following whereas it will issue other do stmts
    342     # for other db_drivers, see https://www.perlmonks.org/?node_id=259456:
     321    # These lines may be preferred if more db_driver options are to be supported in future?
     322    # (see https://www.perlmonks.org/?node_id=259456)
    343323   
    344324    #my $stmt = "set NAMES '" . $db_enc . "'";
     
    364344    if($ref_count == 0) { # Only commit transaction when we're about to actually disconnect, not before
    365345   
    366     # + TODO: If AutoCommit was off, meaning transactions were on/enabled,
     346    # If AutoCommit was off, meaning transactions were on/enabled,
    367347    # then here is where we commit our one long transaction.
    368348    # https://metacpan.org/pod/release/TIMB/DBI-1.634_50/DBI.pm#commit
     
    383363# Call this method on die(), so that you're sure the perl process has disconnected from SQL db
    384364# Disconnect from db - https://metacpan.org/pod/DBI#disconnect
    385 # TODO: make sure to have committed or rolled back before disconnect
    386 # and that you've call finish() on statement handles if any fetch remnants remain
     365# Make sure to have committed or rolled back before disconnect
    387366sub _force_disconnect_from_db {
    388367    my $self= shift (@_);
     
    403382
    404383
    405 # Load the designated database, i.e. 'use <dbname>;'.
     384# Loads the designated database, i.e. 'use <dbname>;'.
    406385# If the database doesn't yet exist, creates it and loads it.
    407386# (Don't create the collection's tables yet, though)
     
    590569# 'The q{...} style quoting used in this example avoids clashing with quotes that may be used in the SQL statement. Use the double-quote like qq{...} operator if you want to interpolate variables into the string. See "Quote and Quote-like Operators" in perlop for more details.'
    591570#
    592 # This method uses lazy loading to prepare the SQL insert stmt once for a table and store it,
    593 # then execute the (stored) statement each time it's needed for that table.
     571# Each insert method uses lazy loading to prepare its SQL insert stmt once for a table and
     572# store it, then executes the (stored) statement each time it's needed for that table.
    594573sub insert_row_into_metadata_table {
    595574    my $self = shift (@_);
     
    599578   
    600579    my $tablename = $self->get_metadata_table_name();
    601     my $sth = $dbh->prepare_cached(qq{INSERT INTO $tablename (did, sid, metaname, metavalue) VALUES (?, ?, ?, ?)});# || warn("Could not prepare insert statement for metadata table\n");
     580    my $sth = $dbh->prepare_cached(qq{INSERT INTO $tablename (did, sid, metaname, metavalue) VALUES (?, ?, ?, ?)});
    602581
    603582    # Now we're ready to execute the command, unless we're only debugging
     
    626605   
    627606    my $tablename = $self->get_fulltext_table_name();
    628     my $sth = $dbh->prepare_cached(qq{INSERT INTO $tablename (did, sid, fulltxt) VALUES (?, ?, ?)});# || warn("Could not prepare insert statement for fulltxt table\n");
     607    my $sth = $dbh->prepare_cached(qq{INSERT INTO $tablename (did, sid, fulltxt) VALUES (?, ?, ?)});
    629608   
    630609    # Now we're ready to execute the command, unless we're only debugging
     
    651630#   https://metacpan.org/pod/release/TIMB/DBI-1.634_50/DBI.pm#selectall_arrayref
    652631#
    653 # Returns the statement handle that prepared and executed
     632# Returns the resulting records of preparing and executing
    654633# a "SELECT * FROM <COLL>_metadata WHERE did = $oid" SQL statement.
    655 # Caller can call fetchrow_array() on returned statement handle, $sth
    656634# Have to use prepare() and execute() instead of do() since do() does
    657635# not allow for fetching result set thereafter:
     
    682660
    683661# See select_from_metatable_matching_docid() above.
    684 # Returns the statement handle that prepared and executed
    685 # a "SELECT * FROM <COLL>_metadata WHERE did = $oid" SQL statement.
    686 # Caller can call fetchrow_array() on returned statement handle, $sth
     662# Returns the resulting records from preparing and executing
     663# a "SELECT * FROM <COLL>_fulltxt WHERE did = $oid" SQL statement.
    687664sub select_from_texttable_matching_docid {
    688665    my $self= shift (@_);
     
    720697}
    721698
    722 # delete all records in metatable with specified docid
     699# delete all records in fulltxt table with the specified docid
    723700sub delete_recs_from_texttable_with_docid {
    724701    my $self= shift (@_);
     
    758735
    759736# Attempt to make sure the name parameter (for db or table name) is acceptable syntax
    760 # for the db in question, e.g. for mysql. For example, (My)SQL doesn't like tables or
     737# for the db in question, e.g. for mysql. For example, MySQL doesn't like tables or
    761738# databases with '-' (hyphens) in their names
    762739sub sanitize_name {
     
    768745
    769746
     747# MySQL has the non-standard commands CREATE TABLE IF NOT EXISTS and DROP TABLE IF EXISTS; we use those.
     748# See https://www.perlmonks.org/bare/?node=DBI%20Recipes
     749#    The page further has a table_exists function that could work with proper comparison
     750# Couldn't get the first solution at https://www.perlmonks.org/bare/?node_id=500050 to work though
    770751# I can get my version of table_exists to work, but it's not so ideal
    771 # Interesting that MySQL has non-standard command to CREATE TABLE IF NOT EXISTS and DROP TABLE IF EXISTS,
    772 # see https://www.perlmonks.org/bare/?node=DBI%20Recipes
    773 #    The page further has a table_exists function that could work with proper comparison
    774 # TODO Q: Couldn't get the first solution at https://www.perlmonks.org/bare/?node_id=500050 to work though
    775752sub table_exists {
    776753    my $self = shift (@_);
  • main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm

    r32592 r32595  
    3838
    3939
    40 # TODO:
    41 # - Run TODOs here, in Plugout and in gsmysql.pm by Dr Bainbridge.
    42 # - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that
    43 # back in from the sql db while the remainder is to be read back in from the docsql .xml files.
    44 
    45 # + TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking: basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work.
    46 # Discuss the plugin/plugout parameters.
    47 
    48 # TODO, test on windows and mac.
    49 # Note: if parsing fails (e.g. using wrong plugout like GS XML plugout, which chokes on args intended for SQL plugout) then SQL plugin init would have already been called and done connection, but disconnect would not have been done because SQL plugin disconnect would not have been called upon parse failure.
    50 
    51 # DONE:
    52 # + TODO: For on cancel, add a SIGTERM handler or so to call end()
    53 # or to explicitly call gs_sql->close_connection if $gs_sql def
    54 #
    55 # + TODO: Incremental delete can't work until GSSQLPlugout has implemented build_mode = incremental
    56 # (instead of tossing away db on every build)
    57 # + Ask about docsql naming convention adopted to identify OID. Better way?
    58 # collection names -> table names: it seems hyphens not allowed. Changed to underscores.
    59 # + Startup parameters (except removeold/build_mode)
    60 # + how do we detect we're to do removeold during plugout in import.pl phase
    61 # + incremental building: where do we need to add code to delete rows from our sql table after
    62 # incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta?
    63 # + Ask if I can assume that all SQL dbs (not just MySQL) will preserve the order of inserted nodes
    64 # (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order.
    65 # YES: Otherwise for later db types (drivers), can set order by primary key column and then order by did column
    66 # + NOTTODO: when db is not running GLI is paralyzed -> can we set timeout on DBI connection attempt?
    67 #   NOT A PROBLEM: Tested to find DBI connection attempt fails immediately when MySQL server not
    68 # running. The GLI "paralyzing" incident last time was not because of the gs sql connection code,
    69 # but because my computer was freezing on-and-off.
    70 # + "Courier" demo documents in lucene-sql collection: character (degree symbol) not preserved in title. Is this because we encode in utf8 when putting into db and reading back in?
    71 # Test doc with meta and text like macron in Maori text.
    72 # + TODO Q: During import, the GS SQL Plugin is called before the GS SQL Plugout with undesirable side
    73 # effect that if the db doesn't exist, gsmysql::use_db() fails, as it won't create db.
    74 #   This got fixed when GSSQLPlugin stopped connecting on init().
    75 #
    76 #
    77 #+ TODO: deal with incremental vs removeold. If docs removed from import folder, then import step
    78 # won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is.
    79 #
    80 # + TODO Q: is "reindex" = del from db + add to db?
    81 # - is this okay for reindexing, or will it need to modify existing values (update table)
    82 # - if it's okay, what does reindex need to accomplish (and how) if the OID changes because hash id produced is different?
    83 # - delete is accomplished in GS SQL Plugin, during buildcol.pl. When should reindexing take place?
    84 # during SQL plugout/import.pl or during plugin? If adding is done by GSSQLPlugout, does it need to
    85 # be reimplemented in GSSQLPlugin to support the adding portion of reindexing.
    86 #
    87 # INCREMENTAL REBUILDING IMPLEMENTED CORRECTLY AND WORKS:
    88 # Overriding plugins' remove_all() method covered removeold.
    89 # Overriding plugins' remove_one() method is all I needed to do for reindex and deletion
    90 # (incremental and non-incremental) to work.
    91 # but doing all this needed an overhaul of gsmysql.pm and its use by the GS SQL plugin and plugout.
    92 # - needed to correct plugin.pm::remove_some() to process all files
    93 # - and needed to correct GreenstoneSQLPlugin::close_document() to setOID() after all
    94 # All incremental import and buildcol worked after that:
    95 # - deleting files and running incr-import and incr-buildcol (= "incr delete"),
    96 # - deleting files and running incr-import and buildcol (="non-incr delete")
    97 # - modifying meta and doing an incr rebuild
    98 # - modifying fulltext and doing an incr rebuild
    99 # - renaming a file forces a reindex: doc is removed from db and added back in, due to remove_one()
    100 # - tested CSV file: adding some records, changing some records
    101 #    + CSVPlugin test (collection csvsql)
    102 #    + MetadataCSVPlugin test (modified collection sqltest to have metadata.csv refer to the
    103 #      filenames of sqltest's documents)
    104 #    + shared image test (collection shareimg): if 2 html files reference the same image, the docs
    105 #      are indeed both reindexed if the image is modified (e.g. I replaced the image with another
    106 #      of the same name) which in the GS SQL plugin/plugout case is that the 2 docs are deleted
    107 #      and added in again.
    10840
    10941########################################################################################
    11042
    111 # GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that it if meta or fulltext
     43# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or fulltext
    11244# is still written out to doc.xml (docsql .xml), that will be processed as usual,
    11345# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
     
    14577        'desc' => "{GreenstoneSQLPlug.rollbacl_on_cancel}" } ];
    14678
    147 # TODO: If subclassing gsmysql for other supporting databases and if they have different required
     79# NOTE: If subclassing gsmysql for other supporting databases and if they have different required
    14880# connection parameters, we can check how WordPlugin, upon detecting Word is installed,
    14981# dynamically loads Word specific configuration options.
     
    225157}
    226158
    227 # GS SQL Plugin::init() (and deinit()) is called by import.pl and also by buildcol.pl
    228 # This means it connects and deconnects during import.pl as well. This is okay
    229 # as removeold, which should drop the collection tables, happens during the import phase,
    230 # calling GreenstoneSQLPlugin::and therefore also requires a db connection.
    231 # + TODO: Eventually can try moving get_gssql_instance into gsmysql.pm? That way both GS SQL Plugin
    232 # and Plugout would be using one connection during import.pl phase when both plugs exist.
    233 
    234159# Call init() not begin() because there can be multiple plugin passes and begin() called for
    235160# each pass (one for doc level and another for section level indexing), whereas init() should
    236161# be called before any and all passes.
    237162# This way, we can connect to the SQL database once per buildcol run.
     163# Although now it doesn't matter, since gsmysql.pm uses the get_instance pattern to return a
     164# singleton db connection, regardless of the number of gsmysql objects instantiated and
     165# the number of connect() calls made on them.
    238166sub init {
    239167    my ($self) = shift (@_);
    240     ##print STDERR "@@@@@@@@@@ INIT CALLED\n";
    241168   
    242169    $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init().
    243170
    244     ####################
    245 #    print "@@@ SITE NAME: ". $self->{'site'} . "\n" if defined $self->{'site'};
    246 #    print "@@@ COLL NAME: ". $ENV{'GSDLCOLLECTION'} . "\n";
    247 
    248 #    print STDERR "@@@@ db_pwd: " . $self->{'db_client_pwd'} . "\n";
    249 #    print STDERR "@@@@ user: " . $self->{'db_client_user'} . "\n";
    250 #    print STDERR "@@@@ db_host: " . $self->{'db_host'} . "\n";
    251 #    print STDERR "@@@@ db_driver: " . $self->{'db_driver'} . "\n";
    252     ####################
    253171
    254172    # create gsmysql object.
     
    287205    # PrintError would already have displayed the warning message on load fail
    288206    # And on die() perl will call gsmysql destroy which will ensure a disconnect() from db
    289     #$gs_sql->force_disconnect_from_db();
    290207    die("Could not use db $db_name. Can't proceed.\n");
    291208    }
     
    297214
    298215
    299 # This method also runs on import.pl if gs_sql has a value. But we just want to run it on buildcol
     216# This method also runs on import.pl if gs_sql has a value.
    300217# Call deinit() not end() because there can be multiple plugin passes:
    301218# one for doc level and another for section level indexing
    302219# and deinit() should be called after all passes
    303220# This way, we can close the SQL database once per buildcol run.
     221# Again, this doesn't matter because gsmysql ensures the connection
     222# is a singleton connection instance, which connects once and disconnects once per perl process.
    304223sub deinit {
    305224    my ($self) = shift (@_);
    306    
    307     ##print STDERR "@@@@@@@@@@ GreenstoneSQLPlugin::DEINIT CALLED\n";
    308225   
    309226    if($self->{'gs_sql'}) {
     
    360277# but if a plugin does any stuff outside of creating doc obj, then it may need
    361278# to clear something.
    362 # remove the doc(s) denoted by oids from GS SQL db
     279# In the case of GreenstoneSQL plugs: Remove the doc(s) denoted by oids from GS SQL db.
    363280# This takes care of incremental deletes (docs marked D by ArchivesInfPlugin when building
    364 # incrementally) as well as cases of "Non-icremental Delete", see ArchivesInfPlugin.pm
     281# incrementally) as well as cases of "Non-incremental Delete", see ArchivesInfPlugin.pm.
     282# As well as cases involving reindexing, which are implemented here as delete followed by add.
    365283sub remove_one {
    366284    my $self = shift (@_);
     
    379297   
    380298    my $gs_sql = $self->{'gs_sql'} || return 0; # couldn't make the connection or no db etc
    381 
    382     print STDERR "*****************************\nAsked to remove_one oid\n***********************\n";
    383     print STDERR "Num oids: " . scalar (@$oids) . "\n";
    384299   
    385300    my $proc_mode = $self->{'process_mode'};
     
    431346# Since all passes need the doc_obj, all are read in from docsql + SQL db into the docobj in memory
    432347
    433 # We should only ever get here during the buildcol.pl phase
     348# We only ever get here or do any parsing of the docsql.xml file during the buildcol.pl phase.
    434349# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
    435350# the doc_obj in memory is processed (indexed) and then made undef.
     
    438353    my $self = shift(@_);
    439354
    440     ##print STDERR "XXXXXXXXX in SQLPlugin::close_doc()\n";
    441    
    442355    my $gs_sql = $self->{'gs_sql'};
    443356   
     
    468381   
    469382    print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2;
    470     # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
    471383
    472384    foreach my $row (@$records) {
    473         #print $outhandle "row: @$row\n";
    474385        my ($primary_key, $did, $sid, $metaname, $metaval) = @$row;
    475386       
     
    480391        if $self->{'verbosity'} > 2;
    481392       
    482         # + TODO:  we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly:
    483         #$doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval));
    484        
    485         # data stored unescaped in db: escaping only for html/xml files, not for txt files or db
     393        # We're only dealing with utf8 data where docobj is concerned
     394        # Data stored unescaped in db: escaping only for html/xml files, not for txt files or db
    486395        $doc_obj->add_utf8_metadata($sid, $metaname, $metaval);
    487396    }
     
    510419        print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
    511420        if $self->{'verbosity'} > 2;
    512        
    513         # TODO - pass by ref?
    514         # + TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly:
    515         # data stored unescaped in db: escaping is only for html/xml files, not for txt files or db
    516         #my $textref = &docprint::unescape_textref(\$text);
     421
     422        # We're only dealing with utf8 data where docobj is concerned
     423        # Data stored unescaped in db: escaping is only for html/xml files, not for txt files or db
    517424        $doc_obj->add_utf8_textref($sid, \$text);
    518425    }   
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm

    r32592 r32595  
    3939
    4040
    41 # + TODO: SIGTERM rollback and disconnect?
    42 # + TODO Q: what about verbosity for debugging, instead of current situation of printing out upon debug set at the expense of writing to db
    43 #+ TODO Q: introduced site param to plugins and plugouts. Did I do it right? And should they have hiddengli = "yes". No longer a param
    44 # !!!! Did I do the pass by ref in docprint's escape and unescape textref functions correctly, and how they're called here?
    45 #   Any more optimisation I can do around this?
    46 
    47 # this plugout does not output the metadata and/or fulltxt xml to a file,
    48 # but outputs rows into a mysql table for metadata and/or a table for fulltxt
     41# This plugout does not output the metadata and/or fulltxt xml to a file,
     42# but outputs rows into a MySQL db table for metadata and/or a db table for fulltxt
     43
    4944sub BEGIN {
    5045    @GreenstoneSQLPlugout::ISA = ('GreenstoneXMLPlugout');
    5146}
    5247
    53 # + NOTTODO: die() statements need to be replaced with premature_termination
    54 # which should ensure the GreenstoneXMLPlugin (group)'s stuff is closed and cleaned up SOMEHOW
    55 # It's fine: the die() stmts all take place before setting up the super class' begin
    56 
    57 # + TODO Q: about build_mode: how to detect removeold. Now handled by
    58 #   GreenstoneSQLPlugout::remove_all(), which is inherited from a base plugin.
    59 # + TODO: deal with -removeold and everything? Or type out instructions for user
    60 
    61 # + TODO Q: what is "group" in GreenstoneXMLPlugout?
    6248
    6349my $process_mode_list =
     
    141127        return bless $self, $class;
    142128    }
    143     #print STDERR "***** GreenstoneSQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n";
    144129   
    145130    return bless $self, $class;
     
    151136    my $self= shift (@_);
    152137   
    153     # The saveas.options
    154     #print STDERR "@@@@ PLUGOUT db_pwd: " . $self->{'db_client_pwd'} . "\n";
    155     #print STDERR "@@@@ user: " . $self->{'db_client_user'} . "\n";
    156     #print STDERR "@@@@ db_host: " . $self->{'db_host'} . "\n";
    157     #print STDERR "@@@@ db_driver: " . $self->{'db_driver'} . "\n";
    158     #print STDERR "@@@@ proc_mode: " . $self->{'process_mode'} . "\n";
    159 
    160     ############ LOAD NECESSARY OPTIONS ###########
    161     #print "@@@ plugout SITE NAME: ". $self->{'site'} . "\n" if defined $self->{'site'};
    162     #print STDERR "########## COLLECTION: ". $ENV{'GSDLCOLLECTION'}."\n";
    163 
    164     print STDERR "***** GreenstoneSQLPlugout process mode = \"", $self->{'process_mode'}, "\"\n";
    165 
    166138    my $db_params = {
    167139    'collection_name' => $ENV{'GSDLCOLLECTION'},
     
    192164    }
    193165
    194     #die("@@@@ TEST. Connected successfully. Testing gsmysql::destructor.\n"); # WORKS
    195    
    196166    my $db_name = $self->{'site'} || "greenstone2"; # one database per GS3 site, for GS2 the db is called greenstone2
    197167    my $proc_mode = $self->{'process_mode'};
     
    201171   
    202172    if($success && $proc_mode ne "text_only") {
    203     ##print STDERR "@@@@ Ensuring meta table exists\n";
    204173    $success = $gs_sql->ensure_meta_table_exists();
    205174    }   
    206175    if($success && $proc_mode ne "meta_only") {
    207     ##print STDERR "@@@@ Ensuring fulltxt table exists\n";
    208176    $success = $gs_sql->ensure_fulltxt_table_exists();
    209177    }
     
    213181    # PrintError would already have displayed the warning message on load fail
    214182    # And on die() perl will call gsmysql destroy which will ensure a disconnect() from db
    215     #$gs_sql->force_disconnect_from_db(); # disconnect_from_db() will issue a warning on error
    216183    die("Could not use db $db_name and/or prepare its tables. Can't proceed.\n");
    217184    }
     
    221188   
    222189   
    223     # if setting up to work with sql db failed, we'd have terminated and wouldn't come up to here:
    224     # won't bother preparing GreenstoneXMLPlugout by calling superclass' begin()
    225     # finally, can call begin on super - important as doc.xml is opened as a group etc
     190    # If setting up the connection to the SQL db had failed, we'd have terminated before reaching
     191    # this point, and so would never have prepared GreenstoneXMLPlugout via the superclass' begin().
     192    # Now we can finally call begin() on super - important as doc.xml is opened as a group etc
    226193   
    227194    $self->SUPER::begin(@_);
     
    261228}
    262229 
    263 # + X TODO: check arc-inf.db for whether each entry is to be deleted/indexed/reindexed/been indexed?
    264 # That's only for indexing, not for this step which only generates the content in archives dir
     230# saveas() only generates the content in archives dir and in the SQL database
    265231sub saveas {
    266232    my $self = shift (@_);
     
    276242    $self->{'debug_outhandle'} = $docxml_outhandler if ($self->{'debug'}); # STDOUT if debug
    277243
    278     # TODO: also set debugging in begin()? Then stmts creating db and tables also sent to debug out and not executed
    279    
    280    
    281244    # 2. overriding saving behaviour to do what the superclass does (writing out doc.xml files,
    282245    # under new name of docsql.xml, with breadcrumbs pointing to sql db) PLUS saving to sql db   
     
    308271   
    309272   
    310     # database connection is closed in end() method
    311     # so we don't open and close over and over for each doc during a single build
     273    # The database connection is closed once, in the end() method,
     274    # so we're not opening and closing it over and over for each doc during a single build
    312275}
    313276
     
    324287}
    325288
    326 # Perl: Reading or Writing to Another Program
    327 # https://nnc3.com/mags/Perl3/cookbook/ch16_05.htm
    328289sub recursive_write_meta_and_text {
    329290    my $self = shift (@_);
     
    345306    foreach my $data (@{$section_ptr->{'metadata'}}) {
    346307        my $meta_name = $data->[0];
    347         # TODO: does it need to be stored escaped, as it requires unescaping when read back in
    348         # from db (unlike for reading back in from doc.xml)
    349308
    350309        # Treat db like a text file instead of an html/xml file: don't need to escape text
    351         # going into it
    352         #my $escaped_meta_value = &docprint::escape_text($data->[1]);
     310        # going into it, unlike with doc(sql).xml
    353311        my $meta_value = $data->[1];
    354312       
     
    359317        # filling in the values
    360318        # OR if debugging, then it will print the SQL insert statement but not execute it
     319        # (a behaviour following what the GS XML Plugout superclass does on debug)
    361320       
    362321        $gs_sql->insert_row_into_metadata_table($doc_oid, $section_name, $meta_name, $meta_value, $self->{'debug'});
     
    368327
    369328    # See above, no need to html-escape for db
    370     my $section_text = $section_ptr->{'text'}; #&docprint::escape_textref(\$section_ptr->{'text'});
     329    my $section_text = $section_ptr->{'text'};
    371330   
    372     # fulltxt column can be SQL NULL. undef value gets written out as NULL:
     331    # fulltxt column can be SQL NULL. undef value for $section_text gets written out as NULL:
    373332    # https://stackoverflow.com/questions/12708633/which-one-represents-null-undef-or-empty-string
    374333    # The following will do the SQL insertion
Note: See TracChangeset for help on using the changeset viewer.