Changeset 32536


Ignore:
Timestamp:
2018-10-24T20:11:33+13:00 (5 years ago)
Author:
ak19
Message:

First commit for reading data back in from the SQL DB. This commit introduces the new GreenstoneSQLPlugin for this purpose, which should ideally only be used during buildcol (although its init(), deinit() and read() methods are also called by import.pl). The new plugin works with GreenstoneSQLPlugout, which wrote the meta and txt to the SQL DB. There are still lots of TODOs and questions here, and some debug statements too. Some decisions also still have to be run by Dr Bainbridge. Many hardcoded values still have to be parameterised (it is not always completely clear how), and the 2 cases of sending just meta and just fulltxt to the db still have to be tested. The next commit will tidy some things up.

Location:
main/trunk/greenstone2/perllib
Files:
1 added
5 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/docprint.pm

    r32532 r32536  
    2727# Greenstone XML format.
    2828# This is used by GreenstoneXMLPlugout and doc.pm
     29# It's also used by GreenstoneSQLPlugout to output a doc_obj's meta and/or
     30# fulltxt to an SQL db, while subroutine unescape_text() is used by
     31# GreenstoneSQLPlugin for unescaping txt read back in.
    2932
    3033package docprint;
     
    117120}
    118121
     122# used by GreenstoneSQLPlugin when reading back from sqldb
     123sub unescape_text {
     124    my ($text) = @_;
     125    # special characters in the xml encoding
     126    $text =~ s/& &/&&/g;
     127    $text =~ s/&/&/g; # this has to be first...
     128    $text =~ s/&lt;/</g;
     129    $text =~ s/&gt;/>/g;
     130    $text =~ s/&quot;/"/g;
     131
     132    return $text;
     133}
     134
    1191351;
  • main/trunk/greenstone2/perllib/gssql.pm

    r32531 r32536  
    3131no strict 'subs';
    3232
    33 use GreenstoneXMLPlugout;
    34 use docprint;
    35 
    36 use DBI; # the central package for this plugout
     33use DBI; # the central package for this module used by GreenstoneSQL Plugout and Plugin
    3734
    3835# Need params_map keys:
     
    151148}
    152149
     150# will attempt to load the specified db and the <coll>_metadata and <coll>_fulltxt tables for this
     151# collection, or create any of these (db, tables) that don't yet exist. At the end
     152# it will have loaded the requested database (in MySQL: "use <db>;")
    153153sub load_db_and_tables {
    154154    my $self= shift (@_);
     
    205205}
    206206
    207 # this will terminate if the db does not exist
    208 # it will not attempt to create the requested db (nor its tables)
    209 # The upcoming GreenstoneSQLPlugin can use this.
     207# GreenstoneSQLPlugin calls this method to load an existing db.
     208# This will terminate if the db does not exist. Unlike load_db_and_tables() above, used by
     209# GreenstoneSQLPlugout, this method will not attempt to create the requested db (nor its tables)
    210210sub use_db {
    211211    my $self= shift (@_);
     
    370370}
    371371
     372# returns database handle, dbh
     373sub get_db_handle {
     374    my $self= shift (@_);
     375    return $self->{'db_handle'};
     376}
     377
    372378
    373379# I can get my version of table_exists to work, but it's not so ideal
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm

    r32533 r32536  
    4040
    4141# TODO: SIGTERM rollback and disconnect?
    42 
     42# TODO Q: what about verbosity for debugging
    4343
    4444# this plugout does not output xml to a file, but outputs rows into a mysql table
     
    146146    if(!$gs_sql->load_db_and_tables($db_name, $build_mode)) {
    147147   
    148     # This is fatal for the plugout, let's terminate here
    149     # PrintError would already have displayed the warning message on connection fail   
    150     die("Could not use db or prepare its tables. Can't proceed.\n");
     148    # This is fatal for the plugout, let's terminate here after disconnecting again
     149    # PrintError would already have displayed the warning message on load fail
     150    $gs_sql->disconnect_from_db()
     151        || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n");
     152    die("Could not use db $db_name and/or prepare its tables. Can't proceed.\n");
    151153    }
    152154
     
    184186    $self->{'gs_sql'}->disconnect_from_db() || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n"); # disconnect_from_db() will also issue a warning, but this may be clearer
    185187}
    186  
    187 # produce files called docsql.xml instead of doc.xml
    188 sub get_short_doc_file {
     188
     189# produce files called docsql-<OID>.xml instead of doc.xml
     190sub get_doc_xml_filename {
    189191    my $self = shift (@_);
    190     my ($doc_dir) = @_;
    191     return &FileUtils::filenameConcatenate($doc_dir, "docsql.xml");
    192 }
    193 
    194 # produce files called docsql.xml instead of doc.xml
    195 sub get_output_file {
    196     my $self = shift (@_);
    197     my ($doc_dir) = @_;
    198     return &FileUtils::filenameConcatenate($self->{'output_dir'}, $doc_dir, "docsql.xml");
     192    my ($doc_obj) = @_;
     193   
     194    my $doc_id = $doc_obj->get_OID();
     195    my $filename = $doc_id ? "docsql-".$doc_id.".xml" : "docsql.xml";
     196    return $filename;
    199197}
    200198   
     
    204202    my ($doc_obj, $doc_dir) = @_;
    205203
    206     print STDERR "\n\n@@@ In saveas\n\n";
     204#    print STDERR "\n\n@@@ In saveas\n\n";
    207205   
    208206    my $proc_mode = $self->{'process_mode'};
     
    291289    foreach my $data (@{$section_ptr->{'metadata'}}) {
    292290        my $meta_name = $data->[0];
     291        # TODO: does it need to be stored escaped, as it requires unescaping when read back in
     292        # from db (unlike for reading back in from doc.xml)
    293293        my $escaped_meta_value = &docprint::escape_text($data->[1]);
    294294
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneXMLPlugout.pm

    r32534 r32536  
    9494        $outhandler = $self->{'group_outhandler'};
    9595    } else {
    96         $output_file = $self->get_output_file($doc_dir); #&FileUtils::filenameConcatenate($self->{'output_dir'}, $doc_dir, "doc.xml");
     96        $output_file = &FileUtils::filenameConcatenate(
     97        $self->{'output_dir'}, $doc_dir, $self->get_doc_xml_filename($doc_obj));
    9798        # open the new handle
    9899        $self->open_xslt_pipe($output_file, $self->{'xslt_file'});
     
    134135    }
    135136    }
    136     $self->{'short_doc_file'} = $self->get_short_doc_file($doc_dir); #&FileUtils::filenameConcatenate($doc_dir, "doc.xml"); 
     137    $self->{'short_doc_file'} = &FileUtils::filenameConcatenate(
     138    $doc_dir, $self->get_doc_xml_filename($doc_obj)); 
    137139   
    138140    $self->store_output_info_reference($doc_obj);
     
    142144# can be overridden in subclasses, for instance by GreenstoneSQLPlugout, to produce a different filename
    143145# like docsql.xml
    144 sub get_short_doc_file {
    145     my $self = shift (@_);
    146     my ($doc_dir) = @_;
    147     return &FileUtils::filenameConcatenate($doc_dir, "doc.xml");
    148 }
    149 
    150 # can be overridden in subclasses, for instance by GreenstoneSQLPlugout, to produce a different filename
    151 # like docsql.xml
    152 sub get_output_file {
    153     my $self = shift (@_);
    154     my ($doc_dir) = @_;
    155     return &FileUtils::filenameConcatenate($self->{'output_dir'}, $doc_dir, "doc.xml");
     146sub get_doc_xml_filename {
     147    my $self = shift (@_);
     148    my ($doc_obj) = @_;
     149    return "doc.xml";
    156150}
    157151
     
    175169        $outhandler = $self->{'group_outhandler'};
    176170    } else {
    177         $output_file = $self->get_output_file($doc_dir); #&FileUtils::filenameConcatenate($self->{'output_dir'}, $doc_dir, "doc.xml");
     171        $output_file = $output_file = &FileUtils::filenameConcatenate(
     172        $self->{'output_dir'}, $doc_dir, $self->get_doc_xml_filename($doc_obj));
    178173        # open the new handle
    179174        $self->open_xslt_pipe($output_file, $self->{'xslt_file'});
     
    235230    }
    236231    }
    237     $self->{'short_doc_file'} = $self->get_short_doc_file($doc_dir); #&FileUtils::filenameConcatenate($doc_dir, "doc.xml"); 
     232    $self->{'short_doc_file'} = &FileUtils::filenameConcatenate(
     233    $doc_dir, $self->get_doc_xml_filename($doc_obj));
    238234   
    239235    $self->store_output_info_reference($doc_obj);   
  • main/trunk/greenstone2/perllib/strings.properties

    r32528 r32536  
    931931GreenstoneXMLPlugin.desc:Processes Greenstone Archive XML documents. Note that this plugin does no syntax checking (though the XML::Parser module tests for well-formedness). It's assumed that the Greenstone Archive files conform to their DTD.
    932932
     933GreenstoneSQLPlugin.desc:Processes the contents of a Greenstone SQL database for metadata and/or full text of documents, and processes Greenstone Archive XML documents for whichever of these is not in the database and for document structure. Note that this plugin does no syntax checking (though the XML::Parser module tests for well-formedness). It's assumed that the Greenstone Archive files conform to their DTD.
     934
    933935GISExtractor.desc:Helper extractor plugin for extracting placenames from text. Requires GIS extension to Greenstone.
    934936
Note: See TracChangeset for help on using the changeset viewer.