Changeset 32536

Show
Ignore:
Timestamp:
24.10.2018 20:11:33 (4 weeks ago)
Author:
ak19
Message:

First commit to do with reading back in from the SQL DB. This commit introduces the new GreenstoneSQLPlugin for this purpose, which should ideally only be used during buildcol (but its init(), deinit() and read() methods are also called on import.pl). The new plugin works with GreenstoneSQLPlugout which wrote meta and txt to the SQL DB. Lots of TODOs and questions still here, some debug statements too. Also have to run some decisions by Dr Bainbridge. There are many hardcoded values which still have to be parameterised (not always completely sure how) and still have to test the 2 cases of sending just meta and just fulltxt to db. Next commit will tidy some things up.

Location:
main/trunk/greenstone2/perllib
Files:
1 added
5 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/docprint.pm

    r32532 r32536  
    2727# Greenstone XML format. 
    2828# This is used by GreenstoneXMLPlugout and doc.pm 
     29# It's also used by GreenstoneSQLPlugout to output a doc_obj's meta and/or 
     30# fulltxt to an SQL db, while subroutine  unescape_text() is used by 
     31# GreenstoneSQLPlugin for unescaping txt read back in. 
    2932 
    3033package docprint; 
     
    117120} 
    118121 
     122# used by GreenstoneSQLPlugin when reading back from sqldb 
     123sub unescape_text { 
     124    my ($text) = @_; 
     125    # special characters in the xml encoding 
     126    $text =~ s/&amp; &amp;/&amp;&amp;/g; 
     127    $text =~ s/&amp;/&/g; # this has to be first... 
     128    $text =~ s/&lt;/</g; 
     129    $text =~ s/&gt;/>/g; 
     130    $text =~ s/&quot;/"/g; 
     131 
     132    return $text; 
     133} 
     134 
    1191351; 
  • main/trunk/greenstone2/perllib/gssql.pm

    r32531 r32536  
    3131no strict 'subs'; 
    3232 
    33 use GreenstoneXMLPlugout; 
    34 use docprint; 
    35  
    36 use DBI; # the central package for this plugout 
     33use DBI; # the central package for this module used by GreenstoneSQL Plugout and Plugin 
    3734 
    3835# Need params_map keys: 
     
    151148} 
    152149 
     150# will attempt to load the specified db and the <coll>_metadata and <coll>_fulltxt for this 
     151# collection, or create any of these (db, tables) that don't yet exist. At the end 
     152# it will have loaded the requested database (in MySQL: "use <db>;") 
    153153sub load_db_and_tables { 
    154154    my $self= shift (@_); 
     
    205205} 
    206206 
    207 # this will terminate if the db does not exist 
    208 # it will not attempt to create the requested db (nor its tables) 
    209 # The upcoming GreenstoneSQLPlugin can use this. 
     207# GreenstoneSQLPlugin calls this method to load an existing db. 
     208# This will terminate if the db does not exist. Unlike load_db_and_tables() above, used by 
     209# GreenstoneSQLPlugout, this method will not attempt to create the requested db (nor its tables) 
    210210sub use_db { 
    211211    my $self= shift (@_); 
     
    370370} 
    371371 
     372# returns database handle, dbh 
     373sub get_db_handle { 
     374    my $self= shift (@_); 
     375    return $self->{'db_handle'}; 
     376} 
     377 
    372378 
    373379# I can get my version of table_exists to work, but it's not so ideal 
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm

    r32533 r32536  
    4040 
    4141# TODO: SIGTERM rollback and disconnect? 
    42  
     42# TODO Q: what about verbosity for debugging 
    4343 
    4444# this plugout does not output xml to a file, but outputs rows into a mysql table 
     
    146146    if(!$gs_sql->load_db_and_tables($db_name, $build_mode)) { 
    147147     
    148     # This is fatal for the plugout, let's terminate here 
    149     # PrintError would already have displayed the warning message on connection fail     
    150     die("Could not use db or prepare its tables. Can't proceed.\n"); 
     148    # This is fatal for the plugout, let's terminate here after disconnecting again 
     149    # PrintError would already have displayed the warning message on load fail 
     150    $gs_sql->disconnect_from_db() 
     151        || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n"); 
     152    die("Could not use db $db_name and/or prepare its tables. Can't proceed.\n"); 
    151153    } 
    152154 
     
    184186    $self->{'gs_sql'}->disconnect_from_db() || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n"); # disconnect_from_db() will also issue a warning, but this may be clearer 
    185187} 
    186   
    187 # produce files called docsql.xml instead of doc.xml 
    188 sub get_short_doc_file { 
     188 
     189# produce files called docsql-<OID>.xml instead of doc.xml 
     190sub get_doc_xml_filename { 
    189191    my $self = shift (@_); 
    190     my ($doc_dir) = @_; 
    191     return &FileUtils::filenameConcatenate($doc_dir, "docsql.xml"); 
    192 } 
    193  
    194 # produce files called docsql.xml instead of doc.xml 
    195 sub get_output_file { 
    196     my $self = shift (@_); 
    197     my ($doc_dir) = @_; 
    198     return &FileUtils::filenameConcatenate($self->{'output_dir'}, $doc_dir, "docsql.xml"); 
     192    my ($doc_obj) = @_; 
     193     
     194    my $doc_id = $doc_obj->get_OID(); 
     195    my $filename = $doc_id ? "docsql-".$doc_id.".xml" : "docsql.xml"; 
     196    return $filename; 
    199197} 
    200198    
     
    204202    my ($doc_obj, $doc_dir) = @_; 
    205203 
    206     print STDERR "\n\n@@@ In saveas\n\n"; 
     204#    print STDERR "\n\n@@@ In saveas\n\n"; 
    207205     
    208206    my $proc_mode = $self->{'process_mode'}; 
     
    291289    foreach my $data (@{$section_ptr->{'metadata'}}) { 
    292290        my $meta_name = $data->[0]; 
     291        # TODO: does it need to be stored escaped, as it requires unescaping when read back in 
     292        # from db (unlike for reading back in from doc.xml) 
    293293        my $escaped_meta_value = &docprint::escape_text($data->[1]); 
    294294 
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneXMLPlugout.pm

    r32534 r32536  
    9494        $outhandler = $self->{'group_outhandler'}; 
    9595    } else { 
    96         $output_file = $self->get_output_file($doc_dir); #&FileUtils::filenameConcatenate($self->{'output_dir'}, $doc_dir, "doc.xml"); 
     96        $output_file = &FileUtils::filenameConcatenate( 
     97        $self->{'output_dir'}, $doc_dir, $self->get_doc_xml_filename($doc_obj)); 
    9798        # open the new handle 
    9899        $self->open_xslt_pipe($output_file, $self->{'xslt_file'}); 
     
    134135    } 
    135136    } 
    136     $self->{'short_doc_file'} = $self->get_short_doc_file($doc_dir); #&FileUtils::filenameConcatenate($doc_dir, "doc.xml");   
     137    $self->{'short_doc_file'} = &FileUtils::filenameConcatenate( 
     138    $doc_dir, $self->get_doc_xml_filename($doc_obj));   
    137139     
    138140    $self->store_output_info_reference($doc_obj); 
     
    142144# can be overridden in subclasses, for instance by GreenstoneSQLPlugout, to produce a different filename 
    143145# like docsql.xml 
    144 sub get_short_doc_file { 
    145     my $self = shift (@_); 
    146     my ($doc_dir) = @_; 
    147     return &FileUtils::filenameConcatenate($doc_dir, "doc.xml"); 
    148 } 
    149  
    150 # can be overridden in subclasses, for instance by GreenstoneSQLPlugout, to produce a different filename 
    151 # like docsql.xml 
    152 sub get_output_file { 
    153     my $self = shift (@_); 
    154     my ($doc_dir) = @_; 
    155     return &FileUtils::filenameConcatenate($self->{'output_dir'}, $doc_dir, "doc.xml"); 
     146sub get_doc_xml_filename { 
     147    my $self = shift (@_); 
     148    my ($doc_obj) = @_; 
     149    return "doc.xml"; 
    156150} 
    157151 
     
    175169        $outhandler = $self->{'group_outhandler'}; 
    176170    } else { 
    177         $output_file = $self->get_output_file($doc_dir); #&FileUtils::filenameConcatenate($self->{'output_dir'}, $doc_dir, "doc.xml"); 
     171        $output_file = $output_file = &FileUtils::filenameConcatenate( 
     172        $self->{'output_dir'}, $doc_dir, $self->get_doc_xml_filename($doc_obj)); 
    178173        # open the new handle 
    179174        $self->open_xslt_pipe($output_file, $self->{'xslt_file'}); 
     
    235230    } 
    236231    } 
    237     $self->{'short_doc_file'} = $self->get_short_doc_file($doc_dir); #&FileUtils::filenameConcatenate($doc_dir, "doc.xml");   
     232    $self->{'short_doc_file'} = &FileUtils::filenameConcatenate( 
     233    $doc_dir, $self->get_doc_xml_filename($doc_obj)); 
    238234     
    239235    $self->store_output_info_reference($doc_obj);     
  • main/trunk/greenstone2/perllib/strings.properties

    r32528 r32536  
    931931GreenstoneXMLPlugin.desc:Processes Greenstone Archive XML documents. Note that this plugin does no syntax checking (though the XML::Parser module tests for well-formedness). It's assumed that the Greenstone Archive files conform to their DTD. 
    932932 
     933GreenstoneSQLPlugin.desc:Processes the contents of a Greenstone SQL database for metadata and/or full text of documents, and processes Greenstone Archive XML documents for the part of that that's not in the database and for document structure. Note that this plugin does no syntax checking (though the XML::Parser module tests for well-formedness). It's assumed that the Greenstone Archive files conform to their DTD. 
     934 
    933935GISExtractor.desc:Helper extractor plugin for extracting placenames from text. Requires GIS extension to Greenstone. 
    934936