Changeset 32542 for main/trunk


Ignore:
Timestamp:
2018-10-25T21:17:02+13:00 (5 years ago)
Author:
ak19
Message:

Instead of the docoid being stored in the docsql-<OID>.xml filename, all filenames produced are back to being docsql.xml, but the root element Archive now contains the doc oid as an attribute: <Archive docoid="oid">

Location:
main/trunk/greenstone2/perllib
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm

    r32541 r32542  
    11###########################################################################
    22#
    3 # GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql-<OID>.xml
     3# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml
    44# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
    55# the docsql .xml files.
     
    4242# Ask about docsql naming convention adopted to identify OID. Better way?
    4343# collection names -> table names: it seems hyphens not allowed. Changed to underscores.
    44 # - Startup parameters
     44# + Startup parameters (except removeold/build_mode)
    4545# - incremental building: where do we need to add code to delete rows from our sql table after
    4646# incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta?
     
    6363# won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is
    6464
     65# TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking, basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work.
     66# Discuss the plugin/plugout parameters.
     67
    6568sub BEGIN {
    6669    @GreenstoneSQLPlugin::ISA = ('GreenstoneXMLPlugin');
     
    6972# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin
    7073# So we won't have a process exp conflict here.
     74# The structure of docsql.xml files is identical to doc.xml and the contents are similar except:
     75#   - since metadata and/or fulltxt are stored in mysql db instead, just XML comments indicating
     76#   this are left inside docsql.xml within the <Description> (for meta) and/or <Content> (for txt)
     77#   - the root element Archive now has a docoid attribute: <Archive docoid="OID">
    7178sub get_default_process_exp {
    7279    my $self = shift (@_);
    7380
    74     #return q^(?i)docsql(-\d+)?\.xml$^;
    75     return q^(?i)docsql(-.+)?\.xml$^;
     81    return q^(?i)docsql(-\d+)?\.xml$^; # regex based on this method in GreenstoneXMLPlugin
     82    #return q^(?i)docsql(-.+)?\.xml$^; # no longer storing the OID embedded in docsql .xml filename
    7683}
    7784
     
    156163}
    157164
     165sub xml_start_tag {
     166    my $self = shift(@_);
     167    my ($expat, $element) = @_;
     168
     169    my $outhandle = $self->{'outhandle'};
     170   
     171    $self->{'element'} = $element;
     172    if ($element eq "Archive") { # docsql.xml files contain a docoid attribute on the Archive element
     173    # the element's attributes are in %_ as per ReadXMLFile::xml_start_tag() (while $_
     174    # contains the tag)
     175
     176    # Don't access %_{'docoid'} directly: keep getting a warning message to
     177    # use $_{'docoid'} for scalar contexts, but %_ is the element's attr hashmap
     178    # whereas $_ has the tag info. So we don't want to do $_{'docoid'}.
     179    my %attr_hash = %_; # right way, see OAIPlugin.pm
     180    $self->{'doc_oid'} = $attr_hash{'docoid'}; 
     181    print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n"
     182        if $self->{'verbosity'} > 1;
     183
     184    }
     185    else { # let superclass GreenstoneXMLPlugin continue to process <Section> and <Metadata> elements
     186    $self->SUPER::xml_start_tag(@_);
     187    }
     188}
    158189
    159190# TODO Q: Why are there 3 passes when we're only indexing at doc and section level (2 passes)?
     
    169200    my $gs_sql = $self->{'gs_sql'};
    170201
    171     my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub read()
     202    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag()
    172203    print $outhandle "==== OID of document (meta|text) to be read in from DB: $oid\n"
    173204    if $self->{'verbosity'} > 1;
     
    325356}
    326357
    327 sub read {
    328     my $self = shift (@_); 
    329  
    330     my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    331 
    332     # when running buildcol.pl, the filename should match "docsql-<OID>.xml"
    333     # when running import.pl it will be the original document's filename
    334     # we only want to read in from db when running buildcol.pl
    335 
    336     # doc_obj doesn't exist yet and only exists during super::read(): a new doc (doc_obj)
    337     # is created in super::open_document() and is made undef again on super::close_document().
    338     # Further, can't read it in from doc.xml to work out which OID to query in sql db:
    339     # even if we got access to doc_obj, if no meta stored in docsql.xml, then when
    340     # doc_obj is read in from docsql.xml there will be no OID. So the docsql .xml filename
    341     # contains the OID. Having extracted OID from the filename, store OID in plugin-self
    342     if($file =~ m/docsql-(.+?)\.xml$/) {
    343 
    344     # work out docoid from filename of form "docsql-<OID>.xml". $file can have a containing
    345     # subfolder besides filename, e.g. "dir/docsql-<OID>.xml"
    346    
    347     # https://stackoverflow.com/questions/22836/how-do-i-perform-a-perl-substitution-on-a-string-while-keeping-the-original
    348     (my $oid = $file) =~ s@^(.*?)docsql-(.+?)\.xml$@$2@;
    349 
    350     $self->{'doc_oid'} = $oid;
    351     }   
    352 
    353    
    354     # always read docsql.xml, as we then know doc structure, and assoc files are dealt with
    355     # Plus we need to read docsql.xml if either meta or fulltxt went into there instead of to sql db
    356     return $self->SUPER::read(@_); # will open_doc, close_doc then process doc_obj for indexing, then undef doc_obj
    357 
    358    
    359 }
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm

    r32541 r32542  
    5252# It's fine: the die() stmts all take place before setting up the super class' begin
    5353
     54# TODO Q: about build_mode: how to detect removeold
    5455# TODO: deal with -removeold and everything? Or type out instructions for user
    5556
    5657# TODO Q: what is "group" in GreenstoneXMLPlugout?
    5758# TODO Q: site_name only exists for GS3. What about GS2?
     59
    5860
    5961my $process_mode_list =
     
    216218}
    217219
    218 # produce files called docsql-<OID>.xml instead of doc.xml
     220# Produce files called docsql.xml instead of doc.xml
    219221sub get_doc_xml_filename {
    220222    my $self = shift (@_);
    221223    my ($doc_obj) = @_;
    222224   
    223     my $doc_id = $doc_obj->get_OID();
    224     my $filename = $doc_id ? "docsql-".$doc_id.".xml" : "docsql.xml";
    225     return $filename;
    226 }
    227    
     225    return "docsql.xml";
     226}
     227
     228# overriding to store doc OID as attribute of top level element: <Archive docoid="oid">
     229sub output_xml_header {
     230    my $self = shift (@_);
     231    my ($outhandle, $doc_oid) = @_;
     232
     233    print $outhandle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
     234    print $outhandle "<!DOCTYPE Archive SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
     235    print $outhandle "<Archive docoid=\"$doc_oid\">\n";
     236}
     237 
    228238# TODO: check arc-inf.db for whether each entry is to be deleted/indexed/reindexed/been indexed
    229239sub saveas {
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneXMLPlugout.pm

    r32536 r32542  
    190190    # only output the header if we have started a new doc
    191191    if (!$self->is_group() || $self->{'new_doc_dir'}) {
    192     $self->output_xml_header($outhandler);
     192    $self->output_xml_header($outhandler, $doc_obj->get_OID());
    193193    }
    194194 
     
    238238sub output_xml_header {
    239239    my $self = shift (@_);
    240     my ($outhandle) = @_;
     240    my ($outhandle, $doc_oid) = @_;
    241241
    242242    print $outhandle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
Note: See TracChangeset for help on using the changeset viewer.