Changeset 32542

Show
Ignore:
Timestamp:
25.10.2018 21:17:02 (4 weeks ago)
Author:
ak19
Message:

Instead of the doc OID being stored in the docsql-<OID>.xml filename, all filenames produced are back to being docsql.xml, and the root element Archive now carries the doc OID as an attribute: <Archive docoid="oid">

Location:
main/trunk/greenstone2/perllib
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm

    r32541 r32542  
    11########################################################################### 
    22# 
    3 # GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql-<OID>.xml 
     3# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml 
    44# Metadata and/or fulltext are stored in SQL db, the rest may be stored in 
    55# the docsql .xml files. 
     
    4242# Ask about docsql naming convention adopted to identify OID. Better way? 
    4343# collection names -> table names: it seems hyphens not allowed. Changed to underscores. 
    44 # - Startup parameters 
     44# + Startup parameters (except removeold/build_mode) 
    4545# - incremental building: where do we need to add code to delete rows from our sql table after 
    4646# incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta? 
     
    6363# won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is 
    6464 
     65# TODO: Add public instructions on using this plugin and its plugout: start with installing the MySQL binary, changing the password, running the server (and the client against it for checking, with basic commands like create and drop). Then discuss the db name, the table names (per collection), the db columns and column types, and how the plugout and plugin work.
     66# Discuss the plugin/plugout parameters. 
     67 
    6568sub BEGIN { 
    6669    @GreenstoneSQLPlugin::ISA = ('GreenstoneXMLPlugin'); 
     
    6972# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin 
    7073# So we won't have a process exp conflict here. 
     74# The structure of docsql.xml files is identical to doc.xml and the contents are similar except: 
     75#   - since metadata and/or fulltxt are stored in mysql db instead, just XML comments indicating 
     76#   this are left inside docsql.xml within the <Description> (for meta) and/or <Content> (for txt) 
     77#   - the root element Archive now has a docoid attribute: <Archive docoid="OID"> 
    7178sub get_default_process_exp { 
    7279    my $self = shift (@_); 
    7380 
    74     #return q^(?i)docsql(-\d+)?\.xml$^; 
    75     return q^(?i)docsql(-.+)?\.xml$^; 
     81    return q^(?i)docsql(-\d+)?\.xml$^; # regex based on this method in GreenstoneXMLPlugin 
     82    #return q^(?i)docsql(-.+)?\.xml$^; # no longer storing the OID embedded in docsql .xml filename 
    7683} 
    7784 
     
    156163} 
    157164 
     165sub xml_start_tag { 
     166    my $self = shift(@_); 
     167    my ($expat, $element) = @_; 
     168 
     169    my $outhandle = $self->{'outhandle'}; 
     170     
     171    $self->{'element'} = $element; 
     172    if ($element eq "Archive") { # docsql.xml files contain a OID attribute on Archive element 
     173    # the element's attributes are in %_ as per ReadXMLFile::xml_start_tag() (while $_ 
     174    # contains the tag) 
     175 
     176    # Don't access %_{'docoid'} directly: keep getting a warning message to  
     177    # use $_{'docoid'} for scalar contexts, but %_ is the element's attr hashmap  
     178    # whereas $_ has the tag info. So we don't want to do $_{'docoid'}. 
     179    my %attr_hash = %_; # right way, see OAIPlugin.pm 
     180    $self->{'doc_oid'} = $attr_hash{'docoid'};   
     181    print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n" 
     182        if $self->{'verbosity'} > 1; 
     183 
     184    } 
     185    else { # let superclass GreenstoneXMLPlugin continue to process <Section> and <Metadata> elements 
     186    $self->SUPER::xml_start_tag(@_); 
     187    } 
     188} 
    158189 
    159190# TODO Q: Why are there 3 passes when we're only indexing at doc and section level (2 passes)? 
     
    169200    my $gs_sql = $self->{'gs_sql'}; 
    170201 
    171     my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub read()  
     202    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag() 
    172203    print $outhandle "==== OID of document (meta|text) to be read in from DB: $oid\n" 
    173204    if $self->{'verbosity'} > 1; 
     
    325356} 
    326357 
    327 sub read { 
    328     my $self = shift (@_);   
    329    
    330     my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 
    331  
    332     # when running buildcol.pl, the filename should match "docsql-<OID>.xml" 
    333     # when running import.pl it will be the original document's filename 
    334     # we only want to read in from db when running buildcol.pl 
    335  
    336     # doc_obj doesn't exist yet and only exists during super::read(): a new doc (doc_obj) 
    337     # is created in super::open_document() and is made undef again on super::close_document(). 
    338     # Further, can't read it in from doc.xml to work out which OID to query in sql db: 
    339     # even if we got access to doc_obj, if no meta stored in docsql.xml, then when 
    340     # doc_obj is read in from docsql.xml there will be no OID. So the docsql.xml filename
    341     # contains the OID. Having extracted the OID from the filename, store it in the plugin's $self
    342     if($file =~ m/docsql-(.+?)\.xml$/) { 
    343  
    344     # work out docoid from filename of form "docsql-<OID>.xml". $file can have a containing 
    345     # subfolder besides filename, e.g. "dir/docsql-<OID>.xml" 
    346      
    347     # https://stackoverflow.com/questions/22836/how-do-i-perform-a-perl-substitution-on-a-string-while-keeping-the-original 
    348     (my $oid = $file) =~ s@^(.*?)docsql-(.+?)\.xml$@$2@; 
    349  
    350     $self->{'doc_oid'} = $oid; 
    351     }     
    352  
    353      
    354     # always read docsql.xml, as we then know doc structure, and assoc files are dealt with 
    355     # Plus we need to read docsql.xml if either meta or fulltxt went into there instead of to sql db 
    356     return $self->SUPER::read(@_); # will open_doc, close_doc then process doc_obj for indexing, then undef doc_obj 
    357  
    358      
    359 } 
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm

    r32541 r32542  
    5252# It's fine: the die() stmts all take place before setting up the super class' begin 
    5353 
     54# TODO Q: about build_mode: how to detect removeold 
    5455# TODO: deal with -removeold and everything? Or type out instructions for user 
    5556 
    5657# TODO Q: what is "group" in GreenstoneXMLPlugout? 
    5758# TODO Q: site_name only exists for GS3. What about GS2? 
     59 
    5860 
    5961my $process_mode_list = 
     
    216218} 
    217219 
    218 # produce files called docsql-<OID>.xml instead of doc.xml 
     220# Produce files called docsql.xml instead of doc.xml 
    219221sub get_doc_xml_filename { 
    220222    my $self = shift (@_); 
    221223    my ($doc_obj) = @_; 
    222224     
    223     my $doc_id = $doc_obj->get_OID(); 
    224     my $filename = $doc_id ? "docsql-".$doc_id.".xml" : "docsql.xml"; 
    225     return $filename; 
    226 } 
    227     
     225    return "docsql.xml"; 
     226} 
     227 
     228# overriding to store doc OID as attribute of top level element: <Archive docoid="oid"> 
     229sub output_xml_header { 
     230    my $self = shift (@_); 
     231    my ($outhandle, $doc_oid) = @_; 
     232 
     233    print $outhandle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n"; 
     234    print $outhandle "<!DOCTYPE Archive SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n"; 
     235    print $outhandle "<Archive docoid=\"$doc_oid\">\n"; 
     236} 
     237  
    228238# TODO: check arc-inf.db for whether each entry is to be deleted/indexed/reindexed/been indexed 
    229239sub saveas { 
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneXMLPlugout.pm

    r32536 r32542  
    190190    # only output the header if we have started a new doc 
    191191    if (!$self->is_group() || $self->{'new_doc_dir'}) { 
    192     $self->output_xml_header($outhandler); 
     192    $self->output_xml_header($outhandler, $doc_obj->get_OID()); 
    193193    } 
    194194  
     
    238238sub output_xml_header { 
    239239    my $self = shift (@_); 
    240     my ($outhandle) = @_; 
     240    my ($outhandle, $doc_oid) = @_; 
    241241 
    242242    print $outhandle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";