- Timestamp:
- 2018-10-25T21:17:02+13:00 (6 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm
r32541 r32542 1 1 ########################################################################### 2 2 # 3 # GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql -<OID>.xml3 # GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml 4 4 # Metadata and/or fulltext are stored in SQL db, the rest may be stored in 5 5 # the docsql .xml files. … … 42 42 # Ask about docsql naming convention adopted to identify OID. Better way? 43 43 # collection names -> table names: it seems hyphens not allowed. Changed to underscores. 44 # - Startup parameters44 # + Startup parameters (except removeold/build_mode) 45 45 # - incremental building: where do we need to add code to delete rows from our sql table after 46 46 # incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta? … … 63 63 # won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is 64 64 65 # TODO: Add public instructions on using this plugin and its plugout: start with installing mysql binary, changing pwd, running the server (and the client against it for checking, basic cmds like create and drop). Then discuss db name, table names (per coll), db cols and col types, and how the plugout and plugin work. 66 # Discuss the plugin/plugout parameters. 67 65 68 sub BEGIN { 66 69 @GreenstoneSQLPlugin::ISA = ('GreenstoneXMLPlugin'); … … 69 72 # This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin 70 73 # So we won't have a process exp conflict here. 74 # The structure of docsql.xml files is identical to doc.xml and the contents are similar except: 75 # - since metadata and/or fulltxt are stored in mysql db instead, just XML comments indicating 76 # this are left inside docsql.xml within the <Description> (for meta) and/or <Content> (for txt) 77 # - the root element Archive now has a docoid attribute: <Archive docoid="OID"> 71 78 sub get_default_process_exp { 72 79 my $self = shift (@_); 73 80 74 #return q^(?i)docsql(-\d+)?\.xml$^;75 return q^(?i)docsql(-.+)?\.xml$^;81 return q^(?i)docsql(-\d+)?\.xml$^; # regex based on this method in GreenstoneXMLPlugin 82 #return q^(?i)docsql(-.+)?\.xml$^; # no longer storing the OID embedded in docsql .xml filename 76 83 } 77 84 … … 156 163 } 157 164 165 sub xml_start_tag { 166 my $self = shift(@_); 167 my ($expat, $element) = @_; 168 169 my $outhandle = $self->{'outhandle'}; 170 171 $self->{'element'} = $element; 172 if ($element eq "Archive") { # docsql.xml files contain a OID attribute on Archive element 173 # the element's attributes are in %_ as per ReadXMLFile::xml_start_tag() (while $_ 174 # contains the tag) 175 176 # Don't access %_{'docoid'} directly: keep getting a warning message to 177 # use $_{'docoid'} for scalar contexts, but %_ is the element's attr hashmap 178 # whereas $_ has the tag info. So we don't want to do $_{'docoid'}. 179 my %attr_hash = %_; # right way, see OAIPlugin.pm 180 $self->{'doc_oid'} = $attr_hash{'docoid'}; 181 print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n" 182 if $self->{'verbosity'} > 1; 183 184 } 185 else { # let superclass GreenstoneXMLPlugin continue to process <Section> and <Metadata> elements 186 $self->SUPER::xml_start_tag(@_); 187 } 188 } 158 189 159 190 # TODO Q: Why are there 3 passes when we're only indexing at doc and section level (2 passes)? … … 169 200 my $gs_sql = $self->{'gs_sql'}; 170 201 171 my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub read()202 my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag() 172 203 print $outhandle "==== OID of document (meta|text) to be read in from DB: $oid\n" 173 204 if $self->{'verbosity'} > 1; … … 325 356 } 326 357 327 sub read {328 my $self = shift (@_);329 330 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;331 332 # when running buildcol.pl, the filename should match "docsql-<OID>.xml"333 # when running import.pl it will be the original document's filename334 # we only want to read in from db when running buildcol.pl335 336 # doc_obj doesn't exist yet and only exists during super::read(): a new doc (doc_obj)337 # is created in super::open_document() and is made undef again on super::close_document().338 # Further, can't read it in from doc.xml to work out which OID to query in sql db:339 # even if we got access to doc_obj, if no meta stored in docsql.xml, then when340 # doc_obj is read in from docsql.xml there will be no OID. So OID is docsql.xml filename341 # contains OID in filename. Having extracted OID from the filename, store OID in plugin-self342 if($file =~ m/docsql-(.+?)\.xml$/) {343 344 # work out docoid from filename of form "docsql-<OID>.xml". $file can have a containing345 # subfolder besides filename, e.g. "dir/docsql-<OID>.xml"346 347 # https://stackoverflow.com/questions/22836/how-do-i-perform-a-perl-substitution-on-a-string-while-keeping-the-original348 (my $oid = $file) =~ s@^(.*?)docsql-(.+?)\.xml$@$2@;349 350 $self->{'doc_oid'} = $oid;351 }352 353 354 # always read docsql.xml, as we then know doc structure, and assoc files are dealt with355 # Plus we need to read docsql.xml if either meta or fulltxt went into there instead of to sql db356 return $self->SUPER::read(@_); # will open_doc, close_doc then process doc_obj for indexing, then undef doc_obj357 358 359 }
Note:
See TracChangeset
for help on using the changeset viewer.