source: main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm@ 32595

Last change on this file since 32595 was 32595, checked in by ak19, 5 years ago

Major tidying up: last remaining debug statements, lots of comments, removed TODO lists.

File size: 17.0 KB
RevLine 
[32536]1###########################################################################
2#
[32542]3# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml
[32536]4# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
5# the docsql .xml files.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugin;
29
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use DBI;
35use docprint; # for new unescape_text() subroutine
36use GreenstoneXMLPlugin;
[32592]37use gsmysql;
[32536]38
39
40
[32555]41########################################################################################
42
[32595]43# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or fulltext
[32536]44# is still written out to doc.xml (docsql .xml), that will be processed as usual,
45# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
46# is written out by GreenstoneSQLPlugout into the SQL db).
47
48
# Set up inheritance at compile time: GreenstoneSQLPlugin is a subclass of
# GreenstoneXMLPlugin, so anything not overridden here falls through to it.
sub BEGIN {
    @GreenstoneSQLPlugin::ISA = qw(GreenstoneXMLPlugin);
}
52
53# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin
54# So we won't have a process exp conflict here.
[32542]55# The structure of docsql.xml files is identical to doc.xml and the contents are similar except:
56# - since metadata and/or fulltxt are stored in mysql db instead, just XML comments indicating
57# this are left inside docsql.xml within the <Description> (for meta) and/or <Content> (for txt)
58# - the root element Archive now has a docoid attribute: <Archive docoid="OID">
# Returns the default process_exp for this plugin: a case-insensitive pattern
# (as a string) matching filenames that end in docsql.xml or
# docsql-<digits>.xml.
# May be called as a method or as a plain function; the invocant is unused.
sub get_default_process_exp {
    my ($self) = @_;

    # regex based on this method in GreenstoneXMLPlugin
    return '(?i)docsql(-\d+)?\.xml$';
}
64
# Permitted values of the process_mode option. Each 'desc' is a resource key
# for the description of that value.
my $process_mode_list = [
    {
        'name' => 'meta_only',
        'desc' => '{GreenstoneSQLPlug.process_mode.meta_only}',
    },
    {
        'name' => 'text_only',
        'desc' => '{GreenstoneSQLPlug.process_mode.text_only}',
    },
    {
        'name' => 'all',
        'desc' => '{GreenstoneSQLPlug.process_mode.all}',
    },
];
[32536]72
# Permitted values of the rollback_on_cancel option. Each 'desc' is a resource
# key for the description of that value.
# The key of the "false" entry used to be misspelled ("rollbacl_on_cancel"),
# so it did not match the key used for the option itself.
my $rollback_on_cancel_list = [
    {
        'name' => "true",
        'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}",
    },
    {
        'name' => "false",
        'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}",
    },
];
78
[32595]79# NOTE: If subclassing gsmysql for other supporting databases and if they have different required
[32591]80# connection parameters, we can check how WordPlugin, upon detecting Word is installed,
81# dynamically loads Word specific configuration options.
# Option descriptors that this plugin adds on top of the inherited ones.
# Each entry gives the option name, the resource key of its description, its
# type, an optional default ('deft') and whether it is required ('reqd').
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no",
    },
    {
        'name' => "process_mode",
        'desc' => "{GreenstoneSQLPlug.process_mode}",
        'type' => "enum",
        'list' => $process_mode_list,
        'deft' => "all",
        'reqd' => "no",
    },
    {
        'name'      => "rollback_on_cancel",
        'desc'      => "{GreenstoneSQLPlug.rollback_on_cancel}",
        'type'      => "enum",
        'list'      => $rollback_on_cancel_list,
        'deft'      => "false",    # better default than true
        'reqd'      => "no",
        'hiddengli' => "no",
    },
    {
        'name' => "db_driver",
        'desc' => "{GreenstoneSQLPlug.db_driver}",
        'type' => "string",
        'deft' => "mysql",
        'reqd' => "yes",
    },
    {
        'name' => "db_client_user",
        'desc' => "{GreenstoneSQLPlug.db_client_user}",
        'type' => "string",
        'deft' => "root",
        'reqd' => "yes",
    },
    {
        # pwd not required: can create mysql accounts without pwd
        'name' => "db_client_pwd",
        'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no",
    },
    {
        'name' => "db_host",
        'desc' => "{GreenstoneSQLPlug.db_host}",
        'type' => "string",
        'deft' => "127.0.0.1",
        'reqd' => "yes",
    },
    {
        # NOTE: make this int? No default for port, since it's not a
        # required connection param
        'name' => "db_port",
        'desc' => "{GreenstoneSQLPlug.db_port}",
        'type' => "string",
        'reqd' => "no",
    },
];
126
# Plugin descriptor: it is pushed onto the caller's "OptList" in new() and
# bundles the plugin's name, description key and the option list above.
my $options = {
    'name'     => "GreenstoneSQLPlugin",
    'desc'     => "{GreenstoneSQLPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
132
133
[32583]134###### Methods called during buildcol and import #######
[32536]135
# Constructor.
# Registers this class in $pluginlist and appends this plugin's option
# descriptors to $hashArgOptLists (both are modified in place), lets
# GreenstoneXMLPlugin build the object, then reblesses it into $class.
# Returns the blessed plugin object.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = GreenstoneXMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    bless $self, $class;

    # Nothing further is set up here, whether or not $self->{'info_only'}
    # (pluginfo run) is set: the db connection is made later, in init().
    return $self;
}
158
[32583]159# Call init() not begin() because there can be multiple plugin passes and begin() called for
160# each pass (one for doc level and another for section level indexing), whereas init() should
161# be called before any and all passes.
162# This way, we can connect to the SQL database once per buildcol run.
[32595]163# Although now it doesn't matter, since gsmysql.pm uses the get_instance pattern to return a
164# singleton db connection, regardless of the number of gsmysql objects instantiated and
165# the number of connect() calls made on them.
# Runs the superclass init(), then connects to the SQL server, selects the
# database and stores the gsmysql object in $self->{'gs_sql'} for the other
# methods of this plugin.
# Dies if the connection cannot be made or the database cannot be used.
sub init {
    my $self = shift;

    # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml
    # (docsql .xml) files in init().
    $self->SUPER::init(@_);

    # collection name will be used for naming tables
    # (site name will be used for naming database)
    my $gs_sql = gsmysql->new({
        'collection_name' => $ENV{'GSDLCOLLECTION'},
        'verbosity'       => $self->{'verbosity'} || 0
    });

    my $connect_params = {
        'db_driver'      => $self->{'db_driver'},
        'db_client_user' => $self->{'db_client_user'},
        'db_client_pwd'  => $self->{'db_client_pwd'},
        'db_host'        => $self->{'db_host'},
        'db_port'        => $self->{'db_port'},    # undef by default, can leave as is
        # if autocommit is set, there's no rollback support
        'autocommit'     => ($self->{'rollback_on_cancel'} eq "false") ? 1 : 0
    };

    # A failed connection is fatal for this plugin. PrintError would already
    # have displayed the warning message on connection fail.
    $gs_sql->connect_to_db($connect_params)
        or die("Could not connect to db. Can't proceed.\n");

    # one database per GS3 site, for GS2 the db is called greenstone2
    my $db_name = $self->{'site'} || "greenstone2";

    # Attempt to use the db, create it if it doesn't exist (but don't create
    # the tables yet). Failure is fatal for this plugin; on die() perl will
    # call gsmysql destroy which will ensure a disconnect() from db.
    $gs_sql->use_db($db_name)
        or die("Could not use db $db_name. Can't proceed.\n");

    # store db handle now that we're connected
    $self->{'gs_sql'} = $gs_sql;
}
214
215
[32595]216# This method also runs on import.pl if gs_sql has a value.
[32583]217# Call deinit() not end() because there can be multiple plugin passes:
218# one for doc level and another for section level indexing
219# and deinit() should be called after all passes
220# This way, we can close the SQL database once per buildcol run.
[32595]221# Again, this doesn't matter because gsmysql ensures the connection
222# is a singleton connection instance, which connects once and disconnects once per perl process.
# Releases this plugin's gsmysql object, if it has one, and then runs the
# superclass deinit(), whose result is returned.
sub deinit {
    my $self = shift;

    my $has_db = $self->{'gs_sql'} ? 1 : 0;
    if ($has_db) {
        # Important to call finished(): going by the notes kept with this
        # plugin, it disconnects from the db if this is the last gsmysql
        # instance, and commits before disconnecting if rollback_on_cancel
        # is turned on (gsmysql itself is not visible from this file).
        $self->{'gs_sql'}->finished();

        # delete the key rather than set it to undef (undef would leave the
        # key in place with an undefined value), so that all future use has
        # to make the connection again
        delete $self->{'gs_sql'};
    }

    return $self->SUPER::deinit(@_);
}
241
242
243
244###### Methods only called during import.pl #####
245
[32563]246# This is called once if removeold is set with import.pl. Most plugins will do
247# nothing but if a plugin does any stuff outside of creating doc obj, then
248# it may need to clear something.
249# In the case of GreenstoneSQL plugs: this is the first time we have a chance
250# to purge the tables of the current collection from the current site's database
# Called once when import.pl runs with removeold set. All arguments are
# passed straight through to the superclass.
# After the superclass has run, this deletes the current collection's tables
# and then re-creates the ones that the configured process_mode needs.
sub remove_all {
    my $self = shift;

    $self->SUPER::remove_all(@_);

    if ($self->{'verbosity'}) {
        print STDERR " Building with removeold option set, so deleting current collection's tables if they exist\n";
    }

    # if we're in here, we'd already have run 'use database <site>;' during
    # sub init(), so we can go ahead and delete the collection's tables
    my $db = $self->{'gs_sql'};
    $db->delete_collection_tables();    # will delete them if they exist

    # make sure the tables needed for the current process_mode exist again:
    # the meta table unless text_only, the fulltxt table unless meta_only
    my $mode = $self->{'process_mode'};
    if ($mode ne "text_only") {
        $db->ensure_meta_table_exists();
    }
    if ($mode ne "meta_only") {
        $db->ensure_fulltxt_table_exists();
    }
}
274
[32582]275# This is called during import.pl per document for docs that have been deleted from the
[32563]276# collection. Most plugins will do nothing
277# but if a plugin does any stuff outside of creating doc obj, then it may need
278# to clear something.
[32595]279# In the case of GreenstoneSQL plugs: Remove the doc(s) denoted by oids from GS SQL db.
[32563]280# This takes care of incremental deletes (docs marked D by ArchivesInfPlugin when building
[32595]281# incrementally) as well as cases of "Non-incremental Delete", see ArchivesInfPlugin.pm.
282# As well as cases involving reindexing, which are implemented here as delete followed by add.
# Called per deleted document during import.pl. Removes the records of every
# OID in @$oids from the SQL tables selected by process_mode.
#
# Parameters: $file (file that was removed), $oids (array ref of doc OIDs),
#             $archivedir (unused here; passed on to the superclass).
# Returns: the superclass remove_one() result, or 0 if this plugin holds no
#          db object in $self->{'gs_sql'}.
#
# Unlike the inherited remove_one behaviour, this deliberately does NOT bail
# out when can_process_this_file($file) would fail: even if the deleted file
# is e.g. an image, its doc OID still has to be removed from the SQL db.
sub remove_one {
    my $self = shift;
    my ($file, $oids) = @_;

    my $rv = $self->SUPER::remove_one(@_);

    # debug output only at high verbosity, like the other debug prints in
    # this plugin (this trace used to be printed unconditionally)
    my $debug = ($self->{'verbosity'} || 0) > 2;
    print STDERR "@@@ IN SQLPLUG::REMOVE_ONE: $file\n" if $debug;

    # couldn't make the connection or no db etc
    my $gs_sql = $self->{'gs_sql'} or return 0;

    my $proc_mode   = $self->{'process_mode'};
    my $handle_meta = ($proc_mode eq "all" || $proc_mode eq "meta_only");
    my $handle_text = ($proc_mode eq "all" || $proc_mode eq "text_only");

    foreach my $oid (@{ $oids || [] }) {
        if ($handle_meta) {
            print STDERR "@@@@@@@@ Deleting $oid from meta table\n" if $debug;
            $gs_sql->delete_recs_from_metatable_with_docid($oid);
        }
        if ($handle_text) {
            print STDERR "@@@@@@@@ Deleting $oid from fulltxt table\n" if $debug;
            $gs_sql->delete_recs_from_texttable_with_docid($oid);
        }
    }

    return $rv;
}
313
[32583]314##### Methods called only during buildcol #####
[32563]315
# Start-tag handler, called with ($expat, $element).
# Always records the element name in $self->{'element'}. The <Archive>
# element is handled here, by storing its docoid attribute in
# $self->{'doc_oid'}; every other element is handed to the superclass.
sub xml_start_tag {
    my $self = shift;
    my ($expat, $tag) = @_;

    $self->{'element'} = $tag;

    # let superclass GreenstoneXMLPlugin continue to process <Section> and
    # <Metadata> elements
    return $self->SUPER::xml_start_tag(@_) unless $tag eq "Archive";

    # docsql.xml files contain a docoid attribute on the Archive element.
    # The element's attributes are in %_ as per ReadXMLFile::xml_start_tag()
    # (while $_ contains the tag). Copy the hash rather than subscripting it
    # in place: that is the form that avoids the warning about scalar
    # contexts, see OAIPlugin.pm.
    my %attributes = %_;
    $self->{'doc_oid'} = $attributes{'docoid'};

    my $outhandle = $self->{'outhandle'};
    print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n"
        if $self->{'verbosity'} > 2;
}
340
[32591]341# There are multiple passes processing the document (see buildcol's mode parameter description):
342# - compressing the text which may be a dummy pass for lucene/solr, wherein they still want the
343# docobj for different purposes,
344# - the pass(es) for indexing, e.g. doc/didx and section/sidx level passes
345# - and an infodb pass for processing the classifiers. This pass too needs the docobj
346# Since all passes need the doc_obj, all are read in from docsql + SQL db into the docobj in memory
[32536]347
[32595]348# We only ever get here or do any parsing of the docsql.xml file during the buildcol.pl phase.
[32536]349# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
350# the doc_obj in memory is processed (indexed) and then made undef.
351# So we have to work with doc_obj before superclass close_document() is finished.
# Called when a docsql.xml document has been parsed. Before handing over to
# the superclass close_document(), this reads the document's metadata and/or
# full text (as selected by process_mode) from the SQL db into the in-memory
# doc_obj. According to the notes kept with this plugin, the superclass then
# processes (indexes) the doc_obj and discards it, which is why all work on
# doc_obj has to happen before that call.
#
# Reads: $self->{'gs_sql'}, {'outhandle'}, {'doc_obj'}, {'doc_oid'} (stored
#        by xml_start_tag), {'process_mode'} and {'verbosity'}.
# Returns: whatever the superclass close_document() returns.
sub close_document {
    my $self = shift(@_);

    my $gs_sql    = $self->{'gs_sql'};
    my $outhandle = $self->{'outhandle'};
    my $doc_obj   = $self->{'doc_obj'};
    my $oid       = $self->{'doc_oid'};    # current doc's OID, stored during sub xml_start_tag()
    my $proc_mode = $self->{'process_mode'};
    my $debug     = $self->{'verbosity'} > 2;

    # The OID parsed out of the docsql.xml file has to be set on doc_obj.
    # Case where this was noticed: a doc in import is renamed and then
    # imported incrementally, so it is marked for reindexing (implemented by
    # this plugin as a delete followed by an add into the sql db). Unless the
    # OID is set here, the old deleted doc id (for the old doc name) is still
    # listed in the index after the incremental rebuild.
    $doc_obj->set_OID($oid);

    print STDERR " GreenstoneSQLPlugin processing doc $oid (reading into docobj from SQL db)\n"
        if $self->{'verbosity'};

    if ($proc_mode eq "all" || $proc_mode eq "meta_only") {
        # read in this doc's meta from the collection's metadata table
        my $records = $gs_sql->select_from_metatable_matching_docid($oid, $outhandle);

        print $outhandle "----------SQL DB contains meta-----------\n" if $debug;

        foreach my $row (@$records) {
            # each row is unpacked as (primary key, doc id, section id,
            # metadata name, metadata value)
            my ($primary_key, $did, $sid, $metaname, $metaval) = @$row;

            # get rid of the artificial "root" introduced in section id when
            # saving to sql db
            $sid =~ s@^root@@;
            $sid = $doc_obj->get_top_section() unless $sid;
            print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
                if $debug;

            # We're only dealing with utf8 data where docobj is concerned.
            # Data stored unescaped in db: escaping only for html/xml files,
            # not for txt files or db
            $doc_obj->add_utf8_metadata($sid, $metaname, $metaval);
        }
        print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
            if $debug;
    }

    if ($proc_mode eq "all" || $proc_mode eq "text_only") {
        # read in this doc's fulltxt from the collection's fulltxt table
        my $records = $gs_sql->select_from_texttable_matching_docid($oid, $outhandle);

        print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
            if $debug;

        foreach my $row (@$records) {
            # each row is unpacked as (primary key, doc id, section id, text)
            my ($primary_key, $did, $sid, $text) = @$row;

            # get rid of the artificial "root" introduced in section id when
            # saving to sql db
            $sid = $doc_obj->get_top_section() if ($sid eq "root");
            print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
                if $debug;

            # We're only dealing with utf8 data where docobj is concerned.
            # Data stored unescaped in db: escaping is only for html/xml
            # files, not for txt files or db
            $doc_obj->add_utf8_textref($sid, \$text);
        }
        print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
            if $debug;
    }

    # done reading into docobj from SQL db; now let the superclass finish
    # with the document (see the header comment)
    return $self->SUPER::close_document(@_);
}
436
437
[32583]4381;
Note: See TracBrowser for help on using the repository browser.