source: main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm@ 32643

Last change on this file since 32643 was 32643, checked in by ak19, 5 years ago
  1. The previous commit (r32640) reintroduced an earlier bug while attempting to fix another. It brought back the bug whereby a reindex operation triggered by a doc rename (implemented as a delete followed by an add) would result in the deleted doc turning up when browsing: despite being deleted from the SQL db, a reference to its oid remained in the index. That bug had been fixed by a call to doc_obj.set_OID(), which presumably helped identify the OID of any doc marked for deletion, after which the indexing part of the process would proceed to delete it. The recent commit had tried to prevent the assignment of 2 OIDs for renamed documents (the deleted oid and the new oid) by calling set_OID() only selectively, but this reintroduced the older bug. The solution is to call set_OID() at the end, AFTER reading into the doc_obj from the SQL db, which both prevents 2 OIDs for a renamed doc and properly gets the doc deleted.
  2. A further modification reintroduces an improvement that existed in earlier, uncommitted attempts at the GS SQL Plugin. When a doc is marked for deletion, its oid no longer exists in the MySQL db after import.pl, yet during buildcol.pl the code still attempted to read the records for any and all docoids, including those marked for deletion, back in from the MySQL db. Now, if an oid is for a doc marked for deletion, the code just does doc_obj.set_OID() (as required to get the doc actually deleted from the index), calls the superclass method to let the indexing part process the doc and delete it, and then returns, skipping the attempt to read in info on that oid from the SQL db when nothing exists for it. For any non-deleted oid, the code of course continues to read in the entries from the MySQL db for that oid to reconstruct the doc_obj.
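In outline, the resulting close_document() flow described above looks like this (a simplified sketch; the full method is in the listing below and these are its actual variable and method names):

    if ($build_proc_mode =~ m/(delete)$/) {
        # doc was marked for deletion (a reindex/rename is implemented as delete + add),
        # so its records are not in the SQL db: just set the OID and let the indexer delete it
        $self->{'doc_obj'}->set_OID($oid);
        $self->SUPER::close_document(@_);
        return;
    }
    # ... otherwise read the doc's meta and/or fulltext back in from the SQL db into $doc_obj ...
    # and only THEN set the OID, so a renamed doc doesn't get two Identifier values
    $self->{'doc_obj'}->set_OID($oid);
    $self->SUPER::close_document(@_);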
File size: 18.8 KB
1###########################################################################
2#
3# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml
4# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
5# the docsql .xml files.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugin;
29
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and vice versa
33
34use DBI;
35use docprint; # for new unescape_text() subroutine
36use GreenstoneXMLPlugin;
37use gsmysql;
38
39#use unicode;
40#use Encode;
41
42########################################################################################
43
44# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or fulltext
45# is still written out to doc.xml (docsql .xml), that will be processed as usual,
46# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
47# is written out by GreenstoneSQLPlugout into the SQL db).
48
49
50sub BEGIN {
51 @GreenstoneSQLPlugin::ISA = ('GreenstoneXMLPlugin');
52}
53
54# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin
55# So we won't have a process exp conflict here.
56# The structure of docsql.xml files is identical to doc.xml and the contents are similar except:
57# - since metadata and/or fulltxt are stored in mysql db instead, just XML comments indicating
58# this are left inside docsql.xml within the <Description> (for meta) and/or <Content> (for txt)
59# - the root element Archive now has a docoid attribute: <Archive docoid="OID">
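# For illustration, a docsql.xml file therefore looks roughly like this (hypothetical docoid value;
# Section/Description/Content structure as in an ordinary doc.xml):
#
#   <Archive docoid="HASH0123456789abcdef">
#     <Section>
#       <Description>
#         <!-- metadata stored in SQL db -->
#       </Description>
#       <Content><!-- full text stored in SQL db --></Content>
#     </Section>
#   </Archive>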
60sub get_default_process_exp {
61 my $self = shift (@_);
62
63 return q^(?i)docsql(-\d+)?\.xml$^; # regex based on this method in GreenstoneXMLPlugin
64}
65
66my $process_mode_list =
67 [ { 'name' => "meta_only",
68 'desc' => "{GreenstoneSQLPlug.process_mode.meta_only}" },
69 { 'name' => "text_only",
70 'desc' => "{GreenstoneSQLPlug.process_mode.text_only}" },
71 { 'name' => "all",
72 'desc' => "{GreenstoneSQLPlug.process_mode.all}" } ];
73
74my $rollback_on_cancel_list =
75 [ { 'name' => "true",
76 'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}" },
77 { 'name' => "false",
78 'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}" } ];
79
80# NOTE: If subclassing gsmysql to support other databases and they have different required
81# connection parameters, we can check how WordPlugin, upon detecting Word is installed,
82# dynamically loads Word specific configuration options.
83my $arguments =
84 [ { 'name' => "process_exp",
85 'desc' => "{BaseImporter.process_exp}",
86 'type' => "regexp",
87 'deft' => &get_default_process_exp(),
88 'reqd' => "no" },
89 { 'name' => "process_mode",
90 'desc' => "{GreenstoneSQLPlug.process_mode}",
91 'type' => "enum",
92 'list' => $process_mode_list,
93 'deft' => "all",
94 'reqd' => "no"},
95 { 'name' => "rollback_on_cancel",
96 'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}",
97 'type' => "enum",
98 'list' => $rollback_on_cancel_list,
99 'deft' => "false", # better default than true
100 'reqd' => "no",
101 'hiddengli' => "no"},
102 { 'name' => "db_driver",
103 'desc' => "{GreenstoneSQLPlug.db_driver}",
104 'type' => "string",
105 'deft' => "mysql",
106 'reqd' => "yes"},
107 { 'name' => "db_client_user",
108 'desc' => "{GreenstoneSQLPlug.db_client_user}",
109 'type' => "string",
110 'deft' => "root",
111 'reqd' => "yes"},
112 { 'name' => "db_client_pwd",
113 'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
114 'type' => "string",
115 'deft' => "",
116 'reqd' => "no"}, # pwd not required: can create mysql accounts without pwd
117 { 'name' => "db_host",
118 'desc' => "{GreenstoneSQLPlug.db_host}",
119 'type' => "string",
120 'deft' => "127.0.0.1", # localhost by default
121 'reqd' => "yes"},
122 { 'name' => "db_port",
123 'desc' => "{GreenstoneSQLPlug.db_port}",
124 'type' => "string", # NOTE: make this int? No default for port, since it's not a required connection param
125 'reqd' => "no"}
126 ];
127
128my $options = { 'name' => "GreenstoneSQLPlugin",
129 'desc' => "{GreenstoneSQLPlugin.desc}",
130 'abstract' => "no",
131 'inherits' => "yes",
132 'args' => $arguments };
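# For illustration only (hypothetical values): in a GS2 collect.cfg this plugin would be listed in
# place of GreenstoneXMLPlugin with options along the lines of
#   plugin GreenstoneSQLPlugin -process_mode all -db_driver mysql -db_client_user root -db_host 127.0.0.1
# (in GS3 the equivalent options are set on the plugin element in the collection's collectionConfig.xml)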
133
134
135###### Methods called during buildcol and import #######
136
137sub new {
138 my ($class) = shift (@_);
139 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
140 push(@$pluginlist, $class);
141
142 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
143 push(@{$hashArgOptLists->{"OptList"}},$options);
144
145 my $self = new GreenstoneXMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
146
147
148 #return bless $self, $class;
149 $self = bless $self, $class;
150 if ($self->{'info_only'}) {
151 # If running pluginfo, we don't need to go further.
152 return $self;
153 }
154
155 # do anything else that needs to be done here when not pluginfo
156
157 return $self;
158}
159
160# Call init() not begin() because there can be multiple plugin passes and begin() is called for
161# each pass (one for doc level and another for section level indexing), whereas init() should
162# be called before any and all passes.
163# This way, we can connect to the SQL database once per buildcol run.
164# Although now it doesn't matter, since gsmysql.pm uses the get_instance pattern to return a
165# singleton db connection, regardless of the number of gsmysql objects instantiated and
166# the number of connect() calls made on them.
167sub init {
168 my ($self) = shift (@_);
169
170 $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init().
171
172
173 # create gsmysql object.
174 # collection name will be used for naming tables (site name will be used for naming database)
175 my $gs_sql = new gsmysql({
176 'collection_name' => $ENV{'GSDLCOLLECTION'},
177 'verbosity' => $self->{'verbosity'} || 0
178 });
179
180 # if autocommit is set, there's no rollback support
181 my $autocommit = ($self->{'rollback_on_cancel'} eq "false") ? 1 : 0;
182
183 # try connecting to the mysql db, die if that fails
184 if(!$gs_sql->connect_to_db({
185 'db_driver' => $self->{'db_driver'},
186 'db_client_user' => $self->{'db_client_user'},
187 'db_client_pwd' => $self->{'db_client_pwd'},
188 'db_host' => $self->{'db_host'},
189 'db_port' => $self->{'db_port'}, # undef by default, can leave as is
190 'autocommit' => $autocommit
191 })
192 )
193 {
194 # This is fatal for the plugout, let's terminate here
195 # PrintError would already have displayed the warning message on connection fail
196 die("Could not connect to db. Can't proceed.\n");
197 }
198
199 my $db_name = $self->{'site'} || "greenstone2"; # one database per GS3 site, for GS2 the db is called greenstone2
200
201 # Attempt to use the db, create it if it doesn't exist (but don't create the tables yet)
202 # Bail if we can't use the database
203 if(!$gs_sql->use_db($db_name)) {
204
205 # This is fatal for the plugout, let's terminate here after disconnecting again
206 # PrintError would already have displayed the warning message on load fail
207 # And on die() perl will call gsmysql destroy which will ensure a disconnect() from db
208 die("Could not use db $db_name. Can't proceed.\n");
209 }
210
211
212 # store db handle now that we're connected
213 $self->{'gs_sql'} = $gs_sql;
214}
215
216
217# This method also runs on import.pl if gs_sql has a value.
218# Call deinit() not end() because there can be multiple plugin passes:
219# one for doc level and another for section level indexing
220# and deinit() should be called after all passes
221# This way, we can close the SQL database once per buildcol run.
222# Again, this doesn't matter now, because gsmysql ensures the connection
223# is a singleton instance, which connects once and disconnects once per perl process.
224sub deinit {
225 my ($self) = shift (@_);
226
227 if($self->{'gs_sql'}) {
228
229 # Important to call finished():
230 # it will disconnect from db if this is the last gsmysql instance,
231# and it will commit to db before disconnecting if rollback_on_cancel is turned on
232 $self->{'gs_sql'}->finished();
233
234 # Clear gs_sql (setting key to undef has a different meaning from deleting:
235# undef means the key still exists but its value is undef, whereas delete removes the key)
236 # So all future use has to make the connection again
237 delete $self->{'gs_sql'};
238 }
239
240 $self->SUPER::deinit(@_);
241}
242
243
244
245###### Methods only called during import.pl #####
246
247# This is called once if removeold is set with import.pl. Most plugins will do
248# nothing but if a plugin does any stuff outside of creating doc obj, then
249# it may need to clear something.
250# In the case of GreenstoneSQL plugs: this is the first time we have a chance
251# to purge the tables of the current collection from the current site's database
252sub remove_all {
253 my $self = shift (@_);
254 my ($pluginfo, $base_dir, $processor, $maxdocs) = @_;
255
256 $self->SUPER::remove_all(@_);
257
258 print STDERR " Building with removeold option set, so deleting current collection's tables if they exist\n" if($self->{'verbosity'});
259
260 # if we're in here, we'd already have run 'use database <site>;' during sub init()
261 # so we can go ahead and delete the collection's tables
262 my $gs_sql = $self->{'gs_sql'};
263 $gs_sql->delete_collection_tables(); # will delete them if they exist
264
265 # and recreate tables? Ensure here that the tables needed for the current process_mode exist (their existence is also ensured in GreenstoneSQLPlugout::begin())
266 my $proc_mode = $self->{'process_mode'};
267 if($proc_mode ne "text_only") {
268 $gs_sql->ensure_meta_table_exists();
269 }
270 if($proc_mode ne "meta_only") {
271 $gs_sql->ensure_fulltxt_table_exists();
272 }
273
274}
275
276# This is called during import.pl per document for docs that have been deleted from the
277# collection. Most plugins will do nothing
278# but if a plugin does any stuff outside of creating doc obj, then it may need
279# to clear something.
280# In the case of GreenstoneSQL plugs: Remove the doc(s) denoted by oids from GS SQL db.
281# This takes care of incremental deletes (docs marked D by ArchivesInfPlugin when building
282# incrementally) as well as cases of "Non-incremental Delete", see ArchivesInfPlugin.pm.
283# As well as cases involving reindexing, which are implemented here as delete followed by add.
284sub remove_one {
285 my $self = shift (@_);
286
287 my ($file, $oids, $archivedir) = @_;
288
289 my $rv = $self->SUPER::remove_one(@_);
290
291 print STDERR "@@@ IN SQLPLUG::REMOVE_ONE: $file\n";
292
293 #return undef unless $self->can_process_this_file($file); # NO, DON'T DO THIS (inherited remove_one behaviour) HERE:
294 # WE DON'T CARE IF IT'S AN IMAGE FILE THAT WAS DELETED.
295 # WE CARE ABOUT REMOVING THE DOC_OID OF THAT IMAGE FILE FROM THE SQL DB
296 # SO DON'T RETURN IF CAN'T_PROCESS_THIS_FILE
297
298
299 my $gs_sql = $self->{'gs_sql'} || return 0; # couldn't make the connection or no db etc
300
301 my $proc_mode = $self->{'process_mode'};
302 foreach my $oid (@$oids) {
303 if($proc_mode eq "all" || $proc_mode eq "meta_only") {
304 print STDERR "@@@@@@@@ Deleting $oid from meta table\n" if $self->{'verbosity'} > 2;
305 $gs_sql->delete_recs_from_metatable_with_docid($oid);
306 }
307 if($proc_mode eq "all" || $proc_mode eq "text_only") {
308 print STDERR "@@@@@@@@ Deleting $oid from fulltxt table\n" if $self->{'verbosity'} > 2;
309 $gs_sql->delete_recs_from_texttable_with_docid($oid);
310 }
311 }
312 return $rv;
313}
314
315##### Methods called only during buildcol #####
316
317sub xml_start_tag {
318 my $self = shift(@_);
319 my ($expat, $element) = @_;
320
321 my $outhandle = $self->{'outhandle'};
322
323 $self->{'element'} = $element;
324 if ($element eq "Archive") { # docsql.xml files contain a docoid attribute on the Archive element
325 # the element's attributes are in %_ as per ReadXMLFile::xml_start_tag() (while $_
326 # contains the tag)
327
328 # Don't access %_{'docoid'} directly: keep getting a warning message to
329 # use $_{'docoid'} for scalar contexts, but %_ is the element's attr hashmap
330 # whereas $_ has the tag info. So we don't want to do $_{'docoid'}.
331 my %attr_hash = %_; # right way, see OAIPlugin.pm
332 $self->{'doc_oid'} = $attr_hash{'docoid'};
333 print $outhandle "Extracted OID from docsql.xml: ".$self->{'doc_oid'}."\n"
334 if $self->{'verbosity'} > 2;
335
336 }
337 else { # let superclass GreenstoneXMLPlugin continue to process <Section> and <Metadata> elements
338 $self->SUPER::xml_start_tag(@_);
339 }
340}
341
342# There are multiple passes processing the document (see buildcol's mode parameter description):
343# - compressing the text, which may be a dummy pass for lucene/solr, though they still want the
344# docobj for other purposes,
345# - the pass(es) for indexing, e.g. doc/didx and section/sidx level passes
346# - and an infodb pass for processing the classifiers. This pass too needs the docobj
347# Since all passes need the doc_obj, all are read in from docsql + SQL db into the docobj in memory
348
349# We only ever get here or do any parsing of the docsql.xml file during the buildcol.pl phase.
350# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
351# the doc_obj in memory is processed (indexed) and then made undef.
352# So we have to work with doc_obj before superclass close_document() is finished.
353sub close_document {
354 my $self = shift(@_);
355
356 my $gs_sql = $self->{'gs_sql'};
357
358 my $outhandle = $self->{'outhandle'};
359 my $doc_obj = $self->{'doc_obj'};
360
361 my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub xml_start_tag()
362 my $proc_mode = $self->{'process_mode'};
363
364 # For now, we have access to doc_obj (until just before super::close_document() terminates)
365
366 # The OID parsed out of the docsql.xml file does need to be set on $doc_obj, as noticed in this case:
367 # when a doc in import is renamed, and you do incremental import, it is marked for reindexing
368 # (reindexing is implemented by this plugin as a delete followed by add into the sql db).
369 # In that case, UNLESS you set the OID at this stage, the old deleted doc id (for the old doc
370 # name) continues to exist in the index at the end of incremental rebuilding if you were to
371 # browse the rebuilt collection by files/titles. So unless you set the OID here, the deleted
372 # doc oids will still be listed in the index.
373
374 print STDERR " GreenstoneSQLPlugin processing doc $oid (reading into docobj from SQL db)\n"
375 if $self->{'verbosity'};
376
377 my $build_proc_mode = $self->{'processor'}->get_mode(); # can be "text" as per basebuildproc or infodb
378 if($build_proc_mode =~ m/(delete)$/) {
379 # build_proc_mode could be "(infodb|text)(delete|reindex)"
380 # "...delete" or "...reindex" as per ArchivesInfPlugin
381 # But reindex is implemented as delete for GreenstoneSQLPlugs, so that's all we see here?
382 print STDERR " DOC $oid WAS MARKED FOR DELETION. Won't attempt to retrieve from SQL db.\n" if $self->{'verbosity'};
383 $self->{'doc_obj'}->set_OID($oid); # oid is all we care about for a doc marked for deletion
384 $self->SUPER::close_document(@_); # at the end of this method, doc will have been deleted
385 return; # oid of doc marked for deletion is not in the SQL db, don't bother looking it up
386 }
387
388 # else, doc denoted by oid was not marked for deletion, look up its oid in db and read it into doc obj
389
390 if($proc_mode eq "all" || $proc_mode eq "meta_only") {
391 # read in meta for the collection (i.e. select * from <col>_metadata table)
392
393 my $records = $gs_sql->select_from_metatable_matching_docid($oid, $outhandle);
394
395 print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 2;
396
397 foreach my $row (@$records) {
398 my ($primary_key, $did, $sid, $metaname, $metaval) = @$row;
399
400 # get rid of the artificial "root" introduced in section id when saving to sql db
401 $sid =~ s@^root@@;
402 $sid = $doc_obj->get_top_section() unless $sid;
403 print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
404 if $self->{'verbosity'} > 2;
405
406 # We're only dealing with utf8 data where docobj is concerned
407 # Data stored unescaped in db: escaping only for html/xml files, not for txt files or db
408 $doc_obj->add_utf8_metadata($sid, $metaname, $metaval);
409 }
410
411 # UTF8 debugging, e.g. if we have an 'a' with macron in dc.Title
412 ##binmode(STDERR, ":utf8"); # not necessary when calling utf8decomp to print wide characters as unicode codept, avoiding 'wide character' warnings.
413 #my $dcTitle = $doc_obj->get_metadata_element($doc_obj->get_top_section(), "dc.Title");
414 #print STDERR "###### dc.Title: ".&unicode::utf8decomp($dcTitle)."\n";
415 #print STDERR "###### DECODED dc.Title: ".&unicode::utf8decomp(&Encode::decode_utf8($dcTitle))."\n"; # decoding was needed for perl 5.18
416
417 print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
418 if $self->{'verbosity'} > 2;
419 }
420
421 if($proc_mode eq "all" || $proc_mode eq "text_only") {
422 # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table)
423
424 my $fulltxt_table = $gs_sql->get_fulltext_table_name();
425
426
427 my $records = $gs_sql->select_from_texttable_matching_docid($oid, $outhandle);
428
429
430 print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
431 if $self->{'verbosity'} > 2;
432
433 foreach my $row (@$records) {
434 my ($primary_key, $did, $sid, $text) = @$row;
435
436 # get rid of the artificial "root" introduced in section id when saving to sql db
437 #$sid =~ s@^root@@;
438 $sid = $doc_obj->get_top_section() if ($sid eq "root");
439 print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
440 if $self->{'verbosity'} > 2;
441
442 # We're only dealing with utf8 data where docobj is concerned
443 # Data stored unescaped in db: escaping is only for html/xml files, not for txt files or db
444 $doc_obj->add_utf8_textref($sid, \$text);
445 }
446 print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
447 if $self->{'verbosity'} > 2;
448 }
449
450
451 # setting OID here instead of before reading from SQL db into docobj, will prevent duplicate values for Identifier
452 # since doc::set_OID() calls doc::set_metadata_element() for metadata that can't occur more than once
453 $self->{'doc_obj'}->set_OID($oid); # make sure the OID is set on doc_obj before SUPER::close_document() indexes it
454 # (docs marked for deletion have already been handled and returned above)
455
456 # done reading into docobj from SQL db
457
458 # don't forget to clean up on close() in superclass
459 # It will get the doc_obj indexed then make it undef
460 $self->SUPER::close_document(@_);
461}
462
463
4641;