source: main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm@ 32783

Last change on this file since 32783 was 32783, checked in by kjdon, 5 years ago

adding missing strings and tidying up some mislabelling

File size: 18.8 KB
Line 
1###########################################################################
2#
3# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml
4# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
5# the docsql .xml files.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugin;
29
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use DBI;
35use docprint; # for new unescape_text() subroutine
36use GreenstoneXMLPlugin;
37use gsmysql;
38
39#use unicode;
40#use Encode;
41
42########################################################################################
43
44# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or fulltext
45# is still written out to doc.xml (docsql .xml), that will be processed as usual,
46# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
47# is written out by GreenstoneSQLPlugout into the SQL db).
48
49
# Set up inheritance at compile time: GreenstoneSQLPlugin is a subclass of
# GreenstoneXMLPlugin, so anything not overridden below falls through to it.
BEGIN {
    @GreenstoneSQLPlugin::ISA = qw(GreenstoneXMLPlugin);
}
53
54# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin
55# So we won't have a process exp conflict here.
56# The structure of docsql.xml files is identical to doc.xml and the contents are similar except:
57# - since metadata and/or fulltxt are stored in mysql db instead, just XML comments indicating
58# this are left inside docsql.xml within the <Description> (for meta) and/or <Content> (for txt)
59# - the root element Archive now has a docoid attribute: <Archive docoid="OID">
# Returns the default process_exp for this plugin as a regex source string.
# The invocant (if any) is accepted but not used.
sub get_default_process_exp {
    my ($self) = @_;

    # Case-insensitive; matches docsql.xml or docsql-<digits>.xml at the end
    # of a filename. Modelled on the same method in GreenstoneXMLPlugin.
    return '(?i)docsql(-\d+)?\.xml$';
}
65
# Allowed values for the process_mode option. Each entry pairs the option
# value with the resource-bundle key used for its description.
my $process_mode_list = [
    map {
        +{ 'name' => $_,
           'desc' => '{GreenstoneSQLPlug.process_mode.' . $_ . '}' }
    } qw(meta_only text_only all)
];
73
# Allowed values for the rollback_on_cancel option (an enum of the strings
# "true" and "false"), each with its resource-bundle description key.
my $rollback_on_cancel_list = [
    map {
        +{ 'name' => $_,
           'desc' => '{common.' . $_ . '}' }
    } qw(true false)
];
79
80# NOTE: If subclassing gsmysql for other supporting databases and if they have different required
81# connection parameters, we can check how WordPlugin, upon detecting Word is installed,
82# dynamically loads Word specific configuration options.
# Option descriptors for this plugin. Each entry gives the option name, the
# resource-bundle key for its description, its type, and (where present) its
# default, whether it is required, and the enum value list.
my $arguments = [
    { 'name' => "process_exp",
      'desc' => "{BaseImporter.process_exp}",
      'type' => "regexp",
      'deft' => get_default_process_exp(),
      'reqd' => "no" },
    { 'name' => "process_mode",
      'desc' => "{GreenstoneSQLPlug.process_mode}",
      'type' => "enum",
      'list' => $process_mode_list,
      'deft' => "all",
      'reqd' => "no" },
    { 'name' => "rollback_on_cancel",
      'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}",
      'type' => "enum",
      'list' => $rollback_on_cancel_list,
      'deft' => "false", # better default than true
      'reqd' => "no",
      'hiddengli' => "no" },
    { 'name' => "db_driver",
      'desc' => "{GreenstoneSQLPlug.db_driver}",
      'type' => "string",
      'deft' => "mysql",
      'reqd' => "yes" },
    { 'name' => "db_client_user",
      'desc' => "{GreenstoneSQLPlug.db_client_user}",
      'type' => "string",
      'deft' => "root",
      'reqd' => "yes" },
    { 'name' => "db_client_pwd",
      'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
      'type' => "string",
      'deft' => "",
      'reqd' => "no" }, # pwd not required: can create mysql accounts without pwd
    { 'name' => "db_host",
      'desc' => "{GreenstoneSQLPlug.db_host}",
      'type' => "string",
      'deft' => "127.0.0.1", # host is a required connection param, so it gets a default
      'reqd' => "yes" },
    { 'name' => "db_port",
      'desc' => "{GreenstoneSQLPlug.db_port}",
      'type' => "string", # NOTE: make this int? No default for port, since it's not a required connection param
      'reqd' => "no" }
];
127
# Plugin descriptor handed to the option-parsing machinery in new():
# a concrete (non-abstract) plugin that inherits options from its parent
# and adds the ones listed in $arguments.
my $options = {
    name     => 'GreenstoneSQLPlugin',
    desc     => '{GreenstoneSQLPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
133
134
135###### Methods called during buildcol and import #######
136
# Constructor.
#   $pluginlist       - array ref; this class name is appended to it
#   $inputargs        - passed through unchanged to the parent constructor
#   $hashArgOptLists  - hash ref; this plugin's $arguments are appended to its
#                       "ArgList" and its $options to its "OptList"
# Returns the object built by GreenstoneXMLPlugin, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow-style class method call rather than indirect object syntax
    # ("new GreenstoneXMLPlugin(...)"), which perl can mis-parse.
    my $self = GreenstoneXMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    # No further setup happens here, whether or not this is a pluginfo run
    # ($self->{'info_only'}). If non-pluginfo setup is ever needed in the
    # constructor, guard it with: unless ($self->{'info_only'}) { ... }
    return bless $self, $class;
}
159
160# Call init() not begin() because there can be multiple plugin passes and begin() called for
161# each pass (one for doc level and another for section level indexing), whereas init() should
162# be called before any and all passes.
163# This way, we can connect to the SQL database once per buildcol run.
164# Although now it doesn't matter, since gsmysql.pm uses the get_instance pattern to return a
165# singleton db connection, regardless of the number of gsmysql objects instantiated and
166# the number of connect() calls made on them.
# init(@args): runs the superclass init, then connects to the SQL server and
# selects the database, storing the gsmysql object in $self->{'gs_sql'}.
# Dies if the connection cannot be made or the database cannot be used.
sub init {
    my ($self) = shift (@_);

    # super (GreenstoneXMLPlugin) will not yet be trying to read from
    # doc.xml (docsql .xml) files in init().
    $self->SUPER::init(@_);

    # Create the gsmysql object. The collection name is used for naming
    # tables (the site name is used for naming the database).
    my $gs_sql = gsmysql->new({
        'collection_name' => $ENV{'GSDLCOLLECTION'},
        'verbosity'       => $self->{'verbosity'} || 0
    });

    # If autocommit is set, there's no rollback support.
    my $autocommit = ($self->{'rollback_on_cancel'} eq "false") ? 1 : 0;

    my $connected = $gs_sql->connect_to_db({
        'db_driver'      => $self->{'db_driver'},
        'db_client_user' => $self->{'db_client_user'},
        'db_client_pwd'  => $self->{'db_client_pwd'},
        'db_host'        => $self->{'db_host'},
        'db_port'        => $self->{'db_port'}, # undef by default, can leave as is
        'autocommit'     => $autocommit
    });
    if (!$connected) {
        # This is fatal for the plugin, so terminate here.
        # PrintError would already have displayed the warning message on connection fail.
        die("Could not connect to db. Can't proceed.\n");
    }

    # One database per GS3 site; for GS2 the db is called greenstone2.
    my $db_name = $self->{'site'} || "greenstone2";

    # Attempt to use the db, creating it if it doesn't exist (but don't
    # create the tables yet). Bail if we can't use the database.
    if (!$gs_sql->use_db($db_name)) {
        # This is fatal for the plugin, so terminate here.
        # PrintError would already have displayed the warning message on load fail,
        # and on die() perl will call gsmysql destroy which will ensure a disconnect() from db.
        die("Could not use db $db_name. Can't proceed.\n");
    }

    # Store the db handle now that we're connected.
    $self->{'gs_sql'} = $gs_sql;
}
215
216
217# This method also runs on import.pl if gs_sql has a value.
218# Call deinit() not end() because there can be multiple plugin passes
219# (one for doc level and another for section level indexing),
220# and deinit() should be called once, after all passes.
221# This way, we can close the SQL database once per buildcol run.
222# Again, this doesn't matter because gsmysql ensures the connection
223# is a singleton connection instance, which connects once and disconnects once per perl process.
# deinit(@args): releases this plugin's gsmysql object (if it has one) and
# then hands over to the superclass deinit with the same arguments.
sub deinit {
    my $self = shift;

    if ($self->{'gs_sql'}) {
        # Remove the key outright (delete, rather than assigning undef which
        # would leave the key in place), so any later use has to make the
        # connection again.
        my $db = delete $self->{'gs_sql'};

        # Important to call finished(): it will disconnect from the db if
        # this is the last gsmysql instance, and it will commit to the db
        # before disconnecting if rollback_on_cancel is turned on.
        $db->finished();
    }

    $self->SUPER::deinit(@_);
}
242
243
244
245###### Methods only called during import.pl #####
246
247# This is called once if removeold is set with import.pl. Most plugins will do
248# nothing but if a plugin does any stuff outside of creating doc obj, then
249# it may need to clear something.
250# In the case of GreenstoneSQL plugs: this is the first time we have a chance
251# to purge the tables of the current collection from the current site's database
# remove_all($pluginfo, $base_dir, $processor, $maxdocs): after the
# superclass has done its own remove_all, drops this collection's SQL tables
# and recreates the ones required by the current process_mode.
# The arguments are only forwarded to the superclass; they are not used here.
sub remove_all {
    my $self = shift;

    $self->SUPER::remove_all(@_);

    if ($self->{'verbosity'}) {
        print STDERR " Building with removeold option set, so deleting current collection's tables if they exist\n";
    }

    # By the time we get here, init() has already selected the site's
    # database, so the collection's tables can be deleted straight away.
    my $db = $self->{'gs_sql'};
    $db->delete_collection_tables(); # will delete them if they exist

    # Recreate whichever tables this process_mode works with.
    my $mode = $self->{'process_mode'};
    $db->ensure_meta_table_exists()    if $mode ne "text_only";
    $db->ensure_fulltxt_table_exists() if $mode ne "meta_only";
}
275
276# This is called during import.pl per document for docs that have been deleted from the
277# collection. Most plugins will do nothing
278# but if a plugin does any stuff outside of creating doc obj, then it may need
279# to clear something.
280# In the case of GreenstoneSQL plugs: Remove the doc(s) denoted by oids from GS SQL db.
281# This takes care of incremental deletes (docs marked D by ArchivesInfPlugin when building
282# incrementally) as well as cases of "Non-icremental Delete", see ArchivesInfPlugin.pm.
283# As well as cases involving reindexing, which are implemented here as delete followed by add.
# remove_one($file, $oids, $archivedir): removes the SQL records of every
# document id in @$oids, from the tables selected by process_mode.
# Returns the superclass remove_one() result, or 0 if there is no db handle.
sub remove_one {
    my $self = shift (@_);
    my ($file, $oids, $archivedir) = @_;

    my $rv = $self->SUPER::remove_one(@_);

    # Debug tracing only at high verbosity, like the other traces below
    # (this one used to print for every removed file regardless of verbosity).
    my $trace = (($self->{'verbosity'} || 0) > 2);
    print STDERR "@@@ IN SQLPLUG::REMOVE_ONE: $file\n" if $trace;

    # Deliberately NOT returning early when can_process_this_file($file) is
    # false (the inherited remove_one behaviour): even if the deleted file is
    # e.g. an image, its doc OID still has to be removed from the SQL db.

    my $gs_sql = $self->{'gs_sql'};
    return 0 unless $gs_sql; # couldn't make the connection or no db etc

    my $proc_mode    = $self->{'process_mode'};
    my $handles_meta = ($proc_mode eq "all" || $proc_mode eq "meta_only");
    my $handles_text = ($proc_mode eq "all" || $proc_mode eq "text_only");

    # Tolerate a missing oid list: nothing to delete in that case.
    foreach my $oid (@{ $oids || [] }) {
        if ($handles_meta) {
            print STDERR "@@@@@@@@ Deleting $oid from meta table\n" if $trace;
            $gs_sql->delete_recs_from_metatable_with_docid($oid);
        }
        if ($handles_text) {
            print STDERR "@@@@@@@@ Deleting $oid from fulltxt table\n" if $trace;
            $gs_sql->delete_recs_from_texttable_with_docid($oid);
        }
    }
    return $rv;
}
314
315##### Methods called only during buildcol #####
316
# xml_start_tag($expat, $element): start-tag handler. Records the element
# name, captures the docoid attribute of the <Archive> root element, and
# passes every other element on to the superclass handler.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    $self->{'element'} = $element;

    # Anything other than the root element: let superclass GreenstoneXMLPlugin
    # continue to process <Section> and <Metadata> elements.
    if ($element ne "Archive") {
        return $self->SUPER::xml_start_tag(@_);
    }

    # docsql.xml files carry the document OID as an attribute of <Archive>.
    # The element's attributes are in the hash %_ as per
    # ReadXMLFile::xml_start_tag() (while $_ contains the tag); $_{'docoid'}
    # is the element of that hash.
    $self->{'doc_oid'} = $_{'docoid'};

    my $log = $self->{'outhandle'};
    print {$log} "Extracted OID from docsql.xml: " . $self->{'doc_oid'} . "\n"
        if $self->{'verbosity'} > 2;
}
341
342# There are multiple passes processing the document (see buildcol's mode parameter description):
343# - compressing the text which may be a dummy pass for lucene/solr, wherein they still want the
344# docobj for different purposes,
345# - the pass(es) for indexing, e.g. doc/didx and section/sidx level passes
346# - and an infodb pass for processing the classifiers. This pass too needs the docobj
347# Since all passes need the doc_obj, all are read in from docsql + SQL db into the docobj in memory
348
349# We only ever get here or do any parsing of the docsql.xml file during the buildcol.pl phase.
350# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
351# the doc_obj in memory is processed (indexed) and then made undef.
352# So we have to work with doc_obj before superclass close_document() is finished.
# close_document(@args): fills the in-memory doc_obj from the SQL db (metadata
# and/or full text, depending on process_mode), sets its OID, and then calls
# the superclass close_document with the same arguments.
# For a document whose build mode ends in "delete", nothing is read from the
# db: only the OID is set before the superclass is called, and the method
# returns nothing.
#
# The doc_obj is only available until the superclass close_document() has
# finished, so all work on it must happen before that call.
sub close_document {
    my $self = shift(@_);

    my $gs_sql    = $self->{'gs_sql'};
    my $outhandle = $self->{'outhandle'};
    my $doc_obj   = $self->{'doc_obj'};
    my $oid       = $self->{'doc_oid'}; # stored for the current doc by xml_start_tag()
    my $proc_mode = $self->{'process_mode'};
    my $verbosity = $self->{'verbosity'};
    my $debug     = ($verbosity > 2);

    # The OID parsed from the docsql.xml file does need to be set on $doc_obj:
    # when a doc in import is renamed and you do an incremental import, it is
    # marked for reindexing (implemented by this plugin as a delete followed
    # by an add into the sql db). UNLESS the OID is set, the old deleted doc
    # id (for the old doc name) continues to be listed in the index after
    # incremental rebuilding.

    print STDERR " GreenstoneSQLPlugin processing doc $oid (reading into docobj from SQL db)\n"
        if $verbosity;

    # build_proc_mode can be e.g. "text" as per basebuildproc, or infodb,
    # optionally suffixed "(delete|reindex)" as per ArchivesInfPlugin.
    my $build_proc_mode = $self->{'processor'}->get_mode();
    if ($build_proc_mode =~ m/delete$/) {
        print STDERR " DOC $oid WAS MARKED FOR DELETION. Won't attempt to retrieve from SQL db.\n"
            if $verbosity;
        $doc_obj->set_OID($oid);          # oid is all we care about for a doc marked for deletion
        $self->SUPER::close_document(@_); # at the end of this method, doc will have been deleted
        return; # oid of doc marked for deletion is not in the SQL db, don't bother looking it up
    }

    # Otherwise the doc was not marked for deletion: look up its oid in the db
    # and read what is found into the doc obj.

    if ($proc_mode eq "all" || $proc_mode eq "meta_only") {
        # Read in the doc's rows from the collection's metadata table.
        my $records = $gs_sql->select_from_metatable_matching_docid($oid, $outhandle);

        print $outhandle "----------SQL DB contains meta-----------\n" if $debug;

        foreach my $row (@$records) {
            my ($primary_key, $did, $sid, $metaname, $metaval) = @$row;

            # Get rid of the artificial "root" introduced in the section id
            # when saving to the sql db.
            $sid =~ s@^root@@;
            $sid = $doc_obj->get_top_section() unless $sid;
            print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
                if $debug;

            # We're only dealing with utf8 data where docobj is concerned.
            # Data is stored unescaped in the db: escaping is only for
            # html/xml files, not for txt files or db.
            $doc_obj->add_utf8_metadata($sid, $metaname, $metaval);
        }

        print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
            if $debug;
    }

    if ($proc_mode eq "all" || $proc_mode eq "text_only") {
        # Read in the doc's rows from the collection's fulltxt table.
        my $records = $gs_sql->select_from_texttable_matching_docid($oid, $outhandle);

        print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
            if $debug;

        foreach my $row (@$records) {
            my ($primary_key, $did, $sid, $text) = @$row;

            # Map the artificial "root" section id back to the top section.
            $sid = $doc_obj->get_top_section() if ($sid eq "root");
            print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
                if $debug;

            # We're only dealing with utf8 data where docobj is concerned.
            # Data is stored unescaped in the db: escaping is only for
            # html/xml files, not for txt files or db.
            $doc_obj->add_utf8_textref($sid, \$text);
        }

        print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
            if $debug;
    }

    # Setting the OID here, instead of before reading from the SQL db into the
    # docobj, prevents duplicate values for Identifier, since doc::set_OID()
    # calls doc::set_metadata_element() for metadata that can't occur more
    # than once.
    $doc_obj->set_OID($oid);

    # Done reading into docobj from the SQL db. The superclass gets the
    # doc_obj indexed and then makes it undef.
    $self->SUPER::close_document(@_);
}
462
463
4641;
Note: See TracBrowser for help on using the repository browser.