source: main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm@ 32541

Last change on this file since 32541 was 32541, checked in by ak19, 4 years ago

Using proper parameters to GreenstoneSQLPlugin/Plugout instead of hardcoded values for params.

File size: 14.5 KB
Line 
1###########################################################################
2#
3# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql-<OID>.xml
4# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
5# the docsql .xml files.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugin;
29
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use DBI;
35use docprint; # for new unescape_text() subroutine
36use GreenstoneXMLPlugin;
37use gssql;
38
39
40# TODO:
41# - Run TODOs here, in Plugout and in gssql.pm by Dr Bainbridge.
42# Ask about docsql naming convention adopted to identify OID. Better way?
43# collection names -> table names: it seems hyphens not allowed. Changed to underscores.
44# - Startup parameters
45# - incremental building: where do we need to add code to delete rows from our sql table after
46# incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta?
47# - Courier documents in lucene-sql collection: character (degree symbol) not preserved. Is this because we encode in utf8 when putting into db and reading back in?
48# - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that
49# back in from the sql db while the remainder is to be read back in from the docsql .xml files.
50# - Ask if I can assume that all SQL dbs (not just MySQL) will preserve the order of inserted nodes
51# (sections) which in this case had made it easy to reconstruct the doc_obj in memory in the correct order
52
53# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that it if meta or fulltext
54# is still written out to doc.xml (docsql .xml), that will be processed as usual,
55# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
56# is written out by GreenstoneSQLPlugout into the SQL db).
57
58# TODO:
59# no more docoid in docsql .xml filename, set OID as attribute of root element inside docsql.xml file instead
60# and parse it out
61
62# TODO: deal with incremental vs removeold. If docs removed from import folder, then import step
63# won't delete it from archives but buildcol step will. Need to implement this with this database plugin or wherever the actual flow is
64
65sub BEGIN {
66 @GreenstoneSQLPlugin::ISA = ('GreenstoneXMLPlugin');
67}
68
69# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin
70# So we won't have a process exp conflict here.
71sub get_default_process_exp {
72 my $self = shift (@_);
73
74 #return q^(?i)docsql(-\d+)?\.xml$^;
75 return q^(?i)docsql(-.+)?\.xml$^;
76}
77
78my $process_mode_list =
79 [ { 'name' => "meta_only",
80 'desc' => "{GreenstoneSQLPlug.process_mode.meta_only}" },
81 { 'name' => "text_only",
82 'desc' => "{GreenstoneSQLPlug.process_mode.text_only}" },
83 { 'name' => "all",
84 'desc' => "{GreenstoneSQLPlug.process_mode.all}" } ];
85
86my $arguments =
87 [ { 'name' => "process_exp",
88 'desc' => "{BaseImporter.process_exp}",
89 'type' => "regexp",
90 'deft' => &get_default_process_exp(),
91 'reqd' => "no" },
92 { 'name' => "process_mode",
93 'desc' => "{GreenstoneSQLPlug.process_mode}",
94 'type' => "enum",
95 'list' => $process_mode_list,
96 'deft' => "all",
97 'reqd' => "no"},
98 { 'name' => "db_driver",
99 'desc' => "{GreenstoneSQLPlug.db_driver}",
100 'type' => "string",
101 'deft' => "mysql",
102 'reqd' => "yes"},
103 { 'name' => "db_client_user",
104 'desc' => "{GreenstoneSQLPlug.db_client_user}",
105 'type' => "string",
106 'deft' => "root",
107 'reqd' => "yes"},
108 { 'name' => "db_client_pwd",
109 'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
110 'type' => "string",
111 'deft' => "",
112 'reqd' => "yes"}, # pwd required?
113 { 'name' => "db_host",
114 'desc' => "{GreenstoneSQLPlug.db_host}",
115 'type' => "string",
116 'deft' => "127.0.0.1",
117 'reqd' => "yes"},
118 { 'name' => "db_encoding",
119 'desc' => "{GreenstoneSQLPlug.db_encoding}",
120 'type' => "string",
121 'deft' => "utf8",
122 'reqd' => "yes"}
123 ];
124
125my $options = { 'name' => "GreenstoneSQLPlugin",
126 'desc' => "{GreenstoneSQLPlugin.desc}",
127 'abstract' => "no",
128 'inherits' => "yes",
129 'args' => $arguments };
130
131
132# TODO: For on cancel, add a SIGTERM handler or so to call end()
133# or to explicitly call gs_sql->close_connection if $gs_sql def
134
135sub new {
136 my ($class) = shift (@_);
137 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
138 push(@$pluginlist, $class);
139
140 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
141 push(@{$hashArgOptLists->{"OptList"}},$options);
142
143 my $self = new GreenstoneXMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
144
145
146 #return bless $self, $class;
147 $self = bless $self, $class;
148 if ($self->{'info_only'}) {
149 # If running pluginfo, we don't need to go further.
150 return $self;
151 }
152
153 # do anything else that needs to be done here when not pluginfo
154
155 return $self;
156}
157
158
159# TODO Q: Why are there 3 passes when we're only indexing at doc and section level (2 passes)?
160
161# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
162# the doc_obj in memory is processed (indexed) and then made undef.
163# So we have to work with doc_obj before superclass close_document() is finished.
164sub close_document {
165 my $self = shift(@_);
166
167 my $outhandle = $self->{'outhandle'};
168 my $doc_obj = $self->{'doc_obj'};
169 my $gs_sql = $self->{'gs_sql'};
170
171 my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub read()
172 print $outhandle "==== OID of document (meta|text) to be read in from DB: $oid\n"
173 if $self->{'verbosity'} > 1;
174
175
176 # For now, we have access to doc_obj (until just before super::close_document() terminates)
177
178 $self->{'doc_obj'}->set_OID($oid); # complex method. Is this necessary, since we just want to write meta and txt for the docobj to index?
179
180 # checking that complicated looking method set_OID() hasn't modified oid
181 if($oid ne $self->{'doc_obj'}->get_OID()) {
182 print STDERR "@@@@ WARNING: OID after setting on doc_obj = " . $self->{'doc_obj'}->get_OID() . " and is not the same as original OID $oid from docsqloid.xml filename\n";
183 }
184
185
186 # TODO: This function is called on a per doc.xml file basis
187 # but we can process all docs of a collection in one go when dealing with the SQL tables for
188 # the collection. How and where should we read in the collection tables then?
189 # TODO: Perhaps MySQLPlugout could write out a token file (.gssql) into archives during import.pl
190 # and if that file is detected, then MySQLPlugin::read() is passed in that file during
191 # buildcol.pl. And that file will trigger reading the 2 tables for the collection???
192 my $proc_mode = $self->{'process_mode'};
193 if($proc_mode eq "all" || $proc_mode eq "meta_only") {
194 # read in meta for the collection (i.e. select * from <col>_metadata table
195
196 my $sth = $gs_sql->select_from_metatable_matching_docid($oid);
197 print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 1;
198
199 print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 1;
200 # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
201 while( my @row = $sth->fetchrow_array() ) {
202 #print $outhandle "row: @row\n";
203 my ($primary_key, $did, $sid, $metaname, $metaval) = @row;
204
205 # get rid of the artificial "root" introduced in section id when saving to sql db
206 $sid =~ s@^root@@;
207 $sid = $doc_obj->get_top_section() unless $sid;
208 print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
209 if $self->{'verbosity'} > 1;
210
211 # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly:
212 $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval));
213 }
214 print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
215 if $self->{'verbosity'} > 1;
216 }
217
218 if($proc_mode eq "all" || $proc_mode eq "text_only") {
219 # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table
220
221 my $fulltxt_table = $gs_sql->get_fulltext_table_name();
222
223
224 my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
225 print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $self->{'verbosity'} > 1;
226
227 print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
228 if $self->{'verbosity'} > 1;
229 while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {
230
231 # get rid of the artificial "root" introduced in section id when saving to sql db
232 #$sid =~ s@^root@@;
233 $sid = $doc_obj->get_top_section() if ($sid eq "root");
234 print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
235 if $self->{'verbosity'} > 1;
236
237 # TODO - pass by ref?
238 # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly:
239 $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text));
240 }
241 print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
242 if $self->{'verbosity'} > 1;
243 }
244
245
246 # don't forget to clean up on close() in superclass
247 # It will get the doc_obj indexed then make it undef
248 $self->SUPER::close_document(@_);
249}
250
251
252# TODO: only want to work with sql db if buildcol.pl. Unfortunately, also runs on import.pl
253# call init() not begin() because there can be multiple plugin passes
254# and init() should be called before all passes:
255# one for doc level and another for section level indexing
256# This way, we can connect to the SQL database once per buildcol run.
257sub init {
258 my ($self) = shift (@_);
259# print STDERR "@@@@@@@@@@ INIT CALLED\n";
260
261 $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init().
262
263 ####################
264# print "@@@ SITE NAME: ". $self->{'site_name'} . "\n" if defined $self->{'site_name'};
265# print "@@@ COLL NAME: ". $ENV{'GSDLCOLLECTION'} . "\n";
266
267# print STDERR "@@@@ db_pwd: " . $self->{'db_client_pwd'} . "\n";
268# print STDERR "@@@@ user: " . $self->{'db_client_user'} . "\n";
269# print STDERR "@@@@ db_host: " . $self->{'db_host'} . "\n";
270# print STDERR "@@@@ db_enc: " . $self->{'db_encoding'} . "\n";
271# print STDERR "@@@@ db_driver: " . $self->{'db_driver'} . "\n";
272 ####################
273
274 my $gs_sql = new gssql({
275 'collection_name' => $ENV{'GSDLCOLLECTION'},
276 'db_encoding' => $self->{'db_encoding'}
277 }
278 );
279
280 # try connecting to the mysql db, if that fails it will die
281 if(!$gs_sql->connect_to_db({
282 'db_driver' => $self->{'db_driver'},
283 'db_client_user' => $self->{'db_client_user'},
284 'db_client_pwd' => $self->{'db_client_pwd'},
285 'db_host' => $self->{'db_host'}
286 })
287 )
288 {
289 # This is fatal for the plugout, let's terminate here
290 # PrintError would already have displayed the warning message on connection fail
291 die("Could not connect to db. Can't proceed.\n");
292 }
293
294 my $db_name = $self->{'site_name'} || "greenstone2"; # one database per GS3 site, for GS2 the db is called greenstone2
295 #my $build_mode = $self->{'build_mode'} || "removeold";
296
297 # the db and its tables should exist. Attempt to use the db:
298 if(!$gs_sql->use_db($db_name)) {
299
300 # This is fatal for the plugout, let's terminate here after disconnecting again
301 # PrintError would already have displayed the warning message on load fail
302 $gs_sql->disconnect_from_db()
303 || warn("Unable to disconnect from database.\n");
304 die("Could not use db $db_name. Can't proceed.\n");
305 }
306
307 # store db handle now that we're connected
308 $self->{'gs_sql'} = $gs_sql;
309
310}
311
312# This method also runs on import.pl if gs_sql has a value. But we just want to run it on buildcol
313# Call deinit() not end() because there can be multiple plugin passes:
314# one for doc level and another for section level indexing
315# and deinit() should be called before all passes
316# This way, we can close the SQL database once per buildcol run.
317sub deinit {
318 my ($self) = shift (@_);
319 if($self->{'gs_sql'}) { # can cover TODO: only want to work with sql db if buildcol.pl
320 $self->{'gs_sql'}->disconnect_from_db()
321 || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n");
322 }
323# print STDERR "@@@@@@@@@@ DEINIT CALLED\n";
324 $self->SUPER::deinit(@_);
325}
326
327sub read {
328 my $self = shift (@_);
329
330 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
331
332 # when running buildcol.pl, the filename should match "docsql-<OID>.xml"
333 # when running import.pl it will be the original document's filename
334 # we only want to read in from db when running buildcol.pl
335
336 # doc_obj doesn't exist yet and only exists during super::read(): a new doc (doc_obj)
337 # is created in super::open_document() and is made undef again on super::close_document().
338 # Further, can't read it in from doc.xml to work out which OID to query in sql db:
339 # even if we got access to doc_obj, if no meta stored in docsql.xml, then when
340 # doc_obj is read in from docsql.xml there will be no OID. So OID is docsql.xml filename
341 # contains OID in filename. Having extracted OID from the filename, store OID in plugin-self
342 if($file =~ m/docsql-(.+?)\.xml$/) {
343
344 # work out docoid from filename of form "docsql-<OID>.xml". $file can have a containing
345 # subfolder besides filename, e.g. "dir/docsql-<OID>.xml"
346
347 # https://stackoverflow.com/questions/22836/how-do-i-perform-a-perl-substitution-on-a-string-while-keeping-the-original
348 (my $oid = $file) =~ s@^(.*?)docsql-(.+?)\.xml$@$2@;
349
350 $self->{'doc_oid'} = $oid;
351 }
352
353
354 # always read docsql.xml, as we then know doc structure, and assoc files are dealt with
355 # Plus we need to read docsql.xml if either meta or fulltxt went into there instead of to sql db
356 return $self->SUPER::read(@_); # will open_doc, close_doc then process doc_obj for indexing, then undef doc_obj
357
358
359}
Note: See TracBrowser for help on using the repository browser.