source: main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm@ 32536

Last change on this file since 32536 was 32536, checked in by ak19, 6 years ago

First commit to do with reading back in from the SQL DB. This commit introduces the new GreenstoneSQLPlugin for this purpose, which should ideally only be used during buildcol (but its init(), deinit() and read() methods are also called on import.pl). The new plugin works with GreenstoneSQLPlugout which wrote meta and txt to the SQL DB. Lots of TODOs and questions still here, some debug statements too. Also have to run some decisions by Dr Bainbridge. There are many hardcoded values which still have to be parameterised (not always completely sure how) and still have to test the 2 cases of sending just meta and just fulltxt to db. Next commit will tidy some things up.

File size: 13.3 KB
Line 
1###########################################################################
2#
3# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql-<OID>.xml
4# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
5# the docsql .xml files.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugin;
29
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use DBI;
35use docprint; # for new unescape_text() subroutine
36use GreenstoneXMLPlugin;
37use gssql;
38
39
40# TODO:
41# - Run TODOs here and in plugout by Dr Bainbridge. Ask about docsql naming convention adopted
42# to identify OID. Better way?
43# collection names -> table names: hyphens not allowed? Changed to underscores.
44# - Startup parameters
45# - incremental building: where do we need to add code to delete rows from our sql table after
46# incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta?
47# - Courier documents in lucene-sql collection: character (degree symbol) not preserved. Is this because we encode in utf8 when putting into db and reading back in?
48# - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that
49# back in from the sql db while the remainder is to be read back in from the docsql .xml files.
50
51
52# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that it if meta or fulltext
53# is still written out to doc.xml (docsql .xml), that will be processed as usual,
54# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
55# is written out by GreenstoneSQLPlugout into the SQL db).
56
57
58sub BEGIN {
59 @GreenstoneSQLPlugin::ISA = ('GreenstoneXMLPlugin');
60}
61
62# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin
63# So we won't have a process exp conflict here.
64sub get_default_process_exp {
65 my $self = shift (@_);
66
67 #return q^(?i)docsql(-\d+)?\.xml$^;
68 return q^(?i)docsql(-.+)?\.xml$^;
69}
70
71my $process_mode_list =
72 [ { 'name' => "meta_only",
73 'desc' => "{GreenstoneSQLPlugout.process_mode.meta_only}" },
74 { 'name' => "text_only",
75 'desc' => "{GreenstoneSQLPlugout.process_mode.text_only}" },
76 { 'name' => "all",
77 'desc' => "{GreenstoneSQLPlugout.process_mode.all}" } ];
78
79my $arguments =
80 [ { 'name' => "process_exp",
81 'desc' => "{BaseImporter.process_exp}",
82 'type' => "regexp",
83 'deft' => &get_default_process_exp(),
84 'reqd' => "no" },
85 { 'name' => "process_mode",
86 'desc' => "{GreenstoneSQLPlugout.process_mode}",
87 'type' => "enum",
88 'list' => $process_mode_list,
89 'deft' => "all",
90 'reqd' => "no"}
91 ];
92
93my $options = { 'name' => "GreenstoneSQLPlugin",
94 'desc' => "{GreenstoneSQLPlugin.desc}",
95 'abstract' => "no",
96 'inherits' => "yes",
97 'args' => $arguments };
98
99
100# TODO: For on cancel, add a SIGTERM handler or so to call end()
101# or to explicitly call gs_sql->close_connection if $gs_sql def
102
103sub new {
104 my ($class) = shift (@_);
105 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
106 push(@$pluginlist, $class);
107
108 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
109 push(@{$hashArgOptLists->{"OptList"}},$options);
110
111 my $self = new GreenstoneXMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
112
113
114 #return bless $self, $class;
115 $self = bless $self, $class;
116 if ($self->{'info_only'}) {
117 # If running pluginfo, we don't need to go further.
118 return $self;
119 }
120
121 # do anything else that needs to be done here when not pluginfo
122
123 return $self;
124}
125
126# need to store current section
127#sub xml_start_tag {
128# my $self = shift(@_);
129# $self->SUPER::xml_start_tag(@_);
130# $self->{'stored_section'} = $self->{'section'};
131# $self->{'stored_section_level'} = $self->{'section_level'}
132#}
133
134# TODO Q: Why are there 3 passes when we're only indexing at doc and section level (2 passes)?
135
136# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
137# the doc_obj in memory is processed (indexed) and then made undef.
138# So we have to work with doc_obj before superclass close_document() is finished.
139sub close_document {
140 my $self = shift(@_);
141
142 my $outhandle = $self->{'outhandle'};
143 my $doc_obj = $self->{'doc_obj'};
144 my $gs_sql = $self->{'gs_sql'};
145
146 my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub read()
147 print $outhandle "@@@@@@ OID of document (meta|text) to be read in from DB: $oid\n"
148 if $self->{'verbosity'} > 1;
149
150# print STDERR "@@@@ section: |" . $self->{'stored_section'} . "|\n";
151# print STDERR "@@@@ section_level: " . $self->{'stored_section_level'} . "\n";
152
153 # for now, we have access to doc_obj (until just before super::close_document() terminates)
154 #$self->{'doc_obj'}->set_OID($oid);
155
156 # TODO: This function is called on a per doc.xml file basis
157 # but we can process all docs of a collection in one go when dealing with the SQL tables for
158 # the collection. How and where should we read in the collection tables then?
159 # TODO: Perhaps MySQLPlugout could write out a token file (.gssql) into archives during import.pl
160 # and if that file is detected, then MySQLPlugin::read() is passed in that file during
161 # buildcol.pl. And that file will trigger reading the 2 tables for the collection???
162 my $proc_mode = $self->{'process_mode'};
163 if($proc_mode eq "all" || $proc_mode eq "meta_only") {
164 # read in meta for the collection (i.e. select * from <col>_metadata table
165
166 my $meta_table = $gs_sql->get_metadata_table_name();
167
168 # https://www.effectiveperlprogramming.com/2010/07/set-custom-dbi-error-handlers/
169 my $sth = $gs_sql->get_db_handle()->prepare(qq{SELECT * FROM $meta_table WHERE did = ?});
170 print $outhandle "### stmt: ".$sth->{'Statement'}."\n"
171 if $self->{'verbosity'} > 1;
172 $sth->execute( $oid ); # will print msg on fail
173
174 print $outhandle "----------SQL DB contains meta-----------\n" if $self->{'verbosity'} > 1;
175 while( my @row = $sth->fetchrow_array() ) {
176 #print $outhandle "row: @row\n";
177 my ($primary_key, $did, $sid, $metaname, $metaval) = @row;
178
179 # get rid of the artificial "root" introduced in section id when saving to sql db
180 $sid =~ s@^root@@;
181 $sid = $doc_obj->get_top_section() unless $sid;
182 print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
183 if $self->{'verbosity'} > 1;
184
185 # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly:
186 $doc_obj->add_utf8_metadata($sid, $metaname, &docprint::unescape_text($metaval));
187 }
188 print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
189 if $self->{'verbosity'} > 1;
190 }
191
192 if($proc_mode eq "all" || $proc_mode eq "text_only") {
193 # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table
194
195 my $fulltxt_table = $gs_sql->get_fulltext_table_name();
196
197
198 my $sth = $gs_sql->get_db_handle()->prepare(qq{SELECT * FROM $fulltxt_table WHERE did = ?});
199 print $outhandle "### stmt: ".$sth->{'Statement'}."\n"
200 if $self->{'verbosity'} > 1;
201 $sth->execute( $oid ); # will print msg on fail
202
203 print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
204 if $self->{'verbosity'} > 1;
205 while( my ($primary_key, $did, $sid, $text) = $sth->fetchrow_array() ) {
206
207 # get rid of the artificial "root" introduced in section id when saving to sql db
208 #$sid =~ s@^root@@;
209 $sid = $doc_obj->get_top_section() if ($sid eq "root");
210 print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT NOT PRINTED>\n"
211 if $self->{'verbosity'} > 1;
212
213 # TODO - pass by ref?
214 # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly:
215 $doc_obj->add_utf8_text($sid, &docprint::unescape_text($text));
216 }
217 print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
218 if $self->{'verbosity'} > 1;
219 }
220
221
222 # don't forget to clean up on close() in superclass
223 # It will get the doc_obj indexed then make it undef
224 $self->SUPER::close_document(@_);
225}
226
227
228# TODO: only want to work with sql db if buildcol.pl
229# call init() not begin() because there can be multiple plugin passes
230# and init() should be called before all passes:
231# one for doc level and another for section level indexing
232sub init {
233 my ($self) = shift (@_);
234# print STDERR "@@@@@@@@@@ INIT CALLED\n";
235
236 $self->SUPER::init(@_); # super (GreenstoneXMLPlugin) will not yet be trying to read from doc.xml (docsql .xml) files in init().
237
238 # TODO: how do we know what site we're dealing with unless this is passed in, by buildcol?
239 ###########
240# print "@@@ SITE NAME: ". $self->{'site_name'} . "\n" if defined $self->{'site_name'};
241# print "@@@ COLL NAME: ". $ENV{'GSDLCOLLECTION'} . "\n";
242
243 $self->{'db_driver'} = "mysql";
244 $self->{'site_name'} = "localsite";
245 $self->{'db_client_user'} = "root";
246 $self->{'db_client_pwd'} = "6reenstone3";
247 $self->{'build_mode'} = "removeold";
248 $self->{'db_host'} = "127.0.0.1";
249 $self->{'db_encoding'} = "utf8";
250 ###########
251
252 my $gs_sql = new gssql({
253 'collection_name' => $ENV{'GSDLCOLLECTION'},
254 'db_encoding' => $self->{'db_encoding'}
255 #'db_name' => $self->{'site_name'},
256 #'build_mode' => $self->{'build_mode'},
257 }
258 );
259
260 # try connecting to the mysql db, if that fails it will die
261 if(!$gs_sql->connect_to_db({
262 'db_driver' => $self->{'db_driver'},
263 'db_client_user' => $self->{'db_client_user'},
264 'db_client_pwd' => $self->{'db_client_pwd'},
265 'db_host' => $self->{'db_host'}
266 })
267 )
268 {
269 # This is fatal for the plugout, let's terminate here
270 # PrintError would already have displayed the warning message on connection fail
271 die("Could not connect to db. Can't proceed.\n");
272 }
273
274 my $db_name = $self->{'site_name'} || "localsite"; # one database per GS3 site
275 #my $build_mode = $self->{'build_mode'} || "removeold";
276
277 # the db and its tables should exist. Attempt to use the db:
278 if(!$gs_sql->use_db($db_name)) {
279
280 # This is fatal for the plugout, let's terminate here after disconnecting again
281 # PrintError would already have displayed the warning message on load fail
282 $gs_sql->disconnect_from_db()
283 || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n");
284 die("Could not use db $db_name. Can't proceed.\n");
285 }
286
287 # store db handle now that we're connected
288 $self->{'gs_sql'} = $gs_sql;
289
290}
291
292
293# Call deinit() not end() because there can be multiple plugin passes:
294# one for doc level and another for section level indexing
295# and deinit() should be called before all passes
296sub deinit {
297 my ($self) = shift (@_);
298 if($self->{'gs_sql'}) { # can cover TODO: only want to work with sql db if buildcol.pl
299 $self->{'gs_sql'}->disconnect_from_db()
300 || warn("Unable to disconnect from database " . $self->{'site_name'} . "\n");
301 }
302# print STDERR "@@@@@@@@@@ DEINIT CALLED\n";
303 $self->SUPER::deinit(@_);
304}
305
306sub read {
307 my $self = shift (@_);
308
309 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
310
311 # when running buildcol.pl, the filename should match "docsql-<OID>.xml"
312 # when running import.pl it will be the original document's filename
313 # we only want to read in from db when running buildcol.pl
314
315 # doc_obj doesn't exist yet and only exists during super::read(): a new doc (doc_obj)
316 # is created in super::open_document() and is made undef again on super::close_document().
317 # Further, can't read it in from doc.xml to work out which OID to query in sql db:
318 # even if we got access to doc_obj, if no meta stored in docsql.xml, then when
319 # doc_obj is read in from docsql.xml there will be no OID. So OID is docsql.xml filename
320 # contains OID in filename. Having extracted OID from the filename, store OID in plugin-self
321 if($file =~ m/docsql-(.+?)\.xml$/) {
322
323 # work out docoid from filename of form "docsql-<OID>.xml". $file can have a containing
324 # subfolder besides filename, e.g. "dir/docsql-<OID>.xml"
325
326 # https://stackoverflow.com/questions/22836/how-do-i-perform-a-perl-substitution-on-a-string-while-keeping-the-original
327 (my $oid = $file) =~ s@^(.*?)docsql-(.+?)\.xml$@$2@;
328
329 $self->{'doc_oid'} = $oid;
330 }
331
332
333 # always read docsql.xml, as we then know doc structure, and assoc files are dealt with
334 # Plus we need to read docsql.xml if either meta or fulltxt went into there instead of to sql db
335 return $self->SUPER::read(@_); # will open_doc, close_doc then process doc_obj for indexing, then undef doc_obj
336
337
338}
Note: See TracBrowser for help on using the repository browser.