source: main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm@ 32538

Last change on this file since 32538 was 32538, checked in by ak19, 6 years ago

Previous commit message meant to be: string names of strings shared by GS SQL Plugin and Plugout have been changed in strings.properties to indicate both modules used them. Current commit: Some tidying up the new GreenstoneSQLPlugin and moving the select statements from there into gssql.pm.

File size: 13.3 KB
Line 
1###########################################################################
2#
3# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql-<OID>.xml
4# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
5# the docsql .xml files.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugin;
29
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use DBI;
35use docprint; # for new unescape_text() subroutine
36use GreenstoneXMLPlugin;
37use gssql;
38
39
40# TODO:
41# - Run TODOs here and in plugout by Dr Bainbridge. Ask about docsql naming convention adopted
42# to identify OID. Better way?
43# collection names -> table names: hyphens not allowed? Changed to underscores.
44# - Startup parameters
45# - incremental building: where do we need to add code to delete rows from our sql table after
46# incrementally importing a coll with fewer docs (for instance)? What about deleted/modified meta?
47# - Courier documents in lucene-sql collection: character (degree symbol) not preserved. Is this because we encode in utf8 when putting into db and reading back in?
48# - Have not yet tested writing out just meta or just fulltxt to sql db and reading just that
49# back in from the sql db while the remainder is to be read back in from the docsql .xml files.
50
51
52# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or fulltext
53# is still written out to doc.xml (docsql .xml), that will be processed as usual,
54# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
55# is written out by GreenstoneSQLPlugout into the SQL db).
56
57
# Set up inheritance at compile time: GreenstoneSQLPlugin is a
# GreenstoneXMLPlugin, so whatever is still written to the docsql .xml files
# is processed by the parent class as usual.
BEGIN {
    @GreenstoneSQLPlugin::ISA = qw(GreenstoneXMLPlugin);
}
61
62# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin
63# So we won't have a process exp conflict here.
# Returns, as a string, the default regular expression that selects the files
# this plugin processes: names ending in "docsql.xml" or
# "docsql-<anything>.xml", matched case-insensitively.
# Any arguments (including an invocant) are ignored.
sub get_default_process_exp {
    # An earlier, stricter variant only allowed a numeric suffix:
    #   (?i)docsql(-\d+)?\.xml$
    my $default_exp = q{(?i)docsql(-.+)?\.xml$};
    return $default_exp;
}
70
# Values accepted by the process_mode option. The 'desc' entries look like
# keys into the shared string resources (presumably strings.properties).
my $process_mode_list = [
    {
        'name' => 'meta_only',
        'desc' => '{GreenstoneSQLPlug.process_mode.meta_only}',
    },
    {
        'name' => 'text_only',
        'desc' => '{GreenstoneSQLPlug.process_mode.text_only}',
    },
    {
        'name' => 'all',
        'desc' => '{GreenstoneSQLPlug.process_mode.all}',
    },
];

# process_exp: which files this plugin picks up (defaults to the docsql pattern).
my $process_exp_arg = {
    'name' => 'process_exp',
    'desc' => '{BaseImporter.process_exp}',
    'type' => 'regexp',
    'deft' => get_default_process_exp(),
    'reqd' => 'no',
};

# process_mode: whether metadata, full text, or both are read back from the db.
my $process_mode_arg = {
    'name' => 'process_mode',
    'desc' => '{GreenstoneSQLPlug.process_mode}',
    'type' => 'enum',
    'list' => $process_mode_list,
    'deft' => 'all',
    'reqd' => 'no',
};

# Argument list pushed onto the ArgList in new().
my $arguments = [ $process_exp_arg, $process_mode_arg ];

# Plugin description pushed onto the OptList in new().
my $options = {
    'name'     => 'GreenstoneSQLPlugin',
    'desc'     => '{GreenstoneSQLPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
98
99
100# TODO: For on cancel, add a SIGTERM handler or so to call end()
101# or to explicitly call gs_sql->close_connection if $gs_sql def
102
# Constructor.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the GreenstoneXMLPlugin constructor
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to its
#                      "ArgList" entry and its options to its "OptList" entry
#
# Returns the object created by the GreenstoneXMLPlugin constructor,
# re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # Register this plugin and its option tables before the base
    # constructor runs, since the same lists are handed to it below.
    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call. The indirect-object form
    # "new GreenstoneXMLPlugin(...)" is ambiguous inside a package that
    # defines its own new().
    my $self = GreenstoneXMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    $self = bless $self, $class;

    # If running pluginfo, we don't need to go further.
    return $self if $self->{'info_only'};

    # do anything else that needs to be done here when not pluginfo

    return $self;
}
125
126
127# TODO Q: Why are there 3 passes when we're only indexing at doc and section level (2 passes)?
128
129# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
130# the doc_obj in memory is processed (indexed) and then made undef.
131# So we have to work with doc_obj before superclass close_document() is finished.
# Adds the metadata and/or full text stored in the SQL database for the
# current document to the in-memory doc_obj, then hands over to the superclass.
#
# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
# the doc_obj in memory is processed (indexed) and then made undef, so all
# work on doc_obj has to be done before SUPER::close_document() is called.
#
# Reads from $self:
#   outhandle, verbosity - where and whether to print progress messages
#   doc_obj              - the document currently being built
#   gs_sql               - database wrapper stored by init()
#   doc_oid              - OID worked out from the filename in read()
#   process_mode         - "all", "meta_only" or "text_only"
#
# Returns whatever SUPER::close_document() returns.
#
# TODO: This function is called on a per doc.xml file basis, but all docs of a
# collection could be processed in one go when dealing with the SQL tables.
sub close_document {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $doc_obj   = $self->{'doc_obj'};
    my $gs_sql    = $self->{'gs_sql'};
    my $verbose   = ($self->{'verbosity'} > 1);

    my $oid = $self->{'doc_oid'}; # we stored current doc's OID during sub read()

    # Without an OID there is nothing to look up in the database: leave the
    # doc_obj exactly as it was read from the .xml file.
    if (!defined $oid) {
        print STDERR "GreenstoneSQLPlugin: WARNING: no OID was worked out from the docsql filename, not reading this document from the SQL db\n";
        return $self->SUPER::close_document(@_);
    }

    print $outhandle "==== OID of document (meta|text) to be read in from DB: $oid\n"
        if $verbose;

    # For now, we have access to doc_obj (until just before super::close_document() terminates)
    $doc_obj->set_OID($oid); # complex method. Is this necessary, since we just want to write meta and txt for the docobj to index?

    # checking that complicated looking method set_OID() hasn't modified oid
    if ($oid ne $doc_obj->get_OID()) {
        print STDERR "@@@@ WARNING: OID after setting on doc_obj = " . $doc_obj->get_OID() . " and is not the same as original OID $oid from docsqloid.xml filename\n";
    }

    my $proc_mode = $self->{'process_mode'};

    if ($proc_mode eq "all" || $proc_mode eq "meta_only") {
        # read in this document's rows from the collection's metadata table
        my $sth = $gs_sql->select_from_metatable_matching_docid($oid);
        print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $verbose;

        print $outhandle "----------SQL DB contains meta-----------\n" if $verbose;
        # Row layout: primary key (unused), doc id, section id, meta name, meta value.
        while ( my (undef, $did, $sid, $metaname, $metaval) = $sth->fetchrow_array() ) {

            # get rid of the artificial "root" introduced in section id when saving to sql db
            $sid =~ s@^root@@;
            $sid = $doc_obj->get_top_section() unless $sid;
            print $outhandle "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
                if $verbose;

            # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_meta directly:
            $doc_obj->add_utf8_metadata($sid, $metaname, docprint::unescape_text($metaval));
        }
        print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
            if $verbose;
    }

    if ($proc_mode eq "all" || $proc_mode eq "text_only") {
        # read in this document's rows from the collection's fulltxt table
        my $sth = $gs_sql->select_from_texttable_matching_docid($oid);
        print $outhandle "### stmt: ".$sth->{'Statement'}."\n" if $verbose;

        print $outhandle "----------\nSQL DB contains txt entries for-----------\n"
            if $verbose;
        # Row layout: primary key (unused), doc id, section id, section text.
        while ( my (undef, $did, $sid, $text) = $sth->fetchrow_array() ) {

            # get rid of the artificial "root" introduced in section id when saving to sql db
            $sid = $doc_obj->get_top_section() if ($sid eq "root");
            print $outhandle "### did: $did, sid: |$sid|, fulltext: <TXT NOT PRINTED>\n"
                if $verbose;

            # TODO - pass by ref?
            # TODO: we accessed the db in utf8 mode, so, we can call doc_obj->add_utf8_text directly:
            $doc_obj->add_utf8_text($sid, docprint::unescape_text($text));
        }
        print $outhandle "----------FIN READING DOC's TXT FROM SQL DB------------\n"
            if $verbose;
    }

    # don't forget to clean up on close() in superclass
    # It will get the doc_obj indexed then make it undef
    return $self->SUPER::close_document(@_);
}
218
219
220# TODO: only want to work with sql db if buildcol.pl. Unfortunately, also runs on import.pl
221# call init() not begin() because there can be multiple plugin passes
222# and init() should be called before all passes:
223# one for doc level and another for section level indexing
224# This way, we can connect to the SQL database once per buildcol run.
# Connects to the SQL database and stores the connection wrapper in
# $self->{'gs_sql'} for close_document() and deinit() to use.
#
# init() is used rather than begin() because there can be multiple plugin
# passes and init() should be called before all passes, so the database is
# connected to once per run.
#
# Dies if the connection cannot be made or the database cannot be selected.
# Returns the gssql object (value of the final assignment).
#
# TODO: only want to work with sql db if buildcol.pl. Unfortunately, also runs on import.pl
sub init {
    my ($self) = shift (@_);

    # super (GreenstoneXMLPlugin) will not yet be trying to read from
    # doc.xml (docsql .xml) files in init().
    $self->SUPER::init(@_);

    # TODO: how do we know what site we're dealing with unless this is passed in, by buildcol?
    # NOTE(review): the connection settings below, including the database
    # user and password, are hard-coded and overwrite any value already held
    # on $self. They should come from plugin options or configuration.
    ###########
    $self->{'db_driver'} = "mysql";
    $self->{'site_name'} = "localsite";
    $self->{'db_client_user'} = "root";
    $self->{'db_client_pwd'} = "6reenstone3";
    $self->{'build_mode'} = "removeold";
    $self->{'db_host'} = "127.0.0.1";
    $self->{'db_encoding'} = "utf8";
    ###########

    # Explicit class-method call instead of the indirect-object "new gssql(...)".
    my $gs_sql = gssql->new({
        'collection_name' => $ENV{'GSDLCOLLECTION'},
        'db_encoding' => $self->{'db_encoding'}
        #'db_name' => $self->{'site_name'},
        #'build_mode' => $self->{'build_mode'},
    });

    # try connecting to the mysql db
    my $connected = $gs_sql->connect_to_db({
        'db_driver' => $self->{'db_driver'},
        'db_client_user' => $self->{'db_client_user'},
        'db_client_pwd' => $self->{'db_client_pwd'},
        'db_host' => $self->{'db_host'}
    });
    if (!$connected) {
        # This is fatal for the plugin, let's terminate here
        # PrintError would already have displayed the warning message on connection fail
        die("Could not connect to db. Can't proceed.\n");
    }

    my $db_name = $self->{'site_name'} || "localsite"; # one database per GS3 site

    # the db and its tables should exist. Attempt to use the db:
    if (!$gs_sql->use_db($db_name)) {
        # This is fatal for the plugin, let's terminate here after disconnecting again
        # PrintError would already have displayed the warning message on load fail
        $gs_sql->disconnect_from_db()
            || warn("Unable to disconnect from database " . $db_name . "\n");
        die("Could not use db $db_name. Can't proceed.\n");
    }

    # store db handle now that we're connected
    $self->{'gs_sql'} = $gs_sql;
}
284
285# This method also runs on import.pl if gs_sql has a value. But we just want to run it on buildcol
286# Call deinit() not end() because there can be multiple plugin passes:
287# one for doc level and another for section level indexing
288# and deinit() should be called after all passes
289# This way, we can close the SQL database once per buildcol run.
# Tear-down counterpart of init(): disconnects from the SQL database if a
# connection wrapper was stored on $self, warns if the disconnect fails, and
# then runs the superclass deinit() with the remaining arguments.
# Returns whatever SUPER::deinit() returns.
sub deinit {
    my $self = shift;

    # gs_sql is only present once init() has connected successfully
    my $db = $self->{'gs_sql'};
    if ($db) {
        unless ($db->disconnect_from_db()) {
            warn("Unable to disconnect from database " . $self->{'site_name'} . "\n");
        }
    }

    return $self->SUPER::deinit(@_);
}
299
# Works out the document OID from a filename of the form "docsql-<OID>.xml",
# stores it in $self->{'doc_oid'} for close_document(), and then lets the
# superclass read the file.
#
# Parameters are those of the superclass read(); only the third one ($file)
# is inspected here, and the whole list is passed on unchanged.
# Returns whatever SUPER::read() returns.
#
# Why the OID comes from the filename: doc_obj doesn't exist yet and only
# exists during super::read() (created in super::open_document(), made undef
# again in super::close_document()). And if no meta was stored in the
# docsql .xml file, the doc_obj read in from it carries no OID either.
sub read {
    my $self = shift (@_);

    my (undef, undef, $file) = @_;

    # Forget the OID of any previously read document, so that a file which
    # does not follow the docsql-<OID>.xml convention can never be paired
    # with another document's database rows.
    $self->{'doc_oid'} = undef;

    # when running buildcol.pl, the filename should match "docsql-<OID>.xml"
    # when running import.pl it will be the original document's filename.
    # $file can have containing subfolders ("dir/docsql-<OID>.xml"), so the
    # OID capture is not allowed to span a path separator.
    if (defined $file && $file =~ m{docsql-([^/\\]+?)\.xml$}) {
        $self->{'doc_oid'} = $1;
    }

    # always read docsql.xml, as we then know doc structure, and assoc files are dealt with
    # Plus we need to read docsql.xml if either meta or fulltxt went into there instead of to sql db
    return $self->SUPER::read(@_); # will open_doc, close_doc then process doc_obj for indexing, then undef doc_obj
}
Note: See TracBrowser for help on using the repository browser.