root/main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm @ 32595

Revision 32595, 17.0 KB (checked in by ak19, 6 months ago)

Major tidying up: last remaining debug statements, lots of comments, removed TODO lists.

Line 
1###########################################################################
2#
3# GreenstoneSQLPlugin.pm -- reads into doc_obj from SQL db and docsql.xml
4# Metadata and/or fulltext are stored in SQL db, the rest may be stored in
5# the docsql .xml files.
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package GreenstoneSQLPlugin;
29
30
31use strict;
32no strict 'refs'; # allow filehandles to be variables and viceversa
33
34use DBI;
35use docprint; # for new unescape_text() subroutine
36use GreenstoneXMLPlugin;
37use gsmysql;
38
39
40
41########################################################################################
42
43# GreenstoneSQLPlugin inherits from GreenstoneXMLPlugin so that if meta or fulltext
44# is still written out to doc.xml (docsql .xml), that will be processed as usual,
45# whereas GreenstoneSQLPlugin will process all the rest (full text and/or meta, whichever
46# is written out by GreenstoneSQLPlugout into the SQL db).
47
48
# Set up inheritance at compile time: GreenstoneSQLPlugin is a subclass of
# GreenstoneXMLPlugin, so anything still present in the docsql .xml files is
# handled by the parent class.
BEGIN {
    @GreenstoneSQLPlugin::ISA = qw(GreenstoneXMLPlugin);
}
52
53# This plugin must be in the document plugins pipeline IN PLACE OF GreenstoneXMLPlugin
54# So we won't have a process exp conflict here.
55# The structure of docsql.xml files is identical to doc.xml and the contents are similar except:
56#   - since metadata and/or fulltxt are stored in mysql db instead, just XML comments indicating
57#   this are left inside docsql.xml within the <Description> (for meta) and/or <Content> (for txt)
58#   - the root element Archive now has a docoid attribute: <Archive docoid="OID">
# Returns the default process_exp for this plugin, as a regex source string.
# It matches filenames ending in docsql.xml or docsql-<digits>.xml,
# case-insensitively. The pattern is modelled on the method of the same name
# in GreenstoneXMLPlugin.
sub get_default_process_exp {
    my ($self) = @_;

    my $docsql_filename_re = q^(?i)docsql(-\d+)?\.xml$^;
    return $docsql_filename_re;
}
64
# Allowed values for the 'process_mode' enum option. Each 'desc' is a
# resource key (presumably resolved to display text elsewhere -- confirm).
my $process_mode_list = [
    {   'name' => 'meta_only',
        'desc' => '{GreenstoneSQLPlug.process_mode.meta_only}',
    },
    {   'name' => 'text_only',
        'desc' => '{GreenstoneSQLPlug.process_mode.text_only}',
    },
    {   'name' => 'all',
        'desc' => '{GreenstoneSQLPlug.process_mode.all}',
    },
];
72
# Allowed values for the 'rollback_on_cancel' enum option.
# Both entries use the same description key; the "false" entry previously
# had the key misspelt as "rollbacl_on_cancel".
my $rollback_on_cancel_list =
    [ { 'name' => "true",
        'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}" },
      { 'name' => "false",
        'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}" } ];
78
79# NOTE: If subclassing gsmysql for other supporting databases and if they have different required
80# connection parameters, we can check how WordPlugin, upon detecting Word is installed,
81# dynamically loads Word specific configuration options.
# Option descriptors for this plugin. new() appends them to the shared
# "ArgList". Each entry gives the option name, a description resource key,
# its type, an optional default ('deft') and whether it is required ('reqd').
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no" },
      { 'name' => "process_mode",
        'desc' => "{GreenstoneSQLPlug.process_mode}",
        'type' => "enum",
        'list' => $process_mode_list,
        'deft' => "all",
        'reqd' => "no"},
      { 'name' => "rollback_on_cancel",
        'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}",
        'type' => "enum",
        'list' => $rollback_on_cancel_list,
        'deft' => "false", # better default than true
        'reqd' => "no",
        'hiddengli' => "no"},
      { 'name' => "db_driver",
        'desc' => "{GreenstoneSQLPlug.db_driver}",
        'type' => "string",
        'deft' => "mysql",
        'reqd' => "yes"},
      { 'name' => "db_client_user",
        'desc' => "{GreenstoneSQLPlug.db_client_user}",
        'type' => "string",
        'deft' => "root",
        'reqd' => "yes"},
      { 'name' => "db_client_pwd",
        'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
        'type' => "string",
        'deft' => "",
        'reqd' => "no"}, # pwd not required: can create mysql accounts without pwd
      { 'name' => "db_host",
        'desc' => "{GreenstoneSQLPlug.db_host}",
        'type' => "string",
        'deft' => "127.0.0.1", # default to the loopback address (local machine)
        'reqd' => "yes"},
      { 'name' => "db_port",
        'desc' => "{GreenstoneSQLPlug.db_port}",
        'type' => "string", # NOTE: make this int? No default for port, since it's not a required connection param
        'reqd' => "no"}
    ];
126
# Plugin descriptor that new() appends to the shared "OptList": the plugin's
# name, its description resource key, and the option list declared above.
my $options = {
    'name'     => 'GreenstoneSQLPlugin',
    'desc'     => '{GreenstoneSQLPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
132
133
134###### Methods called during buildcol and import #######
135
# Constructor.
#   $pluginlist      - arrayref of plugin class names; this class is appended
#   $inputargs       - passed through untouched to the superclass constructor
#   $hashArgOptLists - hashref whose "ArgList" and "OptList" arrays receive
#                      this plugin's option descriptors
# Returns the object built by GreenstoneXMLPlugin, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct method call rather than the ambiguous indirect object syntax
    # ("new GreenstoneXMLPlugin(...)").
    my $self = GreenstoneXMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);
    $self = bless $self, $class;

    # Nothing further is needed here, whether or not we are running under
    # pluginfo ($self->{'info_only'}). Any setup that must be skipped for
    # pluginfo should go below, guarded by a check of that flag.
    return $self;
}
158
159# Call init() not begin() because there can be multiple plugin passes and begin() called for
160# each pass (one for doc level and another for section level indexing), whereas init() should
161# be called before any and all passes.
162# This way, we can connect to the SQL database once per buildcol run.
163# Although now it doesn't matter, since gsmysql.pm uses the get_instance pattern to return a
164# singleton db connection, regardless of the number of gsmysql objects instantiated and
165# the number of connect() calls made on them.
# Connects to the SQL server and selects the database, then stores the
# gsmysql object in $self->{'gs_sql'} for the other methods to use.
# Any arguments are passed straight on to the superclass init().
# Dies if the connection cannot be made or the database cannot be used.
sub init {
    my ($self) = shift (@_);

    # super (GreenstoneXMLPlugin) will not yet be trying to read from
    # doc.xml (docsql .xml) files in init().
    $self->SUPER::init(@_);

    # Create the gsmysql object.
    # Collection name will be used for naming tables (site name will be used
    # for naming the database).
    # Direct method call rather than indirect object syntax ("new gsmysql(...)").
    my $gs_sql = gsmysql->new({
        'collection_name' => $ENV{'GSDLCOLLECTION'},
        'verbosity'       => $self->{'verbosity'} || 0
    });

    # if autocommit is set, there's no rollback support
    my $autocommit = ($self->{'rollback_on_cancel'} eq "false") ? 1 : 0;

    # try connecting to the mysql db, die if that fails
    my $connected = $gs_sql->connect_to_db({
        'db_driver'      => $self->{'db_driver'},
        'db_client_user' => $self->{'db_client_user'},
        'db_client_pwd'  => $self->{'db_client_pwd'},
        'db_host'        => $self->{'db_host'},
        'db_port'        => $self->{'db_port'}, # undef by default, can leave as is
        'autocommit'     => $autocommit
    });
    if (!$connected) {
        # This is fatal for the plugin, so terminate here.
        # Per the original notes, PrintError will already have displayed the
        # warning message on connection failure.
        die("Could not connect to db. Can't proceed.\n");
    }

    # one database per GS3 site, for GS2 the db is called greenstone2
    my $db_name = $self->{'site'} || "greenstone2";

    # Attempt to use the db, create it if it doesn't exist (but don't create
    # the tables yet). Bail if we can't use the database.
    if (!$gs_sql->use_db($db_name)) {
        # This is fatal for the plugin, so terminate here.
        # Per the original notes, PrintError will already have displayed the
        # warning on load failure, and on die() perl will call gsmysql's
        # destroy, which ensures a disconnect() from the db.
        die("Could not use db $db_name. Can't proceed.\n");
    }

    # store db handle now that we're connected
    $self->{'gs_sql'} = $gs_sql;
}
214
215
216# This method also runs on import.pl if gs_sql has a value.
217# Call deinit() not end() because there can be multiple plugin passes:
218# one for doc level and another for section level indexing
219# and deinit() should be called after all passes
220# This way, we can close the SQL database once per buildcol run.
221# Again, this doesn't matter because gsmysql ensures the connection
222# is a singleton connection instance, which connects once and disconnects once per perl process.
# Releases this plugin's database handle, if it has one, then hands over to
# the superclass deinit() with the same arguments.
sub deinit {
    my $self = shift(@_);

    my $has_connection = $self->{'gs_sql'} ? 1 : 0;
    if ($has_connection) {
        # finished() must be called. Per the original notes it disconnects
        # from the db if this is the last gsmysql instance, committing first
        # when rollback_on_cancel is turned on.
        $self->{'gs_sql'}->finished();

        # Delete the key rather than setting it to undef (which would leave
        # the key in place with an undefined value), so that any later use
        # has to make the connection again.
        delete $self->{'gs_sql'};
    }

    return $self->SUPER::deinit(@_);
}
241
242
243
244###### Methods only called during import.pl #####
245
246# This is called once if removeold is set with import.pl. Most plugins will do
247# nothing but if a plugin does any stuff outside of creating doc obj, then
248# it may need to clear something.
249# In the case of GreenstoneSQL plugs: this is the first time we have a chance
250# to purge the tables of the current collection from the current site's database
# Called once when import.pl is run with removeold set.
# Arguments (pluginfo, base_dir, processor, maxdocs) are not used here; they
# are passed on unchanged to the superclass remove_all().
# Drops the current collection's tables and then makes sure the tables
# required by the configured process_mode exist again.
sub remove_all {
    my $self = shift(@_);

    $self->SUPER::remove_all(@_);

    if ($self->{'verbosity'}) {
        print STDERR "   Building with removeold option set, so deleting current collection's tables if they exist\n";
    }

    # By now init() has already selected the site's database, so the
    # collection's tables can be deleted straight away (if they exist).
    my $db = $self->{'gs_sql'};
    $db->delete_collection_tables();

    # Re-create only the tables that the configured mode makes use of.
    my $mode = $self->{'process_mode'};
    $db->ensure_meta_table_exists() if $mode ne "text_only";
    $db->ensure_fulltxt_table_exists() if $mode ne "meta_only";
}
274
275# This is called during import.pl per document for docs that have been deleted from the
276# collection. Most plugins will do nothing
277# but if a plugin does any stuff outside of creating doc obj, then it may need
278# to clear something.
279# In the case of GreenstoneSQL plugs: Remove the doc(s) denoted by oids from GS SQL db.
280# This takes care of incremental deletes (docs marked D by ArchivesInfPlugin when building
281# incrementally) as well as cases of "Non-icremental Delete", see ArchivesInfPlugin.pm.
282# As well as cases involving reindexing, which are implemented here as delete followed by add.
# Called by import.pl for each document that has been deleted from the
# collection (or is being reindexed, which is done as delete then add).
#   $file       - name of the file being removed
#   $oids       - arrayref of document OIDs to purge from the SQL db
#   $archivedir - not used here; passed on to the superclass
# Returns the superclass remove_one() result, or 0 when there is no db handle.
sub remove_one {
    my $self = shift (@_);

    my ($file, $oids, $archivedir) = @_;

    my $rv = $self->SUPER::remove_one(@_);

    # Debug trace: gated on verbosity like the other debug output here, so it
    # no longer prints on every single removal.
    print STDERR "@@@ IN SQLPLUG::REMOVE_ONE: $file\n" if $self->{'verbosity'} > 2;

    # Deliberately NO can_process_this_file($file) check here (unlike the
    # inherited remove_one behaviour): it doesn't matter that the deleted
    # file was, say, an image. What matters is removing that document's OID
    # from the SQL db.

    # couldn't make the connection or no db etc
    my $gs_sql = $self->{'gs_sql'};
    return 0 unless $gs_sql;

    my $proc_mode = $self->{'process_mode'};
    my $uses_meta = ($proc_mode eq "all" || $proc_mode eq "meta_only");
    my $uses_text = ($proc_mode eq "all" || $proc_mode eq "text_only");

    foreach my $oid (@$oids) {
        if ($uses_meta) {
            print STDERR "@@@@@@@@ Deleting $oid from meta table\n" if $self->{'verbosity'} > 2;
            $gs_sql->delete_recs_from_metatable_with_docid($oid);
        }
        if ($uses_text) {
            print STDERR "@@@@@@@@ Deleting $oid from fulltxt table\n" if $self->{'verbosity'} > 2;
            $gs_sql->delete_recs_from_texttable_with_docid($oid);
        }
    }
    return $rv;
}
313
314##### Methods called only during buildcol #####
315
# XML start-tag handler.
#   $expat   - parser object (unused here; passed on to the superclass)
#   $element - name of the element being opened
# For the root <Archive> element it records the document OID from the
# 'docoid' attribute. Every other element (<Section>, <Metadata>, ...) is
# left to GreenstoneXMLPlugin.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $outhandle = $self->{'outhandle'};
    $self->{'element'} = $element;

    # Anything other than <Archive> is the superclass's business.
    return $self->SUPER::xml_start_tag(@_) unless $element eq "Archive";

    # The element's attributes arrive in the global hash %_ (as per
    # ReadXMLFile::xml_start_tag(); $_ holds the tag). Copy the hash first
    # and read 'docoid' from the copy, the same way OAIPlugin.pm does it.
    my %attributes = %_;
    $self->{'doc_oid'} = $attributes{'docoid'};

    print {$outhandle} "Extracted OID from docsql.xml: " . $self->{'doc_oid'} . "\n"
        if $self->{'verbosity'} > 2;
}
340
341# There are multiple passes processing the document (see buildcol's mode parameter description):
342# - compressing the text which may be a dummy pass for lucene/solr, wherein they still want the
343# docobj for different purposes,
344# - the pass(es) for indexing, e.g. doc/didx and section/sidx level passes
345# - and an infodb pass for processing the classifiers. This pass too needs the docobj
346# Since all passes need the doc_obj, all are read in from docsql + SQL db into the docobj in memory
347
348# We only ever get here or do any parsing of the docsql.xml file during the buildcol.pl phase.
349# At the end of superclass GreenstoneXMLPlugin.pm's close_document() method,
350# the doc_obj in memory is processed (indexed) and then made undef.
351# So we have to work with doc_obj before superclass close_document() is finished.
# Called when parsing of a docsql.xml file ends (buildcol.pl phase only).
# Fills the in-memory doc_obj with the metadata and/or full text stored in
# the SQL db for the current OID, according to process_mode, and then hands
# over to the superclass close_document() with the same arguments.
# This has to happen BEFORE the superclass call: per the original notes,
# the superclass processes (indexes) doc_obj and then makes it undef.
sub close_document {
    my $self = shift(@_);

    my $gs_sql    = $self->{'gs_sql'};
    my $outhandle = $self->{'outhandle'};
    my $doc_obj   = $self->{'doc_obj'};

    my $oid       = $self->{'doc_oid'}; # stored by xml_start_tag()
    my $proc_mode = $self->{'process_mode'};
    my $debug     = $self->{'verbosity'} > 2;

    # The OID parsed from docsql.xml must be set on the doc_obj. When a doc
    # in import is renamed and an incremental import follows, it is marked
    # for reindexing (implemented by this plugin as delete followed by add).
    # Unless the OID is set here, the old deleted doc id continues to be
    # listed in the index after incremental rebuilding.
    $doc_obj->set_OID($oid);

    print STDERR "   GreenstoneSQLPlugin processing doc $oid (reading into docobj from SQL db)\n"
        if $self->{'verbosity'};

    if ($proc_mode eq "all" || $proc_mode eq "meta_only") {
        # read in this doc's meta from the collection's metadata table
        my $records = $gs_sql->select_from_metatable_matching_docid($oid, $outhandle);

        print {$outhandle} "----------SQL DB contains meta-----------\n" if $debug;

        foreach my $row (@$records) {
            my ($primary_key, $did, $sid, $metaname, $metaval) = @$row;

            # get rid of the artificial "root" introduced in section id when saving to sql db
            $sid =~ s@^root@@;
            $sid = $doc_obj->get_top_section() unless $sid;
            print {$outhandle} "### did: $did, sid: |$sid|, meta: $metaname, val: $metaval\n"
                if $debug;

            # We're only dealing with utf8 data where docobj is concerned.
            # Data stored unescaped in db: escaping only for html/xml files, not for txt files or db
            $doc_obj->add_utf8_metadata($sid, $metaname, $metaval);
        }
        print {$outhandle} "----------FIN READING DOC's META FROM SQL DB------------\n"
            if $debug;
    }

    if ($proc_mode eq "all" || $proc_mode eq "text_only") {
        # read in this doc's fulltxt from the collection's fulltxt table
        # (the table name was previously looked up into a local that was
        # never used; that dead lookup has been dropped)
        my $records = $gs_sql->select_from_texttable_matching_docid($oid, $outhandle);

        print {$outhandle} "----------\nSQL DB contains txt entries for-----------\n"
            if $debug;

        foreach my $row (@$records) {
            my ($primary_key, $did, $sid, $text) = @$row;

            # get rid of the artificial "root" introduced in section id when saving to sql db
            $sid = $doc_obj->get_top_section() if ($sid eq "root");
            print {$outhandle} "### did: $did, sid: |$sid|, fulltext: <TXT>\n"
                if $debug;

            # We're only dealing with utf8 data where docobj is concerned.
            # Data stored unescaped in db: escaping is only for html/xml files, not for txt files or db
            $doc_obj->add_utf8_textref($sid, \$text);
        }
        print {$outhandle} "----------FIN READING DOC's TXT FROM SQL DB------------\n"
            if $debug;
    }

    # Done reading into docobj from SQL db. The superclass now gets the
    # doc_obj indexed and then makes it undef.
    $self->SUPER::close_document(@_);
}
436
437
4381;
Note: See TracBrowser for help on using the browser.