source: main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm@ 32640

Last change on this file since 32640 was 32640, checked in by ak19, 5 years ago

Important changes (and commented out debugging statements) to get charset encodings (utf8 or specifically utf8mb4 in perl mysql case) to work with GS SQL Plugs on Windows. First, Strawberry Perl 5.22 was required and hence committed before this since Strawberry Perl 5.18 came with older mysql DBD/DBI packages that didn't decode utf8 content in the database when content was retrieved. Strawberry Perl 5.22 came with newer versions of DBD and DBI that do this automatically, as has been the case in Linux testing where Ubuntu had Perl 5.22 with sufficiently new versions of the DBI/DBD mysql perl packages. The newer Perl and specifically the newer MySQL DBD/DBI packages required some important changes to the gsmysql.pm code in the way charset encoding stuff is configured. This should work on Linux too, as that already allowed 2 ways to configure DB encoding stuff. I chose the single-line version on Linux, no longer supported with the DBI/DBD upgrade that comes with our new Strawberry Perl 5.22, so opting for the 2 line version to setup the DB encoding stuff which works on Windows and should continue to work on Linux too (where I had tested it before settling on the single-line variant).

File size: 12.7 KB
RevLine 
[32518]1###########################################################################
2#
[32527]3# GreenstoneSQLPlugout.pm -- plugout module for writing all or some the
4# Greenstone document format (metadata and/or fulltext) into a (My)SQL db.
[32526]5# The rest is then still written out by GreenstoneXMLPlugout as usual.
[32518]6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2006 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
[32527]28package GreenstoneSQLPlugout;
[32518]29
30use strict;
31no strict 'refs';
32no strict 'subs';
33
[32520]34use GreenstoneXMLPlugout;
[32518]35use docprint;
[32592]36use gsmysql;
[32518]37
[32524]38use DBI; # the central package for this plugout
39
[32640]40#use unicode;
[32518]41
[32595]42# This plugout does not output the metadata and/or fulltxt xml to a file,
43# but outputs rows into a MySQL db table for metadata and/or a db table for fulltxt
[32521]44
[32518]45sub BEGIN {
[32527]46 @GreenstoneSQLPlugout::ISA = ('GreenstoneXMLPlugout');
[32518]47}
48
[32520]49
[32518]50my $process_mode_list =
51 [ { 'name' => "meta_only",
[32537]52 'desc' => "{GreenstoneSQLPlug.process_mode.meta_only}" },
[32518]53 { 'name' => "text_only",
[32537]54 'desc' => "{GreenstoneSQLPlug.process_mode.text_only}" },
[32518]55 { 'name' => "all",
[32537]56 'desc' => "{GreenstoneSQLPlug.process_mode.all}" } ];
[32518]57
[32582]58my $rollback_on_cancel_list =
59 [ { 'name' => "true",
60 'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}" },
61 { 'name' => "false",
62 'desc' => "{GreenstoneSQLPlug.rollbacl_on_cancel}" } ];
63
[32541]64# The following are the saveas.options:
[32518]65my $arguments = [
[32541]66 { 'name' => "process_mode",
67 'desc' => "{GreenstoneSQLPlug.process_mode}",
68 'type' => "enum",
69 'list' => $process_mode_list,
70 'deft' => "all",
71 'reqd' => "no",
72 'hiddengli' => "no"},
[32582]73 { 'name' => "rollback_on_cancel",
74 'desc' => "{GreenstoneSQLPlug.rollback_on_cancel}",
75 'type' => "enum",
76 'list' => $rollback_on_cancel_list,
[32591]77 'deft' => "false", # better default than true
[32582]78 'reqd' => "no",
79 'hiddengli' => "no"},
[32541]80 { 'name' => "db_driver",
81 'desc' => "{GreenstoneSQLPlug.db_driver}",
82 'type' => "string",
83 'deft' => "mysql",
84 'reqd' => "yes"},
85 { 'name' => "db_client_user",
86 'desc' => "{GreenstoneSQLPlug.db_client_user}",
87 'type' => "string",
88 'deft' => "root",
89 'reqd' => "yes"},
90 { 'name' => "db_client_pwd",
91 'desc' => "{GreenstoneSQLPlug.db_client_pwd}",
92 'type' => "string",
93 'deft' => "",
[32591]94 'reqd' => "no"}, # pwd not required: can create mysql accounts without pwd
[32541]95 { 'name' => "db_host",
96 'desc' => "{GreenstoneSQLPlug.db_host}",
97 'type' => "string",
[32591]98 'deft' => "127.0.0.1", # localhost doesn't work for us, but 127.0.0.1 works. See gsmysql.pm
[32589]99 'reqd' => "yes"},
100 { 'name' => "db_port",
101 'desc' => "{GreenstoneSQLPlug.db_port}",
102 'type' => "string", # NOTE: make this int? No default for port, since it's not a required connection param
103 'reqd' => "no"}
[32541]104 ];
[32518]105
[32527]106my $options = { 'name' => "GreenstoneSQLPlugout",
107 'desc' => "{GreenstoneSQLPlugout.desc}",
[32518]108 'abstract' => "no",
109 'inherits' => "yes",
110 'args' => $arguments };
111
[32583]112##### This entire class is called only during import.pl #####
113
114##### Overridden methods #####
115
[32518]116sub new {
117 my ($class) = shift (@_);
118 my ($plugoutlist, $inputargs,$hashArgOptLists) = @_;
119 push(@$plugoutlist, $class);
120
121 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
122 push(@{$hashArgOptLists->{"OptList"}},$options);
123
124 my $self = new GreenstoneXMLPlugout($plugoutlist,$inputargs,$hashArgOptLists);
125
126 if ($self->{'info_only'}) {
127 # don't worry about any options etc
128 return bless $self, $class;
129 }
130
131 return bless $self, $class;
132}
133
[32520]134# connect here and ensure all tables and databases exist
135sub begin {
136
137 my $self= shift (@_);
[32580]138
[32529]139 my $db_params = {
[32560]140 'collection_name' => $ENV{'GSDLCOLLECTION'},
[32580]141 'verbosity' => $self->{'verbosity'} || 0
[32582]142
[32529]143 };
144
[32592]145 my $gs_sql = new gsmysql($db_params);
[32582]146
147 # if autocommit is set, there's no rollback support
148 my $autocommit = ($self->{'rollback_on_cancel'} eq "false") ? 1 : 0;
[32520]149
[32578]150 # try connecting to the mysql db, die if that fails
151 # So don't bother preparing GreenstoneXMLPlugout by calling superclass' begin() yet
[32530]152 if(!$gs_sql->connect_to_db({
153 'db_driver' => $self->{'db_driver'},
154 'db_client_user' => $self->{'db_client_user'},
155 'db_client_pwd' => $self->{'db_client_pwd'},
[32582]156 'db_host' => $self->{'db_host'},
[32589]157 'db_port' => $self->{'db_port'}, # undef by default, can leave as is
[32582]158 'autocommit' => $autocommit
[32530]159 })
160 )
161 {
[32520]162 # This is fatal for the plugout, let's terminate here
163 # PrintError would already have displayed the warning message on connection fail
164 die("Could not connect to db. Can't proceed.\n");
165 }
[32582]166
[32586]167 my $db_name = $self->{'site'} || "greenstone2"; # one database per GS3 site, for GS2 the db is called greenstone2
[32563]168 my $proc_mode = $self->{'process_mode'};
[32555]169
[32563]170
171 my $success = $gs_sql->use_db($db_name);
172
173 if($success && $proc_mode ne "text_only") {
174 $success = $gs_sql->ensure_meta_table_exists();
175 }
176 if($success && $proc_mode ne "meta_only") {
177 $success = $gs_sql->ensure_fulltxt_table_exists();
178 }
[32573]179
[32563]180 if(!$success) {
[32536]181 # This is fatal for the plugout, let's terminate here after disconnecting again
182 # PrintError would already have displayed the warning message on load fail
[32592]183 # And on die() perl will call gsmysql destroy which will ensure a disconnect() from db
[32536]184 die("Could not use db $db_name and/or prepare its tables. Can't proceed.\n");
[32520]185 }
186
[32529]187 # store the DBI wrapper instance
188 $self->{'gs_sql'} = $gs_sql;
189
[32524]190
[32595]191 # If setting up connection to sql db failed, we'd have terminated and wouldn't come up to here
192 # and wouldn't have bothered preparing GreenstoneXMLPlugout by calling superclass' begin().
193 # Finally, can call begin on super - important as doc.xml is opened as a group etc
[32523]194
[32526]195 $self->SUPER::begin(@_);
[32520]196}
197
198# disconnect from database here, see inexport.pm
199sub end
200{
201 my $self = shift(@_);
202
[32521]203 # do the superclass stuff first, as any sql db failures should not prevent superclass cleanup
[32526]204 $self->SUPER::end(@_);
[32583]205
206 # Important to call finished():
[32592]207 # it will disconnect from db if this is the last gsmysql instance,
[32583]208 # and it will commit to db before disconnecting if rollbback_on_cancel turned on
209 $self->{'gs_sql'}->finished();
[32578]210 delete $self->{'gs_sql'}; # key gs_sql no longer exists, not just the value being undef
[32520]211}
[32533]212
[32542]213# Produce files called docsql.xml instead of doc.xml
[32536]214sub get_doc_xml_filename {
[32533]215 my $self = shift (@_);
[32536]216 my ($doc_obj) = @_;
217
[32542]218 return "docsql.xml";
[32533]219}
[32542]220
221# overriding to store doc OID as attribute of top level element: <Archive docoid="oid">
222sub output_xml_header {
223 my $self = shift (@_);
224 my ($outhandle, $doc_oid) = @_;
225
226 print $outhandle '<?xml version="1.0" encoding="utf-8" standalone="no"?>' . "\n";
227 print $outhandle "<!DOCTYPE Archive SYSTEM \"http://greenstone.org/dtd/Archive/1.0/Archive.dtd\">\n";
228 print $outhandle "<Archive docoid=\"$doc_oid\">\n";
229}
230
[32595]231# saveas() only generates the content in archives dir and in the SQL database
[32518]232sub saveas {
233 my $self = shift (@_);
234 my ($doc_obj, $doc_dir) = @_;
235
[32522]236 my $proc_mode = $self->{'process_mode'};
237
[32521]238 # 1. pre save out and saving debug handle
239
[32523]240 # must call superclass (pre/post) saveas methods, as they handle assoc_files too
[32526]241 my ($docxml_outhandler, $output_file) = $self->SUPER::pre_saveas(@_);
[32523]242
243 $self->{'debug_outhandle'} = $docxml_outhandler if ($self->{'debug'}); # STDOUT if debug
244
[32543]245 # 2. overriding saving behaviour to do what the superclass does (writing out doc.xml files,
246 # under new name of docsql.xml, with breadcrumbs pointing to sql db) PLUS saving to sql db
[32518]247
[32543]248 # NOTE: if proc_mode == all, then "breadcrumbs" (statements pointing viewer to the sql db
249 # for contents) go into both meta and txt elements of doc.xml (docsql.xml specifically):
[32522]250
[32543]251 # write the INVERSE into doc.xml as to what is written to the SQL db
[32523]252 my $docxml_output_options = { 'output' => docprint::OUTPUT_NONE };
253 if($proc_mode eq "meta_only" ) { # since only meta to go into MySQL db, text will go into docxml
254 $docxml_output_options->{'output'} = docprint::OUTPUT_TEXT_ONLY;
255 } elsif($proc_mode eq "text_only" ) { # since only full text to go into MySQL db, meta will go into docxml
256 $docxml_output_options->{'output'} = docprint::OUTPUT_META_ONLY;
[32518]257 }
[32521]258
[32523]259 # now we've prepared to write out whatever is meant to go into docxml
260 # and can do actual the steps superclass GreenstoneXMLPlugout carries out to write out docxml
[32543]261 # So: write out the doc xml file, "docsql.xml", for the current document
[32523]262 my $section_text = &docprint::get_section_xml($doc_obj, $docxml_output_options);
263 print $docxml_outhandler $section_text;
264
265
[32518]266 # We also write out whatever needs to go into the MySQL database
267 $self->write_meta_and_text($doc_obj);
268
[32520]269
[32583]270 # 3. post save out
[32526]271 $self->SUPER::post_saveas($doc_obj, $doc_dir, $docxml_outhandler, $output_file);
[32521]272
[32523]273
[32595]274 # database connection is closed once, in end() method
275 # We're not opening and closing over and over for each doc during a single build
[32518]276}
277
[32583]278##### New methods, not inherited #####
[32518]279
[32520]280# write meta and/or text PER DOC out to DB
[32518]281sub write_meta_and_text {
282 my $self = shift (@_);
283 my ($doc_obj) = @_;
[32531]284 my $doc_oid = $doc_obj->get_OID(); # this method processes a single doc at a time, so it uses the same OID throughout
[32518]285 my $root_section = $doc_obj->get_top_section();
286
[32573]287 $self->recursive_write_meta_and_text($doc_obj, $doc_oid, $root_section);
[32521]288}
[32518]289
290sub recursive_write_meta_and_text {
[32520]291 my $self = shift (@_);
[32573]292 my ($doc_obj, $doc_oid, $section) = @_;
[32520]293
294 # If section=ROOT, write "root" as section name into table
295 # doc->get_top_section() is the name of the doc root section, which is ""
296 my $section_name = ($section eq "") ? "root" : $section;
[32518]297
298 my $section_ptr = $doc_obj->_lookup_section ($section);
299 return "" unless defined $section_ptr;
300
[32520]301 my $debug_out = $self->{'debug_outhandle'};
[32573]302
303 my $gs_sql = $self->{'gs_sql'};
304 my $proc_mode = $self->{'process_mode'};
305 if($proc_mode eq "all" || $proc_mode eq "meta_only" ) {
[32520]306
[32640]307 ##binmode(STDERR, ":utf8"); # shouldn't be necessary as we call &unicode::utf8decomp() to avoid wide-character warnings by printing wide chars as unicode codepoints
308 #print STDERR "###### dc.Title: ".&unicode::utf8decomp($doc_obj->get_metadata_element($doc_obj->get_top_section(), "dc.Title"))."\n";
309
[32520]310 foreach my $data (@{$section_ptr->{'metadata'}}) {
311 my $meta_name = $data->[0];
[32591]312
313 # Treat db like a text file instead of an html/xml file: don't need to escape text
[32595]314 # going into it, unlike with doc(sql).xml
[32591]315 my $meta_value = $data->[1];
[32640]316
[32520]317 # Write out the current section's meta to collection db's METADATA table
318
[32573]319 # For each set of values to write to meta table, this next method call will
320 # efficiently execute an insert SQL statement (using a prepared insert statement),
321 # filling in the values
322 # OR if debugging, then it will print the SQL insert statement but not execute it
[32595]323 # (a behaviour following what the GS XML Plugout superclass does on debug)
[32580]324
[32591]325 $gs_sql->insert_row_into_metadata_table($doc_oid, $section_name, $meta_name, $meta_value, $self->{'debug'});
[32580]326 }
[32518]327 }
[32520]328
[32573]329
330 if($proc_mode eq "all" || $proc_mode eq "text_only" ) {
[32591]331
332 # See above, no need to html-escape for db
[32595]333 my $section_text = $section_ptr->{'text'};
[32573]334
[32595]335 # fulltxt column can be SQL NULL. undef value for $section_text gets written out as NULL:
[32573]336 # https://stackoverflow.com/questions/12708633/which-one-represents-null-undef-or-empty-string
337 # The following will do the SQL insertion
338 # or if debug, the following will print the SQL insert stmt without executing it
[32591]339 $gs_sql->insert_row_into_fulltxt_table($doc_oid, $section_name, \$section_text, $self->{'debug'});
[32573]340
[32518]341 }
342
343 # output all subsections: RECURSIVE CALL
344 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
[32573]345 $self->recursive_write_meta_and_text($doc_obj, $doc_oid, "$section.$subsection");
[32518]346 }
347}
348
349
3501;
Note: See TracBrowser for help on using the repository browser.