- Timestamp:
- 2018-11-29T21:23:49+13:00 (5 years ago)
- Location:
- main/trunk/greenstone2/perllib
- Files:
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/gsmysql.pm
r32595 r32640 208 208 $params_map->{'db_encoding'} = $self->{'db_encoding'}; 209 209 $params_map->{'verbosity'} = $self->{'verbosity'}; 210 210 211 211 $self->{'db_handle'} = &_get_connection_instance($params_map); # getting singleton (class method) 212 212 if($self->{'db_handle'}) { … … 240 240 241 241 # For proper utf8 support in MySQL, encoding should be 'utf8mb4' as 'utf8' is insufficient 242 my $db_enc = "utf8mb4" if $params_map->{'db_encoding'} eq "utf8";242 my $db_enc = ($params_map->{'db_encoding'} eq "utf8") ? "utf8mb4" : $params_map->{'db_encoding'}; 243 243 244 244 # Params for connecting to MySQL … … 287 287 RaiseError => 0, # off by default, but being explicit 288 288 AutoCommit => $autocommit, 289 mysql_enable_utf8mb4 => 1 # tells MySQL to use (4 byte) UTF-8 for 290 # communication and tells DBD::mysql to use it to decode the data, 289 #mysql_enable_utf8mb4 => 1 # Unrecognised in Windows' Strawberry Perl 5.22. On Ubuntu perl 5.22 290 # if mysql_enable_utf8mb4 was set to 1 *during connect()*, this tells MySQL to use (4 byte) UTF-8 for 291 # communication AND tells DBD::mysql to use it to decode the data, 291 292 # see https://stackoverflow.com/questions/46727362/perl-mysql-utf8mb4-issue-possible-bug 292 293 }); … … 322 323 # (see https://www.perlmonks.org/?node_id=259456) 323 324 324 #my $stmt = "set NAMES '" . $db_enc . "'"; 325 #$dbh->do($stmt) || warn("Unable to set charset encoding at db server level to: " . $db_enc . "\n"); # tells MySQL to use UTF-8 for communication 326 #$dbh->{mysql_enable_utf8mb4} = 1; # tells DBD::mysql to decode the data 325 # For Strawberry Perl 5.22, need to the following 2 steps here to take care of encoding issues 326 # The 2 steps are: set names utf8mb4; AND setting mysql_enable_utf8mb4 = 1 327 # Doing this in 2 steps here also works on Ubuntu perl 5.22, 328 # but that also supported setting mysql_enable_utf8mb4 to 1 during connect() 329 # to do both these steps in one go 330 my $stmt = "set NAMES '" . $db_enc . 
"'"; 331 $dbh->do($stmt) || warn("Unable to set charset encoding at db server level to: " . $db_enc . "\n"); # tells MySQL to use UTF-8 for communication 332 $dbh->{mysql_enable_utf8mb4} = 1; # tells DBD::mysql to decode the data 327 333 328 334 # if we're here, then connection succeeded, store handle -
main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm
r32595 r32640 37 37 use gsmysql; 38 38 39 39 #use unicode; 40 #use Encode; 40 41 41 42 ######################################################################################## … … 370 371 # browse the rebuilt collection by files/titles. So unless you set the OID here, the deleted 371 372 # doc oids will still be listed in the index. 372 $self->{'doc_obj'}->set_OID($oid);373 373 374 374 print STDERR " GreenstoneSQLPlugin processing doc $oid (reading into docobj from SQL db)\n" … … 385 385 my ($primary_key, $did, $sid, $metaname, $metaval) = @$row; 386 386 387 # don't allow duplicate toplevel OID, as can happen when a doc has been renamed and requires reindexing 388 # TODO: any other meta that should not be duplicated, but can have been changed between rebuilds so that we need to call set_meta instead of add_meta? 389 # e.g. FileSize, SourceFile. But Language can be multiple, so gs meta is not guaranteed to be unique either. Whereas of dc metadata we know 390 # that some if not all should allow multiple entires for the same meta name, e.g. dc.Author/dc.Creator 391 if($sid =~ m@^root@ && $metaname eq "Identifier") { 392 # doc_obj's toplevel Identifier metadata is a special case: 393 # it should have only one value, so instead of add_meta() that will allow old Identifier meta to linger 394 # Need to do set_meta(). We then break out of the loop, to prevent duplicates (older values from DB) to be inserted for Identifier into doc_obj 395 # Handles the case where a doc was renamed and rebuilding triggers re-indexing case: old identifier is now overwritten with new one 396 $self->{'doc_obj'}->set_OID($oid); # calls doc_obj->set_metadata_element(top_section, Identifier, $oid). Sets OID if one doesn't exist. 397 next; # ensures Identifier set only once, and ensure Identifier is set to current docOID for the doc, a.o.t. allowing it to be set to any expired docOID from before a doc got renamed. 
398 } 399 400 # process all other metadata the normal way: 401 387 402 # get rid of the artificial "root" introduced in section id when saving to sql db 388 403 $sid =~ s@^root@@; … … 395 410 $doc_obj->add_utf8_metadata($sid, $metaname, $metaval); 396 411 } 412 413 # UTF8 debugging, e.g. if we have a with macron in dc.Title 414 ##binmode(STDERR, ":utf8"); # not necessary when calling utf8decomp to print wide characters as unicode codept, avoiding 'wide character' warnings. 415 #my $dcTitle = $doc_obj->get_metadata_element($doc_obj->get_top_section(), "dc.Title"); 416 #print STDERR "###### dc.Title: ".&unicode::utf8decomp($dcTitle)."\n"; 417 #print STDERR "###### DECODED dc.Title: ".&unicode::utf8decomp(&Encode::decode_utf8($dcTitle))."\n"; # decoding was needed for perl 5.18 418 397 419 print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n" 398 420 if $self->{'verbosity'} > 2; 399 421 } 400 422 401 423 if($proc_mode eq "all" || $proc_mode eq "text_only") { 402 424 # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table … … 429 451 430 452 # done reading into docobj from SQL db 431 453 432 454 # don't forget to clean up on close() in superclass 433 455 # It will get the doc_obj indexed then make it undef -
main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm
r32595 r32640 38 38 use DBI; # the central package for this plugout 39 39 40 #use unicode; 40 41 41 42 # This plugout does not output the metadata and/or fulltxt xml to a file, … … 304 305 if($proc_mode eq "all" || $proc_mode eq "meta_only" ) { 305 306 307 ##binmode(STDERR, ":utf8"); # shouldn't be necessary as we call &unicode::utf8decomp() to avoid wide-character warnings by printing wide chars as unicode codepoints 308 #print STDERR "###### dc.Title: ".&unicode::utf8decomp($doc_obj->get_metadata_element($doc_obj->get_top_section(), "dc.Title"))."\n"; 309 306 310 foreach my $data (@{$section_ptr->{'metadata'}}) { 307 311 my $meta_name = $data->[0]; … … 310 314 # going into it, unlike with doc(sql).xml 311 315 my $meta_value = $data->[1]; 312 316 313 317 # Write out the current section's meta to collection db's METADATA table 314 318
Note: See TracChangeset for help on using the changeset viewer.