Changeset 32640

Show
Ignore:
Timestamp:
29.11.2018 21:23:49 (2 weeks ago)
Author:
ak19
Message:

Important changes (and commented out debugging statements) to get charset encodings (utf8 or specifically utf8mb4 in perl mysql case) to work with GS SQL Plugs on Windows. First, Strawberry Perl 5.22 was required and hence committed before this since Strawberry Perl 5.18 came with older mysql DBD/DBI packages that didn't decode utf8 content in the database when content was retrieved. Strawberry Perl 5.22 came with newer versions of DBD and DBI that do this automatically, as has been the case in Linux testing where Ubuntu had Perl 5.22 with sufficiently new versions of the DBI/DBD mysql perl packages. The newer Perl and specifically the newer MySQL DBD/DBI packages required some important changes to the gsmysql.pm code in the way charset encoding stuff is configured. This should work on Linux too, as that already allowed 2 ways to configure DB encoding stuff. I chose the single-line version on Linux, no longer supported with the DBI/DBD upgrade that comes with our new Strawberry Perl 5.22, so opting for the 2 line version to setup the DB encoding stuff which works on Windows and should continue to work on Linux too (where I had tested it before settling on the single-line variant).

Location:
main/trunk/greenstone2/perllib
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/gsmysql.pm

    r32595 r32640  
    208208    $params_map->{'db_encoding'} = $self->{'db_encoding'}; 
    209209    $params_map->{'verbosity'} = $self->{'verbosity'}; 
    210      
     210     
    211211    $self->{'db_handle'} = &_get_connection_instance($params_map); # getting singleton (class method) 
    212212    if($self->{'db_handle'}) { 
     
    240240     
    241241    # For proper utf8 support in MySQL, encoding should be 'utf8mb4' as 'utf8' is insufficient 
    242     my $db_enc = "utf8mb4" if $params_map->{'db_encoding'} eq "utf8"; 
     242    my $db_enc = ($params_map->{'db_encoding'} eq "utf8") ? "utf8mb4" : $params_map->{'db_encoding'}; 
    243243 
    244244    # Params for connecting to MySQL 
     
    287287                   RaiseError => 0, # off by default, but being explicit 
    288288                   AutoCommit => $autocommit, 
    289                    mysql_enable_utf8mb4 => 1 # tells MySQL to use (4 byte) UTF-8 for 
    290                    # communication and tells DBD::mysql to use it to decode the data,  
     289                   #mysql_enable_utf8mb4 => 1 # Unrecognised in Windows' Strawberry Perl 5.22. On Ubuntu perl 5.22 
     290                   # if mysql_enable_utf8mb4 was set to 1 *during connect()*, this tells MySQL to use (4 byte) UTF-8 for 
     291                   # communication AND tells DBD::mysql to use it to decode the data,  
    291292                   # see https://stackoverflow.com/questions/46727362/perl-mysql-utf8mb4-issue-possible-bug  
    292293               }); 
     
    322323    # (see https://www.perlmonks.org/?node_id=259456) 
    323324     
    324     #my $stmt = "set NAMES '" . $db_enc . "'"; 
    325     #$dbh->do($stmt) || warn("Unable to set charset encoding at db server level to: " . $db_enc . "\n"); # tells MySQL to use UTF-8 for communication 
    326     #$dbh->{mysql_enable_utf8mb4} = 1; # tells DBD::mysql to decode the data 
     325    # For Strawberry Perl 5.22, we need to do the following 2 steps here to take care of encoding issues 
     326    # The 2 steps are: set names utf8mb4; AND setting mysql_enable_utf8mb4 = 1 
     327    # Doing this in 2 steps here also works on Ubuntu perl 5.22, 
     328    # but that also supported setting mysql_enable_utf8mb4 to 1 during connect() 
     329    # to do both these steps in one go 
     330    my $stmt = "set NAMES '" . $db_enc . "'"; 
     331    $dbh->do($stmt) || warn("Unable to set charset encoding at db server level to: " . $db_enc . "\n"); # tells MySQL to use UTF-8 for communication 
     332    $dbh->{mysql_enable_utf8mb4} = 1; # tells DBD::mysql to decode the data 
    327333     
    328334    # if we're here, then connection succeeded, store handle 
  • main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm

    r32595 r32640  
    3737use gsmysql; 
    3838 
    39  
     39#use unicode; 
     40#use Encode; 
    4041 
    4142######################################################################################## 
     
    370371    # browse the rebuilt collection by files/titles. So unless you set the OID here, the deleted 
    371372    # doc oids will still be listed in the index. 
    372     $self->{'doc_obj'}->set_OID($oid); 
    373373     
    374374    print STDERR "   GreenstoneSQLPlugin processing doc $oid (reading into docobj from SQL db)\n" 
     
    385385        my ($primary_key, $did, $sid, $metaname, $metaval) = @$row; 
    386386         
     387        # don't allow duplicate toplevel OID, as can happen when a doc has been renamed and requires reindexing 
     388        # TODO: any other meta that should not be duplicated, but can have been changed between rebuilds so that we need to call set_meta instead of add_meta? 
     389        # e.g. FileSize, SourceFile. But Language can be multiple, so gs meta is not guaranteed to be unique either. Whereas for dc metadata we know 
     390        # that some, if not all, should allow multiple entries for the same meta name, e.g. dc.Author/dc.Creator 
     391        if($sid =~ m@^root@ && $metaname eq "Identifier") { 
     392            # doc_obj's toplevel Identifier metadata is a special case: 
     393            # it should have only one value, so instead of add_meta() that will allow old Identifier meta to linger 
     394            # Need to do set_meta(). We then skip to the next row (via next), to prevent duplicates (older values from DB) from being inserted for Identifier into doc_obj 
     395            # Handles the case where a doc was renamed and rebuilding triggers re-indexing case: old identifier is now overwritten with new one 
     396            $self->{'doc_obj'}->set_OID($oid); # calls doc_obj->set_metadata_element(top_section, Identifier, $oid). Sets OID if one doesn't exist. 
     397            next; # ensures Identifier set only once, and ensure Identifier is set to current docOID for the doc, a.o.t. allowing it to be set to any expired docOID from before a doc got renamed.          
     398        } 
     399         
     400        # process all other metadata the normal way: 
     401         
    387402        # get rid of the artificial "root" introduced in section id when saving to sql db 
    388403        $sid =~ s@^root@@; 
     
    395410        $doc_obj->add_utf8_metadata($sid, $metaname, $metaval); 
    396411    } 
     412     
     413    # UTF8 debugging, e.g. if we have a with macron in dc.Title 
     414    ##binmode(STDERR, ":utf8"); # not necessary when calling utf8decomp to print wide characters as unicode codept, avoiding 'wide character' warnings. 
     415    #my $dcTitle = $doc_obj->get_metadata_element($doc_obj->get_top_section(), "dc.Title");  
     416    #print STDERR "###### dc.Title: ".&unicode::utf8decomp($dcTitle)."\n"; 
     417    #print STDERR "###### DECODED dc.Title: ".&unicode::utf8decomp(&Encode::decode_utf8($dcTitle))."\n"; # decoding was needed for perl 5.18 
     418     
    397419    print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n" 
    398420        if $self->{'verbosity'} > 2; 
    399421    } 
    400      
     422     
    401423    if($proc_mode eq "all" || $proc_mode eq "text_only") { 
    402424    # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table 
     
    429451     
    430452    # done reading into docobj from SQL db 
    431      
     453     
    432454    # don't forget to clean up on close() in superclass 
    433455    # It will get the doc_obj indexed then make it undef 
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm

    r32595 r32640  
    3838use DBI; # the central package for this plugout 
    3939 
     40#use unicode; 
    4041 
    4142# This plugout does not output the metadata and/or fulltxt xml to a file, 
     
    304305    if($proc_mode eq "all" || $proc_mode eq "meta_only" ) { 
    305306     
     307    ##binmode(STDERR, ":utf8"); # shouldn't be necessary as we call &unicode::utf8decomp() to avoid wide-character warnings by printing wide chars as unicode codepoints 
     308    #print STDERR "###### dc.Title: ".&unicode::utf8decomp($doc_obj->get_metadata_element($doc_obj->get_top_section(), "dc.Title"))."\n"; 
     309     
    306310    foreach my $data (@{$section_ptr->{'metadata'}}) { 
    307311        my $meta_name = $data->[0]; 
     
    310314        # going into it, unlike with doc(sql).xml 
    311315        my $meta_value = $data->[1]; 
    312          
     316         
    313317        # Write out the current section's meta to collection db's METADATA table         
    314318