Changeset 32640


Ignore:
Timestamp:
2018-11-29T21:23:49+13:00 (5 years ago)
Author:
ak19
Message:

Important changes (and commented out debugging statements) to get charset encodings (utf8 or specifically utf8mb4 in perl mysql case) to work with GS SQL Plugs on Windows. First, Strawberry Perl 5.22 was required and hence committed before this since Strawberry Perl 5.18 came with older mysql DBD/DBI packages that didn't decode utf8 content in the database when content was retrieved. Strawberry Perl 5.22 came with newer versions of DBD and DBI that do this automatically, as has been the case in Linux testing where Ubuntu had Perl 5.22 with sufficiently new versions of the DBI/DBD mysql perl packages. The newer Perl and specifically the newer MySQL DBD/DBI packages required some important changes to the gsmysql.pm code in the way charset encoding stuff is configured. This should work on Linux too, as that already allowed 2 ways to configure DB encoding stuff. I chose the single-line version on Linux, no longer supported with the DBI/DBD upgrade that comes with our new Strawberry Perl 5.22, so opting for the 2 line version to setup the DB encoding stuff which works on Windows and should continue to work on Linux too (where I had tested it before settling on the single-line variant).

Location:
main/trunk/greenstone2/perllib
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/gsmysql.pm

    r32595 r32640  
    208208    $params_map->{'db_encoding'} = $self->{'db_encoding'};
    209209    $params_map->{'verbosity'} = $self->{'verbosity'};
    210    
     210   
    211211    $self->{'db_handle'} = &_get_connection_instance($params_map); # getting singleton (class method)
    212212    if($self->{'db_handle'}) {
     
    240240   
    241241    # For proper utf8 support in MySQL, encoding should be 'utf8mb4' as 'utf8' is insufficient
    242     my $db_enc = "utf8mb4" if $params_map->{'db_encoding'} eq "utf8";
     242    my $db_enc = ($params_map->{'db_encoding'} eq "utf8") ? "utf8mb4" : $params_map->{'db_encoding'};
    243243
    244244    # Params for connecting to MySQL
     
    287287                   RaiseError => 0, # off by default, but being explicit
    288288                   AutoCommit => $autocommit,
    289                    mysql_enable_utf8mb4 => 1 # tells MySQL to use (4 byte) UTF-8 for
    290                    # communication and tells DBD::mysql to use it to decode the data,
     289                   #mysql_enable_utf8mb4 => 1 # Unrecognised in Windows' Strawberry Perl 5.22. On Ubuntu perl 5.22
     290                   # if mysql_enable_utf8mb4 was set to 1 *during connect()*, this tells MySQL to use (4 byte) UTF-8 for
     291                   # communication AND tells DBD::mysql to use it to decode the data,
    291292                   # see https://stackoverflow.com/questions/46727362/perl-mysql-utf8mb4-issue-possible-bug
    292293               });
     
    322323    # (see https://www.perlmonks.org/?node_id=259456)
    323324   
    324     #my $stmt = "set NAMES '" . $db_enc . "'";
    325     #$dbh->do($stmt) || warn("Unable to set charset encoding at db server level to: " . $db_enc . "\n"); # tells MySQL to use UTF-8 for communication
    326     #$dbh->{mysql_enable_utf8mb4} = 1; # tells DBD::mysql to decode the data
     325    # For Strawberry Perl 5.22, need to do the following 2 steps here to take care of encoding issues
     326    # The 2 steps are: set names utf8mb4; AND setting mysql_enable_utf8mb4 = 1
     327    # Doing this in 2 steps here also works on Ubuntu perl 5.22,
     328    # but that also supported setting mysql_enable_utf8mb4 to 1 during connect()
     329    # to do both these steps in one go
     330    my $stmt = "set NAMES '" . $db_enc . "'";
     331    $dbh->do($stmt) || warn("Unable to set charset encoding at db server level to: " . $db_enc . "\n"); # tells MySQL to use UTF-8 for communication
     332    $dbh->{mysql_enable_utf8mb4} = 1; # tells DBD::mysql to decode the data
    327333   
    328334    # if we're here, then connection succeeded, store handle
  • main/trunk/greenstone2/perllib/plugins/GreenstoneSQLPlugin.pm

    r32595 r32640  
    3737use gsmysql;
    3838
    39 
     39#use unicode;
     40#use Encode;
    4041
    4142########################################################################################
     
    370371    # browse the rebuilt collection by files/titles. So unless you set the OID here, the deleted
    371372    # doc oids will still be listed in the index.
    372     $self->{'doc_obj'}->set_OID($oid);
    373373   
    374374    print STDERR "   GreenstoneSQLPlugin processing doc $oid (reading into docobj from SQL db)\n"
     
    385385        my ($primary_key, $did, $sid, $metaname, $metaval) = @$row;
    386386       
     387        # don't allow duplicate toplevel OID, as can happen when a doc has been renamed and requires reindexing
     388        # TODO: any other meta that should not be duplicated, but can have been changed between rebuilds so that we need to call set_meta instead of add_meta?
     389        # e.g. FileSize, SourceFile. But Language can be multiple, so gs meta is not guaranteed to be unique either. Whereas of dc metadata we know
     390        # that some if not all should allow multiple entries for the same meta name, e.g. dc.Author/dc.Creator
     391        if($sid =~ m@^root@ && $metaname eq "Identifier") {
     392            # doc_obj's toplevel Identifier metadata is a special case:
     393            # it should have only one value, so instead of add_meta() that will allow old Identifier meta to linger
     394            # Need to do set_meta(). We then skip to the next row, to prevent duplicates (older values from DB) from being inserted for Identifier into doc_obj
     395            # Handles the case where a doc was renamed and rebuilding triggers re-indexing case: old identifier is now overwritten with new one
     396            $self->{'doc_obj'}->set_OID($oid); # calls doc_obj->set_metadata_element(top_section, Identifier, $oid). Sets OID if one doesn't exist.
     397            next; # ensures Identifier set only once, and ensure Identifier is set to current docOID for the doc, a.o.t. allowing it to be set to any expired docOID from before a doc got renamed.         
     398        }
     399       
     400        # process all other metadata the normal way:
     401       
    387402        # get rid of the artificial "root" introduced in section id when saving to sql db
    388403        $sid =~ s@^root@@;
     
    395410        $doc_obj->add_utf8_metadata($sid, $metaname, $metaval);
    396411    }
     412   
     413    # UTF8 debugging, e.g. if we have an a-with-macron character in dc.Title
     414    ##binmode(STDERR, ":utf8"); # not necessary when calling utf8decomp to print wide characters as unicode codept, avoiding 'wide character' warnings.
     415    #my $dcTitle = $doc_obj->get_metadata_element($doc_obj->get_top_section(), "dc.Title");
     416    #print STDERR "###### dc.Title: ".&unicode::utf8decomp($dcTitle)."\n";
     417    #print STDERR "###### DECODED dc.Title: ".&unicode::utf8decomp(&Encode::decode_utf8($dcTitle))."\n"; # decoding was needed for perl 5.18
     418   
    397419    print $outhandle "----------FIN READING DOC's META FROM SQL DB------------\n"
    398420        if $self->{'verbosity'} > 2;
    399421    }
    400    
     422   
    401423    if($proc_mode eq "all" || $proc_mode eq "text_only") {
    402424    # read in fulltxt for the collection (i.e. select * from <col>_fulltxt table
     
    429451   
    430452    # done reading into docobj from SQL db
    431    
     453   
    432454    # don't forget to clean up on close() in superclass
    433455    # It will get the doc_obj indexed then make it undef
  • main/trunk/greenstone2/perllib/plugouts/GreenstoneSQLPlugout.pm

    r32595 r32640  
    3838use DBI; # the central package for this plugout
    3939
     40#use unicode;
    4041
    4142# This plugout does not output the metadata and/or fulltxt xml to a file,
     
    304305    if($proc_mode eq "all" || $proc_mode eq "meta_only" ) {
    305306   
     307    ##binmode(STDERR, ":utf8"); # shouldn't be necessary as we call &unicode::utf8decomp() to avoid wide-character warnings by printing wide chars as unicode codepoints
     308    #print STDERR "###### dc.Title: ".&unicode::utf8decomp($doc_obj->get_metadata_element($doc_obj->get_top_section(), "dc.Title"))."\n";
     309   
    306310    foreach my $data (@{$section_ptr->{'metadata'}}) {
    307311        my $meta_name = $data->[0];
     
    310314        # going into it, unlike with doc(sql).xml
    311315        my $meta_value = $data->[1];
    312        
     316       
    313317        # Write out the current section's meta to collection db's METADATA table       
    314318       
Note: See TracChangeset for help on using the changeset viewer.