Changeset 34695


Ignore:
Timestamp:
2021-01-24T12:44:31+13:00 (3 years ago)
Author:
davidb
Message:

More careful treatment of UTF8 and Unicode-aware strings while being processed as JSON. Addresses an issue with JSON syntax using \uNNNN format (literal ASCII chars) for Unicode chars.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/model-sites-dev/eurovision-lod/collect/eurovision/perllib/plugins/JSONSPARQLResultPlugin.pm

    r34645 r34695  
    117117    my $section = $doc_obj->get_top_section();
    118118   
    119     # Add the raw line as the document text
    120     $doc_obj->add_utf8_text($section, $$textref);
     119    #my $result_entry = decode_json($$textref);
     120    # $textref is UTF8 byte encoded, not "Unicode-aware", so need to specify this in optional param to from_json()
     121    my $result_entry = from_json($$textref, {utf8 => 1} );
    121122
    122     my $result_entry = decode_json($$textref);
     123    # In JSON syntax, it seems permissible to represent Unicode characters in the form \uNNNN
     124    # These are not automatically converted into UTF8 chars by reading in the file as text
     125    # (it is tempting to think about putting such decoding into the read-text-file subroutine,
     126    #  but holding off on that idea for now, as it could have wider unforeseen implications)
     127    #
     128    # For now: The \uNNNN is dealt with by having the to_json() result re-encoded
     129    my $text_utf8 = to_json($result_entry, { pretty => 1} );
    123130
    124     # entrant{value,type=url}
     131    $doc_obj->add_utf8_metadata($section, "JSON", $text_utf8);
     132
     133    # Set the "text document" to be a pre-formatted, HTML friendly version of the JSON syntax
     134    $self->text_to_html(\$text_utf8);
     135    $doc_obj->add_utf8_text($section, $text_utf8);
     136
     137    # entrant{value,type=uri}
    125138    # song{value,type=uri}
    126139    # country{value,xml:lang=en,type=literal}
     
    134147        $doc_obj->add_utf8_metadata($section, $md_name, $md_val);
    135148    }
     149    elsif ($md_name_lod_rec->{'type'} eq "uri") {
     150        my $md_val = $md_name_lod_rec->{'value'};
     151        $doc_obj->add_utf8_metadata($section, $md_name."_uri", $md_val);
     152    }
    136153   
    137154    }
Note: See TracChangeset for help on using the changeset viewer.