Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

unicode.pm

Last change on this file was 35165, checked in by kjdon, 3 years ago
updated to and from utf8 methods to handle 4 byte utf8 characters - emojis are in this space.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 26.2 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2004 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33	eval {require bytes};
34
35	use encodings;
36	use strict;
37	use util;
38	use FileUtils;
39	use MIME::Base64; # for base64 encoding
40
41	no strict 'refs';
42
43
44
45	sub utf8decomp
46	{
47	my ($str) = @_;
48
49	return if (!defined $str);
50	return "" if ($str eq "");
51
52	my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
53
54	my @each_char
55	= map { ($_ > 255 )
56	? # if wide character...
57	sprintf("\\x{%04X}", $_)
58	: # \x{...}
59	(chr($_) =~ m/[[:cntrl:]]/ )
60	? # else if control character ...
61	sprintf("\\x%02X", $_)
62	: # \x..
63	quotemeta(chr($_)) # else quoted or as themselves
64	} @unpacked_chars;
65
66	return join("",@each_char);
67	}
68
69
70	sub hex_codepoint {
71	if (my $char = shift) {
72	return sprintf '%2.2x', unpack('U0U*', $char);
73	}
74	}
75
76
77
78
79	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
80	# and returns a unicode array.
81	sub ascii2unicode {
82	my ($in) = @_;
83	my $out = [];
84
85	my $i = 0;
86	my $len = length($in);
87	while ($i < $len) {
88	push (@$out, ord(substr ($in, $i, 1)));
89	$i++;
90	}
91
92	return $out;
93	}
94
95	# ascii2utf8 takes a reference to an (extended) ascii string and returns a
96	# UTF-8 encoded string. This is just a faster version of
97	# "&unicode2utf8(&ascii2unicode($str));"
98	# "Extended ascii" really means "iso_8859_1"
99	sub ascii2utf8 {
100	my ($in) = @_;
101	my $out = "";
102
103	if (!defined($in)\|\| !defined($$in)) {
104	return $out;
105	}
106
107	my ($c);
108	my $i = 0;
109	my $len = length($$in);
110	while ($i < $len) {
111	$c = ord (substr ($$in, $i, 1));
112	if ($c < 0x80) {
113	# ascii character
114	$out .= chr ($c);
115
116	} else {
117	# extended ascii character
118	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
119	$out .= chr (0x80 + ($c & 0x3f));
120	}
121	$i++;
122	}
123
124	return $out;
125	}
126
127	# utf-8 SCHEME:
128	#Char. number range \| UTF-8 bytes/octets sequence
129	# (hexadecimal) \| (binary)
130	#--------------------+------------------------------------
131	#0000 0000 - 0000 007F \| 0xxxxxxx
132	#0000 0080 - 0000 07FF \| 110xxxxx 10xxxxxx
133	#0000 0800 - 0000 FFFF \| 1110xxxx 10xxxxxx 10xxxxxx
134	#0001 0000 - 0010 FFFF \| 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
135
136
137	# 4 byte code from a web page https://www.perl.com/article/building-a-utf-8-encoder-in-perl/
138	# unicode2utf8 takes a unicode array as input and encodes it
139	# using utf-8
140	sub unicode2utf8 {
141	my ($in) = @_;
142	my $out = "";
143
144	foreach my $num (@$in) {
145	next unless defined $num;
146	if ($num < 0x80) {# 10 000 000
147	# no transformation needed
148	$out .= chr ($num);
149
150	} elsif ($num < 0x800) {# 100 000 000 000 (0x80 to 0x7ff)
151	# populate bitmask 110xxxxx 10xxxxxx with the code point
152	$out .= chr (0xc0 + (($num >> 6) & 0x1f)); # top 5 bits go in first byte ($num >> 6 \| 0b11000000)
153	$out .= chr (0x80 + ($num & 0x3f)); # bottom 6 bit go in second byte ($num & 0b00111111 \| 0b10000000)
154
155	} elsif ($num < 0x10000) {# 10 000 000 000 000 000 (0x800 to 0xffff)
156	# populate bitmask 1110xxxx 10xxxxxx 10xxxxxx
157	$out .= chr (0xe0 + (($num >> 12) & 0xf)); # top 4 bits ($num >> 12 \| 0b11100000)
158	$out .= chr (0x80 + (($num >> 6) & 0x3f)); # next 6 bits ($num >> 6 & 0b00111111 \| 0b10000000)
159	$out .= chr (0x80 + ($num & 0x3f)); # last 6 bits ($num & 0b00111111 \| 0b10000000)
160
161	} else { # (0x10000 to 0x10ffff)
162	#populate bitmask 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
163	$out .= chr(($num >> 18 )\|0b11110000);
164	$out .= chr(($num >> 12) & 0b00111111 \| 0b10000000);
165	$out .= chr(($num >> 6 ) & 0b00111111 \| 0b10000000);
166	$out .= chr ($num & 0b00111111 \| 0b10000000);
167
168	}
169	}
170	return $out;
171	}
172
173
174	# utf82unicode takes a utf-8 string and produces a unicode
175	# array
176	sub utf82unicode {
177	my ($in) = @_;
178	my $out = [];
179
180	if(!defined $in) {
181	return $out;
182	}
183
184	my $i = 0;
185	my ($c1, $c2, $c3, $c4);
186	my $len = length($in);
187	while ($i < $len) {
188	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
189	# normal ascii character
190	push (@$out, $c1);
191
192	} elsif ($c1 < 0xc0) {
193	# error, was expecting the first byte of an
194	# encoded character. Do nothing.
195
196	} elsif ($c1 < 0xe0 && $i+1 < $len) {
197	# an encoded character with two bytes
198	$c2 = ord (substr ($in, $i+1, 1));
199	if ($c2 >= 0x80 && $c2 < 0xc0) {
200	# everything looks ok
201	push (@$out, ((($c1 & 0x1f) << 6) +
202	($c2 & 0x3f)));
203	$i++; # gobbled an extra byte
204	}
205
206	} elsif ($c1 < 0xf0 && $i+2 < $len) {
207	# an encoded character with three bytes
208	$c2 = ord (substr ($in, $i+1, 1));
209	$c3 = ord (substr ($in, $i+2, 1));
210	if ($c2 >= 0x80 && $c2 < 0xc0 &&
211	$c3 >= 0x80 && $c3 < 0xc0) {
212	# everything looks ok
213	push (@$out, ((($c1 & 0xf) << 12) +
214	(($c2 & 0x3f) << 6) +
215	($c3 & 0x3f)));
216
217	$i += 2; # gobbled an extra two bytes
218	}
219
220	} elsif ($c1 >= 0xf0 && $i+3 < $len) {
221	# an encoded character with four bytes
222	$c2 = ord (substr ($in, $i+1, 1));
223	$c3 = ord (substr ($in, $i+2, 1));
224	$c4 = ord (substr ($in, $i+2, 1));
225	if ($c2 >= 0x80 && $c2 < 0xc0 &&
226	$c3 >= 0x80 && $c3 < 0xc0 &&
227	$c4 >= 0x80 && $c4 < 0xc0) {
228	# everything looks ok
229	push (@$out, ((($c1 & 0x7) << 18) +
230	(($c2 & 0x3f) << 12) +
231	(($c3 & 0x3f) << 6) +
232	($c4 & 0x3f)));
233	$i+= 3; # gobbled an extra 3 bytes
234	}
235	} else {
236
237	# error
238	# Do nothing.
239	}
240
241	$i++;
242	}
243
244	return $out;
245	}
246
247	# unicode2ucs2 takes a unicode array and produces a UCS-2
248	# unicode string (every two bytes forms a unicode character)
249	sub unicode2ucs2 {
250	my ($in) = @_;
251	my $out = "";
252
253	foreach my $num (@$in) {
254	$out .= chr (($num & 0xff00) >> 8);
255	$out .= chr ($num & 0xff);
256	}
257
258	return $out;
259	}
260
261	# ucs22unicode takes a UCS-2 string and produces a unicode array
262	sub ucs22unicode {
263	my ($in) = @_;
264	my $out = [];
265
266	my $i = 0;
267	my $len = length ($in);
268	while ($i+1 < $len) {
269	push (@$out, ord (substr($in, $i, 1)) << 8 +
270	ord (substr($in, $i+1, 1)));
271
272	$i ++;
273	}
274
275	return $out;
276	}
277
278	# takes a reference to a string and returns a reference to a unicode array
279	sub convert2unicode {
280	my ($encoding, $textref) = @_;
281
282	if (!defined $encodings::encodings->{$encoding}) {
283	print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
284	return [];
285	}
286
287	my $encodename = "$encoding-unicode";
288	my $enc_info = $encodings::encodings->{$encoding};
289	my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
290	"to_uc", $enc_info->{'mapfile'});
291	if (!&loadmapencoding ($encodename, $mapfile)) {
292	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
293	return [];
294	}
295
296	if (defined $enc_info->{'converter'}) {
297	my $converter = $enc_info->{'converter'};
298	return &$converter ($encodename, $textref);
299	}
300
301	if ($unicode::translations{$encodename}->{'count'} == 1) {
302	return &singlebyte2unicode ($encodename, $textref);
303	} else {
304	return &doublebyte2unicode ($encodename, $textref);
305	}
306	}
307
308	# singlebyte2unicode converts simple 8 bit encodings where characters below
309	# 0x80 are normal ascii characters and the rest are decoded using the
310	# appropriate mapping files.
311	#
312	# Examples of encodings that may be converted using singlebyte2unicode are
313	# the iso-8859 and windows-125* series.
314	sub singlebyte2unicode {
315	my ($encodename, $textref) = @_;
316
317	my @outtext = ();
318	my $len = length($$textref);
319	my ($c);
320	my $i = 0;
321
322	while ($i < $len) {
323	if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
324	# normal ascii character
325	push (@outtext, $c);
326	} else {
327	$c = &transchar ($encodename, $c);
328	# put a black square if cannot translate
329	$c = 0x25A1 if $c == 0;
330	push (@outtext, $c);
331	}
332	$i ++;
333	}
334	return \@outtext;
335	}
336
337	# doublebyte2unicode converts simple two byte encodings where characters
338	# below code point 0x80 are single-byte characters and the rest are
339	# double-byte characters.
340	#
341	# Examples of encodings that may be converted using doublebyte2unicode are
342	# CJK encodings like GB encoded Chinese and UHC Korean.
343	#
344	# Note that no error checking is performed to make sure that the input text
345	# is valid for the given encoding.
346	#
347	# Also, encodings that may contain characters of more than two bytes are
348	# not supported (any EUC encoded text may in theory contain 3-byte
349	# characters but in practice only one and two byte characters are used).
350	sub doublebyte2unicode {
351	my ($encodename, $textref) = @_;
352
353	my @outtext = ();
354	my $len = length($$textref);
355	my ($c1, $c2);
356	my $i = 0;
357
358	while ($i < $len) {
359	if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
360	if ($i+1 < $len) {
361	# double-byte character
362	$c2 = ord(substr($$textref, $i+1, 1));
363	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
364	# put a black square if cannot translate
365	$c = 0x25A1 if $c == 0;
366	push (@outtext, $c);
367	$i += 2;
368
369	} else {
370	# error
371	print STDERR "unicode: ERROR missing second half of double-byte character\n";
372	$i++;
373	}
374
375	} else {
376	# single-byte character
377	push (@outtext, $c1);
378	$i++;
379	}
380	}
381	return \@outtext;
382	}
383
384	# Shift-JIS to unicode
385	# We can't use doublebyte2unicode for Shift-JIS because it uses some
386	# single-byte characters above code point 0x80 (i.e. half-width katakana
387	# characters in the range 0xA1-0xDF)
388	sub shiftjis2unicode {
389	my ($encodename, $textref) = @_;
390
391	my @outtext = ();
392	my $len = length($$textref);
393	my ($c1, $c2);
394	my $i = 0;
395
396	while ($i < $len) {
397	$c1 = ord(substr($$textref, $i, 1));
398
399	if (($c1 >= 0xA1 && $c1 <= 0xDF) \|\| $c1 == 0x5c \|\| $c1 == 0x7E) {
400	# Single-byte half-width katakana character or
401	# JIS Roman yen or overline characters
402	my $c = &transchar ($encodename, $c1);
403	# - put a black square if cannot translate
404	$c = 0x25A1 if $c == 0;
405	push (@outtext, $c);
406	$i++;
407
408	} elsif ($c1 < 0x80) {
409	# ASCII
410	push (@outtext, $c1);
411	$i ++;
412
413	} elsif ($c1 < 0xEF) {
414	if ($i+1 < $len) {
415	$c2 = ord(substr($$textref, $i+1, 1));
416	if (($c2 >= 0x40 && $c2 <= 0x7E) \|\| ($c2 >= 0x80 && $c2 <= 0xFC)) {
417	# Double-byte shift-jis character
418	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
419	# put a black square if cannot translate
420	$c = 0x25A1 if $c == 0;
421	push (@outtext, $c);
422	} else {
423	# error
424	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
425	}
426	$i += 2;
427	} else {
428	# error
429	print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
430	$i ++;
431	}
432	} else {
433	# error
434	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
435	$i ++;
436	}
437	}
438	return \@outtext;
439	}
440
441	sub transchar {
442	my ($encoding, $from) = @_;
443	my $high = ($from / 256) % 256;
444	my $low = $from % 256;
445
446	return 0 unless defined $unicode::translations{$encoding};
447
448	my $block = $unicode::translations{$encoding}->{'map'};
449
450	if (ref ($block->[$high]) ne "ARRAY") {
451	return 0;
452	}
453	return $block->[$high]->[$low];
454	}
455
456	# %translations is of the form:
457	#
458	# encodings{encodingname-encodingname}->{'map'}->blocktranslation
459	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
460	#
461	# Any of the top translation blocks can point to an undefined
462	# value. This data structure aims to allow fast translation and
463	# efficient storage.
464	%unicode::translations = ();
465
466	# @array256 is used for initialisation, there must be
467	# a better way...
468	# What about this?: @array256 = (0) x 256;
469	@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
470	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
471	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
472	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
473	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
474	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
475	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
476	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
477	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
478	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
479	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
480	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
481	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
482	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
483	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
484	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
485
486	# returns 1 if successful, 0 if unsuccessful
487	sub loadmapencoding {
488	my ($encoding, $mapfile) = @_;
489
490	# check to see if the encoding has already been loaded
491	return 1 if (defined $unicode::translations{$encoding});
492
493	if (! -r $mapfile \|\| -d $mapfile) {
494	return 0;
495	}
496	return 0 unless open (MAPFILE, $mapfile);
497	binmode (MAPFILE);
498
499	$unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
500	my $block = $unicode::translations{$encoding};
501
502	my ($in,$i,$j);
503	while (1) {
504	my $ret=read(MAPFILE, $in, 1);
505	if (!defined($ret)) { # error
506	print STDERR "unicode.pm: error reading mapfile: $!\n";
507	last;
508	}
509	if ($ret != 1) { last }
510	$i = unpack ("C", $in);
511	$block->{'map'}->[$i] = [@unicode::array256];
512	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
513	my ($n1, $n2) = unpack ("CC", $in);
514	$block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
515	}
516	$block->{'count'} ++;
517	}
518
519	close (MAPFILE);
520	}
521
522	# unicode2singlebyte converts unicode to simple 8 bit encodings where
523	# characters below 0x80 are normal ascii characters and the rest are encoded
524	# using the appropriate mapping files.
525	#
526	# Examples of encodings that may be converted using unicode2singlebyte are
527	# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
528	sub unicode2singlebyte {
529	my ($uniref, $encoding) = @_;
530
531	my $outtext = "";
532	my $encodename = "unicode-$encoding";
533
534	if (!exists $encodings::encodings->{$encoding}) {
535	print STDERR "unicode.pm: ERROR - unsupported encoding "
536	. "'$encoding' requested\n";
537	return "";
538	}
539
540	my $enc_info = $encodings::encodings->{$encoding};
541	my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
542	"from_uc", $enc_info->{'mapfile'});
543	if (!&loadmapencoding ($encodename, $mapfile)) {
544	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
545	return "";
546	}
547
548	foreach my $c (@$uniref) {
549	if ($c < 0x80) {
550	# normal ascii character
551	$outtext .= chr($c);
552	} else {
553	# extended ascii character
554	$c = &transchar ($encodename, $c);
555
556	# put a question mark if cannot translate
557	if ($c == 0) {
558	$outtext .= "?";
559	} else {
560	$outtext .= chr($c);
561	}
562	}
563	}
564	return $outtext;
565	}
566
567
568	# this makes sure that the referenced input string is utf8 encoded, and
569	# will change/remove bytes that aren't.
570	# returns 0 if the text was already utf8, or 1 if text modified to become utf8
571	sub ensure_utf8 {
572	my $stringref=shift;
573
574	if (!defined($stringref) \|\| ref($stringref) ne 'SCALAR') {
575	return $stringref;
576	}
577
578	my $value=$$stringref;
579
580	my $non_utf8_found = 0;
581	$value =~ m/^/g; # to set \G
582	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
583	my $highbytes=$1;
584	my $highbyteslength=length($highbytes);
585	# make sure this block of high bytes is utf-8
586	$highbytes =~ /^/g; # set pos()
587	my $byte_replaced = 0;
588	while ($highbytes =~
589	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
590	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
591	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
592	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
593	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
594	)*([\x80-\xff])? !xg
595	) {
596	# this highbyte is "out-of-place" for valid utf-8
597	my $badbyte=$1;
598	if (!defined $badbyte) {next} # hit end of string
599	my $pos=pos($highbytes);
600	# replace bad byte. assume iso-8859-1 -> utf-8
601	# ascii2utf8 does "extended ascii"... ie iso-8859-1
602	my $replacement=&unicode::ascii2utf8(\$badbyte);
603	substr($highbytes, $pos-1, 1, $replacement);
604	# update the position to continue searching (for \G)
605	pos($highbytes) = $pos+length($replacement)-1;
606	$byte_replaced = 1;
607	}
608	if ($byte_replaced) {
609	# replace this block of high bytes in the $value
610	$non_utf8_found = 1;
611	my $replength=length($highbytes); # we've changed the length
612	my $textpos=pos($value); # pos at end of last match
613	# replace bad bytes with good bytes
614	substr($value, $textpos-$highbyteslength,
615	$highbyteslength, $highbytes);
616	# update the position to continue searching (for \G)
617	pos($value)=$textpos+($replength-$highbyteslength)+1;
618	}
619	}
620
621	$$stringref = $value;
622	return $non_utf8_found;
623	}
624
625	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
626	# Does not modify the string parameter.
627	sub check_is_utf8 {
628	my $value=shift;
629
630	if (!defined($value)) {
631	return 0; # not utf8 because it is undefined
632	}
633
634	$value =~ m/^/g; # to set \G
635	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
636	my $highbytes=$1;
637	# make sure this block of high bytes is utf-8
638	$highbytes =~ /^/g; # set pos()
639	while ($highbytes =~
640	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
641	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
642	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
643	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
644	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
645	)*([\x80-\xff])? !xg
646	) {
647	my $badbyte=$1;
648	if (defined $badbyte) { # not end of string
649	return 0; # non-utf8 found
650	}
651	}
652	}
653
654	return 1;
655	}
656
657	sub url_encode {
658	my ($text) = @_;
659
660	if (!&is_url_encoded($text)) {
661	$text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
662	# return the url-encoded character entity for underscore back to the entity
663	$text =~ s/%26%23095%3B/&\#095;/g;
664	}
665	return $text;
666	}
667
668	sub url_decode {
669	my ($text,$and_numeric_entities) = @_;
670
671	if(defined $text) {
672	$text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
673
674	if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
675	$text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
676	$text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
677	}
678	}
679
680	return $text;
681	}
682
683	sub url_decode_utf8 {
684	my ($text,$and_numeric_entities) = @_;
685
686	$text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
687
688	$text = Encode::decode("utf8",$text);
689
690	return $text;
691	}
692
693	sub is_url_encoded {
694	my ($text) = @_;
695	return ($text =~ m/\%([0-9A-F]{2})/i) \|\| ($text =~ m/\&\#x([0-9A-F]+;)/i) \|\| ($text =~ m/\&\#([0-9]+;)/i);
696	}
697
698	# When a filename on the filesystem is already URL-encoded, the
699	# URL to it will have %25s in place of every % sign, so that
700	# URLs in html pages can refer to the URL-encoded filename.
701	# This method changes the URL reference back into the actual
702	# (URL-encoded) filename on the filesystem by replacing %25 with %.
703	sub url_to_filename {
704	my ($text) =@_;
705	$text =~ s/%25/%/g if &is_url_encoded($text);
706	# DM safing would have replaced underscores with character entity _
707	# in SourceFile meta. Undo any such change to get the filename referred to.
708	$text =~ s/&\#095;/_/g;
709	return $text;
710	}
711
712	# When a filename on the filesystem is already URL-encoded, the
713	# URL to it will have %25s in place of every % sign, so that
714	# URLs in html pages can refer to the URL-encoded filename.
715	# Given a (URL-encoded) filename on the filesystem, this subroutine
716	# returns the URL reference string for it by replacing % with %25.
717	# The output string will be the same as the input string if the input
718	# already contains one or more %25s. This is to prevent processing
719	# a url more than once this way.
720	sub filename_to_url {
721	my ($text) = @_;
722
723	if($text !~ m/%25/) {
724	$text =~ s/%/%25/g;
725	}
726	return $text;
727	}
728
729	# Base64 encoding does not encode a pure ASCII to itself. This is important to know.
730	# If the $force_encode parameter is true, then this method WILL base64 encode whatever
731	# string is passed in, including any plain ASCII string.
732	# That means this method could double encode an already encoded string.
733	# However, this method is necessary because on the GLI end, we can't detect whether a plain
734	# ASCII string has been encoded or not. And if gsdlsourcefilerenamemethod is set to base64,
735	# then gli will always attempt to decode all parts of the relative path gsdlsourcefilename
736	# (except the "import" prefix, which is special) or none of the parts.
737	sub force_base64_encode {
738	my ($text) = @_;
739	my $force_encode = 1;
740	my $no_newlines = 1;
741	return &base64_encode($text, $force_encode, $no_newlines);
742	}
743
744	sub base64_encode {
745	my ($text, $force_encode, $no_newlines) = @_;
746	if($force_encode \|\| !&conforms_to_mod_base64($text)) {
747	# return entity for underscore to underscore before encoding
748	$text =~ s/&\#095;/_/g;
749
750	$text = &MIME::Base64::encode_base64($text);
751	# base64 encoding may introduce + and / signs,
752	# replacing them with - and _ to ensure it's filename-safe
753	$text =~ s/\+/\-/g; # + -> -
754	$text =~ s/\//\_/g; # / -> _
755
756	# by default base64 encoding a long string introduces newlines to break long strings,
757	# see https://stackoverflow.com/questions/19952621/is-it-ok-to-remove-newline-in-base64-encoding
758	# Java doesn't like that
759	$text =~ s/\n//gs if $no_newlines;
760	}
761	return $text;
762	}
763
764	# If the input fits the modified base64 pattern, this will try decoding it.
765	# Still, this method does not guarantee the return value is the 'original', only
766	# that the result is where the base64 decoding process has been applied once.
767	# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
768	sub base64_decode {
769	my ($text) = @_;
770	if(&conforms_to_mod_base64($text)) {
771	# base64 encodes certain chars with + and /, but if we'd encoded it, we'd
772	# have replaced them with - and _ respectively. Undo this before decoding.
773	$text =~ s/\-/\+/g; # - -> +
774	$text =~ s/\_/\//g; # _ -> /
775	$text = &MIME::Base64::decode_base64($text);
776	}
777	return $text;
778	}
779
780	# Returns true if the given string is compatible with a modified version
781	# of base64 (where the + and / are replaced with - and _), a format which
782	# includes also regular ASCII alphanumeric values. This method does not
783	# guarantee that the given string is actually base64 encoded, since it will
784	# return true for any simple alphanumeric ASCII string as well.
785	sub conforms_to_mod_base64 {
786	my ($text) = @_;
787
788	# need to treat the entity ref for underscore as underscore
789	$text =~ s/&\#095;/_/g;
790
791	# base 64 takes alphanumeric and [=+/],
792	# but we use modified base64 where + and / are replaced with - and _
793	return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
794	}
795
796	sub substr
797	{
798	my ($utf8_string, $offset, $length) = @_;
799
800	my @unicode_string = @{&utf82unicode($utf8_string)};
801	my $unicode_string_length = scalar(@unicode_string);
802
803	my $substr_start = $offset;
804	if ($substr_start >= $unicode_string_length) {
805	return "";
806	}
807
808	my $substr_end = $offset + $length - 1;
809	if ($substr_end >= $unicode_string_length) {
810	$substr_end = $unicode_string_length - 1;
811	}
812
813	my @unicode_substring = @unicode_string[$substr_start..$substr_end];
814	return &unicode2utf8(\@unicode_substring);
815	}
816
817	# perl version of stringToHex
818	# Useful method to print UTF8 (or other unicode) for debugging.
819	# Characters that are easily displayed (that is, printable ASCII)
820	# are shown as-is, whereas hex values of the unicode code points
821	# are shown for all other chars.
822	sub debug_unicode_string
823	{
824	join("",
825	map { $_ > 127 ? # if wide character...
826	sprintf("\\x{%04X}", $_) : # \x{...}
827	chr($_)
828	} unpack("U*", $_[0])); # unpack Unicode characters
829	}
830
831	sub raw_filename_to_url_encoded
832	{
833	my ($str_in) = @_;
834
835	my @url_encoded_chars
836	= map { $_ > 255 ? # Needs to be represent in entity form
837	sprintf("&#x%X;",$_) :
838	$_>127 \|\| $_==ord("%") ? # Representable in %XX form
839	sprintf("%%%2X", $_) :
840	chr($_) # otherwise, Ascii char
841	} unpack("U*", $str_in); # Unpack Unicode characters
842
843
844	my $str_out = join("", @url_encoded_chars);
845
846	return $str_out;
847
848	}
849
850	sub url_encoded_to_raw_filename
851	{
852	my ($str_in) = @_;
853
854	my $str_out = $str_in;
855
856	$str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
857	$str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
858	$str_out =~ s/&#([0-9]+);/chr($1)/eig;
859
860	return $str_out;
861	}
862
863
864	sub raw_filename_to_utf8_url_encoded
865	{
866	my ($str_in) = @_;
867
868	$str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
869
870	my @url_encoded_chars
871	= map { $_ > 127 ? # Representable in %XX form
872	sprintf("%%%2X", $_) :
873	chr($_) # otherwise, Ascii char
874	} unpack("U*", $str_in); # Unpack utf8 characters
875
876
877	my $str_out = join("", @url_encoded_chars);
878
879	return $str_out;
880
881	}
882
883	sub utf8_url_encoded_to_raw_filename
884	{
885	my ($str_in) = @_;
886
887	my $utf8_str_out = $str_in;
888
889	$utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
890
891	my $unicode_str_out = decode("utf8",$utf8_str_out);
892	my $raw_str_out = utf8::downgrade($unicode_str_out);
893
894	return $raw_str_out;
895	}
896
897	sub analyze_raw_string
898	{
899	my ($str_in) = @_;
900
901	my $uses_bytecodes = 0;
902	my $exceeds_bytecodes = 0;
903
904	map { $exceeds_bytecodes = 1 if ($_ >= 256);
905	$uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
906	} unpack("U*", $str_in); # Unpack Unicode characters
907
908	return ($uses_bytecodes,$exceeds_bytecodes);
909	}
910
911
912	sub convert_utf8_string_to_unicode_string
913	{
914	my $utf8_string = shift(@_);
915
916	my $unicode_string = "";
917	foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
918	$unicode_string .= chr($unicode_value);
919	}
920	return $unicode_string;
921	}
922
923	sub convert_unicode_string_to_utf8_string
924	{
925	my $unicode_string = shift(@_);
926
927	my @unicode_array;
928	for (my $i = 0; $i < length($unicode_string); $i++) {
929	push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
930	}
931	return &unicode::unicode2utf8(\@unicode_array);
932	}
933
934
935	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/unicode.pm

Download in other formats: