Context Navigation

source: main/trunk/greenstone2/perllib/unicode.pm@ 34393

Last change on this file since 34393 was 34393, checked in by ak19, 4 years ago
Commit of independent code update before committing a bugfix. In this commit, the results of base64 encoding can have linebreaks, which aren't what we want or expect, so have a flag for removing them.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 24.8 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2004 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33	eval {require bytes};
34
35	use encodings;
36	use strict;
37	use util;
38	use FileUtils;
39	use MIME::Base64; # for base64 encoding
40
41	no strict 'refs';
42
43
44
45	sub utf8decomp
46	{
47	my ($str) = @_;
48
49	return if (!defined $str);
50	return "" if ($str eq "");
51
52	my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
53
54	my @each_char
55	= map { ($_ > 255 )
56	? # if wide character...
57	sprintf("\\x{%04X}", $_)
58	: # \x{...}
59	(chr($_) =~ m/[[:cntrl:]]/ )
60	? # else if control character ...
61	sprintf("\\x%02X", $_)
62	: # \x..
63	quotemeta(chr($_)) # else quoted or as themselves
64	} @unpacked_chars;
65
66	return join("",@each_char);
67	}
68
69
70	sub hex_codepoint {
71	if (my $char = shift) {
72	return sprintf '%2.2x', unpack('U0U*', $char);
73	}
74	}
75
76
77
78
79	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
80	# and returns a unicode array.
81	sub ascii2unicode {
82	my ($in) = @_;
83	my $out = [];
84
85	my $i = 0;
86	my $len = length($in);
87	while ($i < $len) {
88	push (@$out, ord(substr ($in, $i, 1)));
89	$i++;
90	}
91
92	return $out;
93	}
94
95	# ascii2utf8 takes a reference to an (extended) ascii string and returns a
96	# UTF-8 encoded string. This is just a faster version of
97	# "&unicode2utf8(&ascii2unicode($str));"
98	# "Extended ascii" really means "iso_8859_1"
99	sub ascii2utf8 {
100	my ($in) = @_;
101	my $out = "";
102
103	if (!defined($in)\|\| !defined($$in)) {
104	return $out;
105	}
106
107	my ($c);
108	my $i = 0;
109	my $len = length($$in);
110	while ($i < $len) {
111	$c = ord (substr ($$in, $i, 1));
112	if ($c < 0x80) {
113	# ascii character
114	$out .= chr ($c);
115
116	} else {
117	# extended ascii character
118	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
119	$out .= chr (0x80 + ($c & 0x3f));
120	}
121	$i++;
122	}
123
124	return $out;
125	}
126
127	# unicode2utf8 takes a unicode array as input and encodes it
128	# using utf-8
129	sub unicode2utf8 {
130	my ($in) = @_;
131	my $out = "";
132
133	foreach my $num (@$in) {
134	next unless defined $num;
135	if ($num < 0x80) {
136	$out .= chr ($num);
137
138	} elsif ($num < 0x800) {
139	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
140	$out .= chr (0x80 + ($num & 0x3f));
141
142	} elsif ($num < 0xFFFF) {
143	$out .= chr (0xe0 + (($num >> 12) & 0xf));
144	$out .= chr (0x80 + (($num >> 6) & 0x3f));
145	$out .= chr (0x80 + ($num & 0x3f));
146
147	} else {
148	# error, don't encode anything
149	#die;
150	# Diego's bugfix: instead of aborting the import process, it
151	# is better to get a converted file with a few extra spaces
152	print STDERR "strange char: $num\n";
153	$out .= " ";
154
155	}
156	}
157	return $out;
158	}
159
160	# utf82unicode takes a utf-8 string and produces a unicode
161	# array
162	sub utf82unicode {
163	my ($in) = @_;
164	my $out = [];
165
166	if(!defined $in) {
167	return $out;
168	}
169
170	my $i = 0;
171	my ($c1, $c2, $c3);
172	my $len = length($in);
173	while ($i < $len) {
174	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
175	# normal ascii character
176	push (@$out, $c1);
177
178	} elsif ($c1 < 0xc0) {
179	# error, was expecting the first byte of an
180	# encoded character. Do nothing.
181
182	} elsif ($c1 < 0xe0 && $i+1 < $len) {
183	# an encoded character with two bytes
184	$c2 = ord (substr ($in, $i+1, 1));
185	if ($c2 >= 0x80 && $c2 < 0xc0) {
186	# everything looks ok
187	push (@$out, ((($c1 & 0x1f) << 6) +
188	($c2 & 0x3f)));
189	$i++; # gobbled an extra byte
190	}
191
192	} elsif ($c1 < 0xf0 && $i+2 < $len) {
193	# an encoded character with three bytes
194	$c2 = ord (substr ($in, $i+1, 1));
195	$c3 = ord (substr ($in, $i+2, 1));
196	if ($c2 >= 0x80 && $c2 < 0xc0 &&
197	$c3 >= 0x80 && $c3 < 0xc0) {
198	# everything looks ok
199	push (@$out, ((($c1 & 0xf) << 12) +
200	(($c2 & 0x3f) << 6) +
201	($c3 & 0x3f)));
202
203	$i += 2; # gobbled an extra two bytes
204	}
205
206	} else {
207	# error, only decode Unicode characters not full UCS.
208	# Do nothing.
209	}
210
211	$i++;
212	}
213
214	return $out;
215	}
216
217	# unicode2ucs2 takes a unicode array and produces a UCS-2
218	# unicode string (every two bytes forms a unicode character)
219	sub unicode2ucs2 {
220	my ($in) = @_;
221	my $out = "";
222
223	foreach my $num (@$in) {
224	$out .= chr (($num & 0xff00) >> 8);
225	$out .= chr ($num & 0xff);
226	}
227
228	return $out;
229	}
230
231	# ucs22unicode takes a UCS-2 string and produces a unicode array
232	sub ucs22unicode {
233	my ($in) = @_;
234	my $out = [];
235
236	my $i = 0;
237	my $len = length ($in);
238	while ($i+1 < $len) {
239	push (@$out, ord (substr($in, $i, 1)) << 8 +
240	ord (substr($in, $i+1, 1)));
241
242	$i ++;
243	}
244
245	return $out;
246	}
247
248	# takes a reference to a string and returns a reference to a unicode array
249	sub convert2unicode {
250	my ($encoding, $textref) = @_;
251
252	if (!defined $encodings::encodings->{$encoding}) {
253	print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
254	return [];
255	}
256
257	my $encodename = "$encoding-unicode";
258	my $enc_info = $encodings::encodings->{$encoding};
259	my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
260	"to_uc", $enc_info->{'mapfile'});
261	if (!&loadmapencoding ($encodename, $mapfile)) {
262	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
263	return [];
264	}
265
266	if (defined $enc_info->{'converter'}) {
267	my $converter = $enc_info->{'converter'};
268	return &$converter ($encodename, $textref);
269	}
270
271	if ($unicode::translations{$encodename}->{'count'} == 1) {
272	return &singlebyte2unicode ($encodename, $textref);
273	} else {
274	return &doublebyte2unicode ($encodename, $textref);
275	}
276	}
277
278	# singlebyte2unicode converts simple 8 bit encodings where characters below
279	# 0x80 are normal ascii characters and the rest are decoded using the
280	# appropriate mapping files.
281	#
282	# Examples of encodings that may be converted using singlebyte2unicode are
283	# the iso-8859 and windows-125* series.
284	sub singlebyte2unicode {
285	my ($encodename, $textref) = @_;
286
287	my @outtext = ();
288	my $len = length($$textref);
289	my ($c);
290	my $i = 0;
291
292	while ($i < $len) {
293	if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
294	# normal ascii character
295	push (@outtext, $c);
296	} else {
297	$c = &transchar ($encodename, $c);
298	# put a black square if cannot translate
299	$c = 0x25A1 if $c == 0;
300	push (@outtext, $c);
301	}
302	$i ++;
303	}
304	return \@outtext;
305	}
306
307	# doublebyte2unicode converts simple two byte encodings where characters
308	# below code point 0x80 are single-byte characters and the rest are
309	# double-byte characters.
310	#
311	# Examples of encodings that may be converted using doublebyte2unicode are
312	# CJK encodings like GB encoded Chinese and UHC Korean.
313	#
314	# Note that no error checking is performed to make sure that the input text
315	# is valid for the given encoding.
316	#
317	# Also, encodings that may contain characters of more than two bytes are
318	# not supported (any EUC encoded text may in theory contain 3-byte
319	# characters but in practice only one and two byte characters are used).
320	sub doublebyte2unicode {
321	my ($encodename, $textref) = @_;
322
323	my @outtext = ();
324	my $len = length($$textref);
325	my ($c1, $c2);
326	my $i = 0;
327
328	while ($i < $len) {
329	if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
330	if ($i+1 < $len) {
331	# double-byte character
332	$c2 = ord(substr($$textref, $i+1, 1));
333	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
334	# put a black square if cannot translate
335	$c = 0x25A1 if $c == 0;
336	push (@outtext, $c);
337	$i += 2;
338
339	} else {
340	# error
341	print STDERR "unicode: ERROR missing second half of double-byte character\n";
342	$i++;
343	}
344
345	} else {
346	# single-byte character
347	push (@outtext, $c1);
348	$i++;
349	}
350	}
351	return \@outtext;
352	}
353
354	# Shift-JIS to unicode
355	# We can't use doublebyte2unicode for Shift-JIS because it uses some
356	# single-byte characters above code point 0x80 (i.e. half-width katakana
357	# characters in the range 0xA1-0xDF)
358	sub shiftjis2unicode {
359	my ($encodename, $textref) = @_;
360
361	my @outtext = ();
362	my $len = length($$textref);
363	my ($c1, $c2);
364	my $i = 0;
365
366	while ($i < $len) {
367	$c1 = ord(substr($$textref, $i, 1));
368
369	if (($c1 >= 0xA1 && $c1 <= 0xDF) \|\| $c1 == 0x5c \|\| $c1 == 0x7E) {
370	# Single-byte half-width katakana character or
371	# JIS Roman yen or overline characters
372	my $c = &transchar ($encodename, $c1);
373	# - put a black square if cannot translate
374	$c = 0x25A1 if $c == 0;
375	push (@outtext, $c);
376	$i++;
377
378	} elsif ($c1 < 0x80) {
379	# ASCII
380	push (@outtext, $c1);
381	$i ++;
382
383	} elsif ($c1 < 0xEF) {
384	if ($i+1 < $len) {
385	$c2 = ord(substr($$textref, $i+1, 1));
386	if (($c2 >= 0x40 && $c2 <= 0x7E) \|\| ($c2 >= 0x80 && $c2 <= 0xFC)) {
387	# Double-byte shift-jis character
388	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
389	# put a black square if cannot translate
390	$c = 0x25A1 if $c == 0;
391	push (@outtext, $c);
392	} else {
393	# error
394	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395	}
396	$i += 2;
397	} else {
398	# error
399	print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
400	$i ++;
401	}
402	} else {
403	# error
404	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
405	$i ++;
406	}
407	}
408	return \@outtext;
409	}
410
411	sub transchar {
412	my ($encoding, $from) = @_;
413	my $high = ($from / 256) % 256;
414	my $low = $from % 256;
415
416	return 0 unless defined $unicode::translations{$encoding};
417
418	my $block = $unicode::translations{$encoding}->{'map'};
419
420	if (ref ($block->[$high]) ne "ARRAY") {
421	return 0;
422	}
423	return $block->[$high]->[$low];
424	}
425
426	# %translations is of the form:
427	#
428	# encodings{encodingname-encodingname}->{'map'}->blocktranslation
429	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
430	#
431	# Any of the top translation blocks can point to an undefined
432	# value. This data structure aims to allow fast translation and
433	# efficient storage.
434	%unicode::translations = ();
435
436	# @array256 is used for initialisation, there must be
437	# a better way...
438	# What about this?: @array256 = (0) x 256;
439	@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
445	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
446	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
447	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
448	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
449	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
450	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
451	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
452	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
453	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
454	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
455
456	# returns 1 if successful, 0 if unsuccessful
457	sub loadmapencoding {
458	my ($encoding, $mapfile) = @_;
459
460	# check to see if the encoding has already been loaded
461	return 1 if (defined $unicode::translations{$encoding});
462
463	if (! -r $mapfile \|\| -d $mapfile) {
464	return 0;
465	}
466	return 0 unless open (MAPFILE, $mapfile);
467	binmode (MAPFILE);
468
469	$unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
470	my $block = $unicode::translations{$encoding};
471
472	my ($in,$i,$j);
473	while (1) {
474	my $ret=read(MAPFILE, $in, 1);
475	if (!defined($ret)) { # error
476	print STDERR "unicode.pm: error reading mapfile: $!\n";
477	last;
478	}
479	if ($ret != 1) { last }
480	$i = unpack ("C", $in);
481	$block->{'map'}->[$i] = [@unicode::array256];
482	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
483	my ($n1, $n2) = unpack ("CC", $in);
484	$block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
485	}
486	$block->{'count'} ++;
487	}
488
489	close (MAPFILE);
490	}
491
492	# unicode2singlebyte converts unicode to simple 8 bit encodings where
493	# characters below 0x80 are normal ascii characters and the rest are encoded
494	# using the appropriate mapping files.
495	#
496	# Examples of encodings that may be converted using unicode2singlebyte are
497	# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
498	sub unicode2singlebyte {
499	my ($uniref, $encoding) = @_;
500
501	my $outtext = "";
502	my $encodename = "unicode-$encoding";
503
504	if (!exists $encodings::encodings->{$encoding}) {
505	print STDERR "unicode.pm: ERROR - unsupported encoding "
506	. "'$encoding' requested\n";
507	return "";
508	}
509
510	my $enc_info = $encodings::encodings->{$encoding};
511	my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
512	"from_uc", $enc_info->{'mapfile'});
513	if (!&loadmapencoding ($encodename, $mapfile)) {
514	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
515	return "";
516	}
517
518	foreach my $c (@$uniref) {
519	if ($c < 0x80) {
520	# normal ascii character
521	$outtext .= chr($c);
522	} else {
523	# extended ascii character
524	$c = &transchar ($encodename, $c);
525
526	# put a question mark if cannot translate
527	if ($c == 0) {
528	$outtext .= "?";
529	} else {
530	$outtext .= chr($c);
531	}
532	}
533	}
534	return $outtext;
535	}
536
537
538	# this makes sure that the referenced input string is utf8 encoded, and
539	# will change/remove bytes that aren't.
540	# returns 0 if the text was already utf8, or 1 if text modified to become utf8
541	sub ensure_utf8 {
542	my $stringref=shift;
543
544	if (!defined($stringref) \|\| ref($stringref) ne 'SCALAR') {
545	return $stringref;
546	}
547
548	my $value=$$stringref;
549
550	my $non_utf8_found = 0;
551	$value =~ m/^/g; # to set \G
552	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
553	my $highbytes=$1;
554	my $highbyteslength=length($highbytes);
555	# make sure this block of high bytes is utf-8
556	$highbytes =~ /^/g; # set pos()
557	my $byte_replaced = 0;
558	while ($highbytes =~
559	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
560	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
561	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
562	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
563	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
564	)*([\x80-\xff])? !xg
565	) {
566	# this highbyte is "out-of-place" for valid utf-8
567	my $badbyte=$1;
568	if (!defined $badbyte) {next} # hit end of string
569	my $pos=pos($highbytes);
570	# replace bad byte. assume iso-8859-1 -> utf-8
571	# ascii2utf8 does "extended ascii"... ie iso-8859-1
572	my $replacement=&unicode::ascii2utf8(\$badbyte);
573	substr($highbytes, $pos-1, 1, $replacement);
574	# update the position to continue searching (for \G)
575	pos($highbytes) = $pos+length($replacement)-1;
576	$byte_replaced = 1;
577	}
578	if ($byte_replaced) {
579	# replace this block of high bytes in the $value
580	$non_utf8_found = 1;
581	my $replength=length($highbytes); # we've changed the length
582	my $textpos=pos($value); # pos at end of last match
583	# replace bad bytes with good bytes
584	substr($value, $textpos-$highbyteslength,
585	$highbyteslength, $highbytes);
586	# update the position to continue searching (for \G)
587	pos($value)=$textpos+($replength-$highbyteslength)+1;
588	}
589	}
590
591	$$stringref = $value;
592	return $non_utf8_found;
593	}
594
595	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
596	# Does not modify the string parameter.
597	sub check_is_utf8 {
598	my $value=shift;
599
600	if (!defined($value)) {
601	return 0; # not utf8 because it is undefined
602	}
603
604	$value =~ m/^/g; # to set \G
605	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
606	my $highbytes=$1;
607	# make sure this block of high bytes is utf-8
608	$highbytes =~ /^/g; # set pos()
609	while ($highbytes =~
610	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
611	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
612	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
613	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
614	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
615	)*([\x80-\xff])? !xg
616	) {
617	my $badbyte=$1;
618	if (defined $badbyte) { # not end of string
619	return 0; # non-utf8 found
620	}
621	}
622	}
623
624	return 1;
625	}
626
627	sub url_encode {
628	my ($text) = @_;
629
630	if (!&is_url_encoded($text)) {
631	$text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
632	# return the url-encoded character entity for underscore back to the entity
633	$text =~ s/%26%23095%3B/&\#095;/g;
634	}
635	return $text;
636	}
637
638	sub url_decode {
639	my ($text,$and_numeric_entities) = @_;
640
641	if(defined $text) {
642	$text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
643
644	if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
645	$text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
646	$text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
647	}
648	}
649
650	return $text;
651	}
652
653	sub url_decode_utf8 {
654	my ($text,$and_numeric_entities) = @_;
655
656	$text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
657
658	$text = Encode::decode("utf8",$text);
659
660	return $text;
661	}
662
663	sub is_url_encoded {
664	my ($text) = @_;
665	return ($text =~ m/\%([0-9A-F]{2})/i) \|\| ($text =~ m/\&\#x([0-9A-F]+;)/i) \|\| ($text =~ m/\&\#([0-9]+;)/i);
666	}
667
668	# When a filename on the filesystem is already URL-encoded, the
669	# URL to it will have %25s in place of every % sign, so that
670	# URLs in html pages can refer to the URL-encoded filename.
671	# This method changes the URL reference back into the actual
672	# (URL-encoded) filename on the filesystem by replacing %25 with %.
673	sub url_to_filename {
674	my ($text) =@_;
675	$text =~ s/%25/%/g if &is_url_encoded($text);
676	# DM safing would have replaced underscores with character entity _
677	# in SourceFile meta. Undo any such change to get the filename referred to.
678	$text =~ s/&\#095;/_/g;
679	return $text;
680	}
681
682	# When a filename on the filesystem is already URL-encoded, the
683	# URL to it will have %25s in place of every % sign, so that
684	# URLs in html pages can refer to the URL-encoded filename.
685	# Given a (URL-encoded) filename on the filesystem, this subroutine
686	# returns the URL reference string for it by replacing % with %25.
687	# The output string will be the same as the input string if the input
688	# already contains one or more %25s. This is to prevent processing
689	# a url more than once this way.
690	sub filename_to_url {
691	my ($text) = @_;
692
693	if($text !~ m/%25/) {
694	$text =~ s/%/%25/g;
695	}
696	return $text;
697	}
698
699	# Base64 encoding does not encode a pure ASCII to itself. This is important to know.
700	# If the $force_encode parameter is true, then this method WILL base64 encode whatever
701	# string is passed in, including any plain ASCII string.
702	# That means this method could double encode an already encoded string.
703	# However, this method is necessary because on the GLI end, we can't detect whether a plain
704	# ASCII string has been encoded or not. And if gsdlsourcefilerenamemethod is set to base64,
705	# then gli will always attempt to decode all parts of the relative path gsdlsourcefilename
706	# (except the "import" prefix, which is special) or none of the parts.
707	sub force_base64_encode {
708	my ($text) = @_;
709	my $force_encode = 1;
710	my $no_newlines = 1;
711	return &base64_encode($text, $force_encode, $no_newlines);
712	}
713
714	sub base64_encode {
715	my ($text, $force_encode, $no_newlines) = @_;
716	if($force_encode \|\| !&conforms_to_mod_base64($text)) {
717	# return entity for underscore to underscore before encoding
718	$text =~ s/&\#095;/_/g;
719
720	$text = &MIME::Base64::encode_base64($text);
721	# base64 encoding may introduce + and / signs,
722	# replacing them with - and _ to ensure it's filename-safe
723	$text =~ s/\+/\-/g; # + -> -
724	$text =~ s/\//\_/g; # / -> _
725
726	# by default base64 encoding a long string introduces newlines to break long strings,
727	# see https://stackoverflow.com/questions/19952621/is-it-ok-to-remove-newline-in-base64-encoding
728	# Java doesn't like that
729	$text =~ s/\n//gs if $no_newlines;
730	}
731	return $text;
732	}
733
734	# If the input fits the modified base64 pattern, this will try decoding it.
735	# Still, this method does not guarantee the return value is the 'original', only
736	# that the result is where the base64 decoding process has been applied once.
737	# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
738	sub base64_decode {
739	my ($text) = @_;
740	if(&conforms_to_mod_base64($text)) {
741	# base64 encodes certain chars with + and /, but if we'd encoded it, we'd
742	# have replaced them with - and _ respectively. Undo this before decoding.
743	$text =~ s/\-/\+/g; # - -> +
744	$text =~ s/\_/\//g; # _ -> /
745	$text = &MIME::Base64::decode_base64($text);
746	}
747	return $text;
748	}
749
750	# Returns true if the given string is compatible with a modified version
751	# of base64 (where the + and / are replaced with - and _), a format which
752	# includes also regular ASCII alphanumeric values. This method does not
753	# guarantee that the given string is actually base64 encoded, since it will
754	# return true for any simple alphanumeric ASCII string as well.
755	sub conforms_to_mod_base64 {
756	my ($text) = @_;
757
758	# need to treat the entity ref for underscore as underscore
759	$text =~ s/&\#095;/_/g;
760
761	# base 64 takes alphanumeric and [=+/],
762	# but we use modified base64 where + and / are replaced with - and _
763	return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
764	}
765
766	sub substr
767	{
768	my ($utf8_string, $offset, $length) = @_;
769
770	my @unicode_string = @{&utf82unicode($utf8_string)};
771	my $unicode_string_length = scalar(@unicode_string);
772
773	my $substr_start = $offset;
774	if ($substr_start >= $unicode_string_length) {
775	return "";
776	}
777
778	my $substr_end = $offset + $length - 1;
779	if ($substr_end >= $unicode_string_length) {
780	$substr_end = $unicode_string_length - 1;
781	}
782
783	my @unicode_substring = @unicode_string[$substr_start..$substr_end];
784	return &unicode2utf8(\@unicode_substring);
785	}
786
787	# perl version of stringToHex
788	# Useful method to print UTF8 (or other unicode) for debugging.
789	# Characters that are easily displayed (that is, printable ASCII)
790	# are shown as-is, whereas hex values of the unicode code points
791	# are shown for all other chars.
792	sub debug_unicode_string
793	{
794	join("",
795	map { $_ > 127 ? # if wide character...
796	sprintf("\\x{%04X}", $_) : # \x{...}
797	chr($_)
798	} unpack("U*", $_[0])); # unpack Unicode characters
799	}
800
801	sub raw_filename_to_url_encoded
802	{
803	my ($str_in) = @_;
804
805	my @url_encoded_chars
806	= map { $_ > 255 ? # Needs to be represent in entity form
807	sprintf("&#x%X;",$_) :
808	$_>127 \|\| $_==ord("%") ? # Representable in %XX form
809	sprintf("%%%2X", $_) :
810	chr($_) # otherwise, Ascii char
811	} unpack("U*", $str_in); # Unpack Unicode characters
812
813
814	my $str_out = join("", @url_encoded_chars);
815
816	return $str_out;
817
818	}
819
820	sub url_encoded_to_raw_filename
821	{
822	my ($str_in) = @_;
823
824	my $str_out = $str_in;
825
826	$str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
827	$str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
828	$str_out =~ s/&#([0-9]+);/chr($1)/eig;
829
830	return $str_out;
831	}
832
833
834	sub raw_filename_to_utf8_url_encoded
835	{
836	my ($str_in) = @_;
837
838	$str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
839
840	my @url_encoded_chars
841	= map { $_ > 127 ? # Representable in %XX form
842	sprintf("%%%2X", $_) :
843	chr($_) # otherwise, Ascii char
844	} unpack("U*", $str_in); # Unpack utf8 characters
845
846
847	my $str_out = join("", @url_encoded_chars);
848
849	return $str_out;
850
851	}
852
853	sub utf8_url_encoded_to_raw_filename
854	{
855	my ($str_in) = @_;
856
857	my $utf8_str_out = $str_in;
858
859	$utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
860
861	my $unicode_str_out = decode("utf8",$utf8_str_out);
862	my $raw_str_out = utf8::downgrade($unicode_str_out);
863
864	return $raw_str_out;
865	}
866
867	sub analyze_raw_string
868	{
869	my ($str_in) = @_;
870
871	my $uses_bytecodes = 0;
872	my $exceeds_bytecodes = 0;
873
874	map { $exceeds_bytecodes = 1 if ($_ >= 256);
875	$uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
876	} unpack("U*", $str_in); # Unpack Unicode characters
877
878	return ($uses_bytecodes,$exceeds_bytecodes);
879	}
880
881
882	sub convert_utf8_string_to_unicode_string
883	{
884	my $utf8_string = shift(@_);
885
886	my $unicode_string = "";
887	foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
888	$unicode_string .= chr($unicode_value);
889	}
890	return $unicode_string;
891	}
892
893	sub convert_unicode_string_to_utf8_string
894	{
895	my $unicode_string = shift(@_);
896
897	my @unicode_array;
898	for (my $i = 0; $i < length($unicode_string); $i++) {
899	push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
900	}
901	return &unicode::unicode2utf8(\@unicode_array);
902	}
903
904
905	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: