Context Navigation

source: main/trunk/greenstone2/perllib/unicode.pm@ 33757

Last change on this file since 33757 was 33757, checked in by ak19, 4 years ago
Windows bugfix for getting exMeta to be loaded into GLI where there are subdirs involved in the Gather pane, or there are non-ASCII filenames, or the file rename method is set to base64. 2. Bugfix for Linux and Windows: Using Base64 to rename files was still a problem despite the previous commit (which was supposed to have fixed all GLI exMeta loading issues on Linux) in the special case where a subfolder was pure ASCII. The perl code wouldn't base64 encode such subdirs. However, GLI won't know which part of a relative file path to decode based on the file rename method used and which parts are not to be decoded. So GLI uniformly decoded them, and ASCII named subfolders that were not base64 encoded (but contained files that were to be renamed with base64) got base64 decoded into garbage, so that exMeta still did not get attached. 3. This commit contains debug stmts.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 24.5 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2004 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33	eval {require bytes};
34
35	use encodings;
36	use strict;
37	use util;
38	use FileUtils;
39	use MIME::Base64; # for base64 encoding
40
41	no strict 'refs';
42
43
44
45	sub utf8decomp
46	{
47	my ($str) = @_;
48
49	return if (!defined $str);
50	return "" if ($str eq "");
51
52	my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
53
54	my @each_char
55	= map { ($_ > 255 )
56	? # if wide character...
57	sprintf("\\x{%04X}", $_)
58	: # \x{...}
59	(chr($_) =~ m/[[:cntrl:]]/ )
60	? # else if control character ...
61	sprintf("\\x%02X", $_)
62	: # \x..
63	quotemeta(chr($_)) # else quoted or as themselves
64	} @unpacked_chars;
65
66	return join("",@each_char);
67	}
68
69
70	sub hex_codepoint {
71	if (my $char = shift) {
72	return sprintf '%2.2x', unpack('U0U*', $char);
73	}
74	}
75
76
77
78
79	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
80	# and returns a unicode array.
81	sub ascii2unicode {
82	my ($in) = @_;
83	my $out = [];
84
85	my $i = 0;
86	my $len = length($in);
87	while ($i < $len) {
88	push (@$out, ord(substr ($in, $i, 1)));
89	$i++;
90	}
91
92	return $out;
93	}
94
95	# ascii2utf8 takes a reference to an (extended) ascii string and returns a
96	# UTF-8 encoded string. This is just a faster version of
97	# "&unicode2utf8(&ascii2unicode($str));"
98	# "Extended ascii" really means "iso_8859_1"
99	sub ascii2utf8 {
100	my ($in) = @_;
101	my $out = "";
102
103	if (!defined($in)\|\| !defined($$in)) {
104	return $out;
105	}
106
107	my ($c);
108	my $i = 0;
109	my $len = length($$in);
110	while ($i < $len) {
111	$c = ord (substr ($$in, $i, 1));
112	if ($c < 0x80) {
113	# ascii character
114	$out .= chr ($c);
115
116	} else {
117	# extended ascii character
118	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
119	$out .= chr (0x80 + ($c & 0x3f));
120	}
121	$i++;
122	}
123
124	return $out;
125	}
126
127	# unicode2utf8 takes a unicode array as input and encodes it
128	# using utf-8
129	sub unicode2utf8 {
130	my ($in) = @_;
131	my $out = "";
132
133	foreach my $num (@$in) {
134	next unless defined $num;
135	if ($num < 0x80) {
136	$out .= chr ($num);
137
138	} elsif ($num < 0x800) {
139	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
140	$out .= chr (0x80 + ($num & 0x3f));
141
142	} elsif ($num < 0xFFFF) {
143	$out .= chr (0xe0 + (($num >> 12) & 0xf));
144	$out .= chr (0x80 + (($num >> 6) & 0x3f));
145	$out .= chr (0x80 + ($num & 0x3f));
146
147	} else {
148	# error, don't encode anything
149	#die;
150	# Diego's bugfix: instead of aborting the import process, it
151	# is better to get a converted file with a few extra spaces
152	print STDERR "strange char: $num\n";
153	$out .= " ";
154
155	}
156	}
157	return $out;
158	}
159
160	# utf82unicode takes a utf-8 string and produces a unicode
161	# array
162	sub utf82unicode {
163	my ($in) = @_;
164	my $out = [];
165
166	if(!defined $in) {
167	return $out;
168	}
169
170	my $i = 0;
171	my ($c1, $c2, $c3);
172	my $len = length($in);
173	while ($i < $len) {
174	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
175	# normal ascii character
176	push (@$out, $c1);
177
178	} elsif ($c1 < 0xc0) {
179	# error, was expecting the first byte of an
180	# encoded character. Do nothing.
181
182	} elsif ($c1 < 0xe0 && $i+1 < $len) {
183	# an encoded character with two bytes
184	$c2 = ord (substr ($in, $i+1, 1));
185	if ($c2 >= 0x80 && $c2 < 0xc0) {
186	# everything looks ok
187	push (@$out, ((($c1 & 0x1f) << 6) +
188	($c2 & 0x3f)));
189	$i++; # gobbled an extra byte
190	}
191
192	} elsif ($c1 < 0xf0 && $i+2 < $len) {
193	# an encoded character with three bytes
194	$c2 = ord (substr ($in, $i+1, 1));
195	$c3 = ord (substr ($in, $i+2, 1));
196	if ($c2 >= 0x80 && $c2 < 0xc0 &&
197	$c3 >= 0x80 && $c3 < 0xc0) {
198	# everything looks ok
199	push (@$out, ((($c1 & 0xf) << 12) +
200	(($c2 & 0x3f) << 6) +
201	($c3 & 0x3f)));
202
203	$i += 2; # gobbled an extra two bytes
204	}
205
206	} else {
207	# error, only decode Unicode characters not full UCS.
208	# Do nothing.
209	}
210
211	$i++;
212	}
213
214	return $out;
215	}
216
217	# unicode2ucs2 takes a unicode array and produces a UCS-2
218	# unicode string (every two bytes forms a unicode character)
219	sub unicode2ucs2 {
220	my ($in) = @_;
221	my $out = "";
222
223	foreach my $num (@$in) {
224	$out .= chr (($num & 0xff00) >> 8);
225	$out .= chr ($num & 0xff);
226	}
227
228	return $out;
229	}
230
231	# ucs22unicode takes a UCS-2 string and produces a unicode array
232	sub ucs22unicode {
233	my ($in) = @_;
234	my $out = [];
235
236	my $i = 0;
237	my $len = length ($in);
238	while ($i+1 < $len) {
239	push (@$out, ord (substr($in, $i, 1)) << 8 +
240	ord (substr($in, $i+1, 1)));
241
242	$i ++;
243	}
244
245	return $out;
246	}
247
248	# takes a reference to a string and returns a reference to a unicode array
249	sub convert2unicode {
250	my ($encoding, $textref) = @_;
251
252	if (!defined $encodings::encodings->{$encoding}) {
253	print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
254	return [];
255	}
256
257	my $encodename = "$encoding-unicode";
258	my $enc_info = $encodings::encodings->{$encoding};
259	my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
260	"to_uc", $enc_info->{'mapfile'});
261	if (!&loadmapencoding ($encodename, $mapfile)) {
262	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
263	return [];
264	}
265
266	if (defined $enc_info->{'converter'}) {
267	my $converter = $enc_info->{'converter'};
268	return &$converter ($encodename, $textref);
269	}
270
271	if ($unicode::translations{$encodename}->{'count'} == 1) {
272	return &singlebyte2unicode ($encodename, $textref);
273	} else {
274	return &doublebyte2unicode ($encodename, $textref);
275	}
276	}
277
278	# singlebyte2unicode converts simple 8 bit encodings where characters below
279	# 0x80 are normal ascii characters and the rest are decoded using the
280	# appropriate mapping files.
281	#
282	# Examples of encodings that may be converted using singlebyte2unicode are
283	# the iso-8859 and windows-125* series.
284	sub singlebyte2unicode {
285	my ($encodename, $textref) = @_;
286
287	my @outtext = ();
288	my $len = length($$textref);
289	my ($c);
290	my $i = 0;
291
292	while ($i < $len) {
293	if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
294	# normal ascii character
295	push (@outtext, $c);
296	} else {
297	$c = &transchar ($encodename, $c);
298	# put a black square if cannot translate
299	$c = 0x25A1 if $c == 0;
300	push (@outtext, $c);
301	}
302	$i ++;
303	}
304	return \@outtext;
305	}
306
307	# doublebyte2unicode converts simple two byte encodings where characters
308	# below code point 0x80 are single-byte characters and the rest are
309	# double-byte characters.
310	#
311	# Examples of encodings that may be converted using doublebyte2unicode are
312	# CJK encodings like GB encoded Chinese and UHC Korean.
313	#
314	# Note that no error checking is performed to make sure that the input text
315	# is valid for the given encoding.
316	#
317	# Also, encodings that may contain characters of more than two bytes are
318	# not supported (any EUC encoded text may in theory contain 3-byte
319	# characters but in practice only one and two byte characters are used).
320	sub doublebyte2unicode {
321	my ($encodename, $textref) = @_;
322
323	my @outtext = ();
324	my $len = length($$textref);
325	my ($c1, $c2);
326	my $i = 0;
327
328	while ($i < $len) {
329	if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
330	if ($i+1 < $len) {
331	# double-byte character
332	$c2 = ord(substr($$textref, $i+1, 1));
333	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
334	# put a black square if cannot translate
335	$c = 0x25A1 if $c == 0;
336	push (@outtext, $c);
337	$i += 2;
338
339	} else {
340	# error
341	print STDERR "unicode: ERROR missing second half of double-byte character\n";
342	$i++;
343	}
344
345	} else {
346	# single-byte character
347	push (@outtext, $c1);
348	$i++;
349	}
350	}
351	return \@outtext;
352	}
353
354	# Shift-JIS to unicode
355	# We can't use doublebyte2unicode for Shift-JIS because it uses some
356	# single-byte characters above code point 0x80 (i.e. half-width katakana
357	# characters in the range 0xA1-0xDF)
358	sub shiftjis2unicode {
359	my ($encodename, $textref) = @_;
360
361	my @outtext = ();
362	my $len = length($$textref);
363	my ($c1, $c2);
364	my $i = 0;
365
366	while ($i < $len) {
367	$c1 = ord(substr($$textref, $i, 1));
368
369	if (($c1 >= 0xA1 && $c1 <= 0xDF) \|\| $c1 == 0x5c \|\| $c1 == 0x7E) {
370	# Single-byte half-width katakana character or
371	# JIS Roman yen or overline characters
372	my $c = &transchar ($encodename, $c1);
373	# - put a black square if cannot translate
374	$c = 0x25A1 if $c == 0;
375	push (@outtext, $c);
376	$i++;
377
378	} elsif ($c1 < 0x80) {
379	# ASCII
380	push (@outtext, $c1);
381	$i ++;
382
383	} elsif ($c1 < 0xEF) {
384	if ($i+1 < $len) {
385	$c2 = ord(substr($$textref, $i+1, 1));
386	if (($c2 >= 0x40 && $c2 <= 0x7E) \|\| ($c2 >= 0x80 && $c2 <= 0xFC)) {
387	# Double-byte shift-jis character
388	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
389	# put a black square if cannot translate
390	$c = 0x25A1 if $c == 0;
391	push (@outtext, $c);
392	} else {
393	# error
394	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395	}
396	$i += 2;
397	} else {
398	# error
399	print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
400	$i ++;
401	}
402	} else {
403	# error
404	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
405	$i ++;
406	}
407	}
408	return \@outtext;
409	}
410
411	sub transchar {
412	my ($encoding, $from) = @_;
413	my $high = ($from / 256) % 256;
414	my $low = $from % 256;
415
416	return 0 unless defined $unicode::translations{$encoding};
417
418	my $block = $unicode::translations{$encoding}->{'map'};
419
420	if (ref ($block->[$high]) ne "ARRAY") {
421	return 0;
422	}
423	return $block->[$high]->[$low];
424	}
425
426	# %translations is of the form:
427	#
428	# encodings{encodingname-encodingname}->{'map'}->blocktranslation
429	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
430	#
431	# Any of the top translation blocks can point to an undefined
432	# value. This data structure aims to allow fast translation and
433	# efficient storage.
434	%unicode::translations = ();
435
436	# @array256 is used for initialisation, there must be
437	# a better way...
438	# What about this?: @array256 = (0) x 256;
439	@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
445	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
446	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
447	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
448	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
449	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
450	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
451	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
452	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
453	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
454	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
455
456	# returns 1 if successful, 0 if unsuccessful
457	sub loadmapencoding {
458	my ($encoding, $mapfile) = @_;
459
460	# check to see if the encoding has already been loaded
461	return 1 if (defined $unicode::translations{$encoding});
462
463	if (! -r $mapfile \|\| -d $mapfile) {
464	return 0;
465	}
466	return 0 unless open (MAPFILE, $mapfile);
467	binmode (MAPFILE);
468
469	$unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
470	my $block = $unicode::translations{$encoding};
471
472	my ($in,$i,$j);
473	while (1) {
474	my $ret=read(MAPFILE, $in, 1);
475	if (!defined($ret)) { # error
476	print STDERR "unicode.pm: error reading mapfile: $!\n";
477	last;
478	}
479	if ($ret != 1) { last }
480	$i = unpack ("C", $in);
481	$block->{'map'}->[$i] = [@unicode::array256];
482	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
483	my ($n1, $n2) = unpack ("CC", $in);
484	$block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
485	}
486	$block->{'count'} ++;
487	}
488
489	close (MAPFILE);
490	}
491
492	# unicode2singlebyte converts unicode to simple 8 bit encodings where
493	# characters below 0x80 are normal ascii characters and the rest are encoded
494	# using the appropriate mapping files.
495	#
496	# Examples of encodings that may be converted using unicode2singlebyte are
497	# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
498	sub unicode2singlebyte {
499	my ($uniref, $encoding) = @_;
500
501	my $outtext = "";
502	my $encodename = "unicode-$encoding";
503
504	if (!exists $encodings::encodings->{$encoding}) {
505	print STDERR "unicode.pm: ERROR - unsupported encoding "
506	. "'$encoding' requested\n";
507	return "";
508	}
509
510	my $enc_info = $encodings::encodings->{$encoding};
511	my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
512	"from_uc", $enc_info->{'mapfile'});
513	if (!&loadmapencoding ($encodename, $mapfile)) {
514	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
515	return "";
516	}
517
518	foreach my $c (@$uniref) {
519	if ($c < 0x80) {
520	# normal ascii character
521	$outtext .= chr($c);
522	} else {
523	# extended ascii character
524	$c = &transchar ($encodename, $c);
525
526	# put a question mark if cannot translate
527	if ($c == 0) {
528	$outtext .= "?";
529	} else {
530	$outtext .= chr($c);
531	}
532	}
533	}
534	return $outtext;
535	}
536
537
538	# this makes sure that the referenced input string is utf8 encoded, and
539	# will change/remove bytes that aren't.
540	# returns 0 if the text was already utf8, or 1 if text modified to become utf8
541	sub ensure_utf8 {
542	my $stringref=shift;
543
544	if (!defined($stringref) \|\| ref($stringref) ne 'SCALAR') {
545	return $stringref;
546	}
547
548	my $value=$$stringref;
549
550	my $non_utf8_found = 0;
551	$value =~ m/^/g; # to set \G
552	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
553	my $highbytes=$1;
554	my $highbyteslength=length($highbytes);
555	# make sure this block of high bytes is utf-8
556	$highbytes =~ /^/g; # set pos()
557	my $byte_replaced = 0;
558	while ($highbytes =~
559	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
560	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
561	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
562	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
563	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
564	)*([\x80-\xff])? !xg
565	) {
566	# this highbyte is "out-of-place" for valid utf-8
567	my $badbyte=$1;
568	if (!defined $badbyte) {next} # hit end of string
569	my $pos=pos($highbytes);
570	# replace bad byte. assume iso-8859-1 -> utf-8
571	# ascii2utf8 does "extended ascii"... ie iso-8859-1
572	my $replacement=&unicode::ascii2utf8(\$badbyte);
573	substr($highbytes, $pos-1, 1, $replacement);
574	# update the position to continue searching (for \G)
575	pos($highbytes) = $pos+length($replacement)-1;
576	$byte_replaced = 1;
577	}
578	if ($byte_replaced) {
579	# replace this block of high bytes in the $value
580	$non_utf8_found = 1;
581	my $replength=length($highbytes); # we've changed the length
582	my $textpos=pos($value); # pos at end of last match
583	# replace bad bytes with good bytes
584	substr($value, $textpos-$highbyteslength,
585	$highbyteslength, $highbytes);
586	# update the position to continue searching (for \G)
587	pos($value)=$textpos+($replength-$highbyteslength)+1;
588	}
589	}
590
591	$$stringref = $value;
592	return $non_utf8_found;
593	}
594
595	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
596	# Does not modify the string parameter.
597	sub check_is_utf8 {
598	my $value=shift;
599
600	if (!defined($value)) {
601	return 0; # not utf8 because it is undefined
602	}
603
604	$value =~ m/^/g; # to set \G
605	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
606	my $highbytes=$1;
607	# make sure this block of high bytes is utf-8
608	$highbytes =~ /^/g; # set pos()
609	while ($highbytes =~
610	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
611	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
612	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
613	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
614	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
615	)*([\x80-\xff])? !xg
616	) {
617	my $badbyte=$1;
618	if (defined $badbyte) { # not end of string
619	return 0; # non-utf8 found
620	}
621	}
622	}
623
624	return 1;
625	}
626
627	sub url_encode {
628	my ($text) = @_;
629
630	if (!&is_url_encoded($text)) {
631	$text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
632	# return the url-encoded character entity for underscore back to the entity
633	$text =~ s/%26%23095%3B/&\#095;/g;
634	}
635	return $text;
636	}
637
638	sub url_decode {
639	my ($text,$and_numeric_entities) = @_;
640
641	if(defined $text) {
642	$text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
643
644	if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
645	$text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
646	$text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
647	}
648	}
649
650	return $text;
651	}
652
653	sub url_decode_utf8 {
654	my ($text,$and_numeric_entities) = @_;
655
656	$text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
657
658	$text = Encode::decode("utf8",$text);
659
660	return $text;
661	}
662
663	sub is_url_encoded {
664	my ($text) = @_;
665	return ($text =~ m/\%([0-9A-F]{2})/i) \|\| ($text =~ m/\&\#x([0-9A-F]+;)/i) \|\| ($text =~ m/\&\#([0-9]+;)/i);
666	}
667
668	# When a filename on the filesystem is already URL-encoded, the
669	# URL to it will have %25s in place of every % sign, so that
670	# URLs in html pages can refer to the URL-encoded filename.
671	# This method changes the URL reference back into the actual
672	# (URL-encoded) filename on the filesystem by replacing %25 with %.
673	sub url_to_filename {
674	my ($text) =@_;
675	$text =~ s/%25/%/g if &is_url_encoded($text);
676	# DM safing would have replaced underscores with character entity _
677	# in SourceFile meta. Undo any such change to get the filename referred to.
678	$text =~ s/&\#095;/_/g;
679	return $text;
680	}
681
682	# When a filename on the filesystem is already URL-encoded, the
683	# URL to it will have %25s in place of every % sign, so that
684	# URLs in html pages can refer to the URL-encoded filename.
685	# Given a (URL-encoded) filename on the filesystem, this subroutine
686	# returns the URL reference string for it by replacing % with %25.
687	# The output string will be the same as the input string if the input
688	# already contains one or more %25s. This is to prevent processing
689	# a url more than once this way.
690	sub filename_to_url {
691	my ($text) = @_;
692
693	if($text !~ m/%25/) {
694	$text =~ s/%/%25/g;
695	}
696	return $text;
697	}
698
699	# Base64 encoding does not encode a pure ASCII to itself. This is important to know.
700	# If the $force_encode parameter is true, then this method WILL base64 encode whatever
701	# string is passed in, including any plain ASCII string.
702	# That means this method could double encode an already encoded string.
703	# However, this method is necessary because on the GLI end, we can't detect whether a plain
704	# ASCII string has been encoded or not. And if gsdlsourcefilerenamemethod is set to base64,
705	# then gli will always attempt to decode all parts of the relative path gsdlsourcefilename
706	# (except the "import" prefix, which is special) or none of the parts.
707	sub force_base64_encode {
708	my ($text) = @_;
709	my $force_encode = 1;
710	return &base64_encode($text, $force_encode);
711	}
712
713	sub base64_encode {
714	my ($text, $force_encode) = @_;
715	if($force_encode \|\| !&conforms_to_mod_base64($text)) {
716	# return entity for underscore to underscore before encoding
717	$text =~ s/&\#095;/_/g;
718
719	$text = &MIME::Base64::encode_base64($text);
720	# base64 encoding may introduce + and / signs,
721	# replacing them with - and _ to ensure it's filename-safe
722	$text =~ s/\+/\-/g; # + -> -
723	$text =~ s/\//\_/g; # / -> _
724	}
725	return $text;
726	}
727
728	# If the input fits the modified base64 pattern, this will try decoding it.
729	# Still, this method does not guarantee the return value is the 'original', only
730	# that the result is where the base64 decoding process has been applied once.
731	# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
732	sub base64_decode {
733	my ($text) = @_;
734	if(&conforms_to_mod_base64($text)) {
735	# base64 encodes certain chars with + and /, but if we'd encoded it, we'd
736	# have replaced them with - and _ respectively. Undo this before decoding.
737	$text =~ s/\-/\+/g; # - -> +
738	$text =~ s/\_/\//g; # _ -> /
739	$text = &MIME::Base64::decode_base64($text);
740	}
741	return $text;
742	}
743
744	# Returns true if the given string is compatible with a modified version
745	# of base64 (where the + and / are replaced with - and _), a format which
746	# includes also regular ASCII alphanumeric values. This method does not
747	# guarantee that the given string is actually base64 encoded, since it will
748	# return true for any simple alphanumeric ASCII string as well.
749	sub conforms_to_mod_base64 {
750	my ($text) = @_;
751
752	# need to treat the entity ref for underscore as underscore
753	$text =~ s/&\#095;/_/g;
754
755	# base 64 takes alphanumeric and [=+/],
756	# but we use modified base64 where + and / are replaced with - and _
757	return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
758	}
759
760	sub substr
761	{
762	my ($utf8_string, $offset, $length) = @_;
763
764	my @unicode_string = @{&utf82unicode($utf8_string)};
765	my $unicode_string_length = scalar(@unicode_string);
766
767	my $substr_start = $offset;
768	if ($substr_start >= $unicode_string_length) {
769	return "";
770	}
771
772	my $substr_end = $offset + $length - 1;
773	if ($substr_end >= $unicode_string_length) {
774	$substr_end = $unicode_string_length - 1;
775	}
776
777	my @unicode_substring = @unicode_string[$substr_start..$substr_end];
778	return &unicode2utf8(\@unicode_substring);
779	}
780
781	# perl version of stringToHex
782	# Useful method to print UTF8 (or other unicode) for debugging.
783	# Characters that are easily displayed (that is, printable ASCII)
784	# are shown as-is, whereas hex values of the unicode code points
785	# are shown for all other chars.
786	sub debug_unicode_string
787	{
788	join("",
789	map { $_ > 127 ? # if wide character...
790	sprintf("\\x{%04X}", $_) : # \x{...}
791	chr($_)
792	} unpack("U*", $_[0])); # unpack Unicode characters
793	}
794
795
796	sub raw_filename_to_url_encoded
797	{
798	my ($str_in) = @_;
799
800	my @url_encoded_chars
801	= map { $_ > 255 ? # Needs to be represent in entity form
802	sprintf("&#x%X;",$_) :
803	$_>127 \|\| $_==ord("%") ? # Representable in %XX form
804	sprintf("%%%2X", $_) :
805	chr($_) # otherwise, Ascii char
806	} unpack("U*", $str_in); # Unpack Unicode characters
807
808
809	my $str_out = join("", @url_encoded_chars);
810
811	return $str_out;
812
813	}
814
815	sub url_encoded_to_raw_filename
816	{
817	my ($str_in) = @_;
818
819	my $str_out = $str_in;
820
821	$str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
822	$str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
823	$str_out =~ s/&#([0-9]+);/chr($1)/eig;
824
825	return $str_out;
826	}
827
828
829	sub raw_filename_to_utf8_url_encoded
830	{
831	my ($str_in) = @_;
832
833	$str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
834
835	my @url_encoded_chars
836	= map { $_ > 127 ? # Representable in %XX form
837	sprintf("%%%2X", $_) :
838	chr($_) # otherwise, Ascii char
839	} unpack("U*", $str_in); # Unpack utf8 characters
840
841
842	my $str_out = join("", @url_encoded_chars);
843
844	return $str_out;
845
846	}
847
848	sub utf8_url_encoded_to_raw_filename
849	{
850	my ($str_in) = @_;
851
852	my $utf8_str_out = $str_in;
853
854	$utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
855
856	my $unicode_str_out = decode("utf8",$utf8_str_out);
857	my $raw_str_out = utf8::downgrade($unicode_str_out);
858
859	return $raw_str_out;
860	}
861
862	sub analyze_raw_string
863	{
864	my ($str_in) = @_;
865
866	my $uses_bytecodes = 0;
867	my $exceeds_bytecodes = 0;
868
869	map { $exceeds_bytecodes = 1 if ($_ >= 256);
870	$uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
871	} unpack("U*", $str_in); # Unpack Unicode characters
872
873	return ($uses_bytecodes,$exceeds_bytecodes);
874	}
875
876
877	sub convert_utf8_string_to_unicode_string
878	{
879	my $utf8_string = shift(@_);
880
881	my $unicode_string = "";
882	foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
883	$unicode_string .= chr($unicode_value);
884	}
885	return $unicode_string;
886	}
887
888	sub convert_unicode_string_to_utf8_string
889	{
890	my $unicode_string = shift(@_);
891
892	my @unicode_array;
893	for (my $i = 0; $i < length($unicode_string); $i++) {
894	push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
895	}
896	return &unicode::unicode2utf8(\@unicode_array);
897	}
898
899
900	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: