Context Navigation

← Previous Change
Next Change →

Changeset 23387 for main

Timestamp:

2010-12-06T13:15:10+13:00 (13 years ago)

Author:

davidb

Message:

Further changes to deal with documents that use different filename encodings on the file system. Now sets UTF8URL metadata to perform the cross-document lookup. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF-8 encoded). Filenames seen by HTMLPlugin when scanning for files to block on are now stored as Unicode-aware strings rather than as UTF-8 encoded but Unicode-unaware strings.

Location:

main/trunk/greenstone2/perllib

Files:

: 8 edited

basebuildproc.pm (modified) (1 diff)
doc.pm (modified) (1 diff)
plugins/BasePlugin.pm (modified) (4 diffs)
plugins/ConvertBinaryFile.pm (modified) (6 diffs)
plugins/HTMLPlugin.pm (modified) (16 diffs)
plugins/ReadTextFile.pm (modified) (1 diff)
plugouts/BasePlugout.pm (modified) (2 diffs)
unicode.pm (modified) (3 diffs)

Legend:

: Unmodified
: Added
: Removed

main/trunk/greenstone2/perllib/basebuildproc.pm

-              r23371
+              r23387
         $field =~ s/^ex\.//;
         # special case for URL metadata
         if ($field =~ m/^URL$/i) {
+        # special case for UTF8URL metadata
+        if ($field =~ m/^UTF8URL$/i) {
             &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle,
                         $value, { 'section' => [ $section_OID ] });

main/trunk/greenstone2/perllib/doc.pm

-              r23362
+              r23387
     $self->delete_assoc_file ($assoc_filename);
+    if (!&util::fd_exists($real_filename)) {
+    print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
+    exit -1;
+    }
+#    print STDERR "**** is the following a UTF8 rep of *real* filename?\n   $real_filename\n";
+#    print STDERR "****##### so, ensure it is before storing?!?!?\n";
+##    my $utf8_filename = Encode::encode("utf8",$filename);
     push (@{$self->{'associated_files'}},
       [$real_filename, $assoc_filename, $mime_type, $section]);

main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

-              r23364
+              r23387
     # UTF-8 version of filename
     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     print STDERR "**** Setting Source Metadata given: $octet_file\n";
+    }
+#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
+#   print STDERR "**** Setting Source Metadata given: $octet_file\n";
+#    }
     # Deal with (on Windows) raw filenames that are in their
 …
     if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
         if (-e $raw_filename) {
-        require Win32;
-##      print STDERR "**** raw filename before LPN: $raw_filename\n";
         my $unicode_filename = Win32::GetLongPathName($raw_filename);
         my $unused_full_uf;
         ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
-##      print STDERR "**** raw filename after LPN: $raw_filename\n";
+        }
+    }
 …
+    }
     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     print STDERR "****** saving Source as:             $url_encoded_filename\n";
+    }
+#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
+#   print STDERR "****** saving Source as:             $url_encoded_filename\n";
+#    }
 …
                     $renamed_raw_url);
     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     print STDERR "****** saving SourceFile as:         $renamed_raw_url\n";
+    }
+#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
+#   print STDERR "****** saving SourceFile as:         $renamed_raw_url\n";
+#    }
+}

main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

-              r23363
+              r23387
     my $convert_to_ext = $self->{'convert_to_ext'};
+    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);
     # derive tmp filename from input filename
     my ($tailname, $dirname, $suffix)
     = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
+    = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");
     # softlink to collection tmp dir
 …
+    }
+    # convert to utf-8 otherwise we have problems with the doc.xml file later on
+#    print STDERR "**** filename $tailname$suffix is already UTF8\n" if &unicode::check_is_utf8($tailname);
+    $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
+#    # convert to utf-8 otherwise we have problems with the doc.xml file later on
+#    my $utf8_tailname = (&unicode::check_is_utf8($tailname)) ? $tailname : $self->filepath_to_utf8($tailname);
+    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
+     my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);
     # URLEncode this since htmls with images where the html filename is utf8 don't seem
     # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
     # files on the filesystem.
     $tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
     $suffix = lc($suffix);
     my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
+    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");
+    my $lc_suffix = lc($suffix);
+    my $tmp_filename = &util::filename_cat($tmp_dirname, "$utf8_tailname$lc_suffix");
     # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
 …
+    }
     } else {
     $output_filename =~ s/$suffix$/.$output_type/;
+    $output_filename =~ s/$lc_suffix$/.$output_type/;
+    }
 …
     # need to check that not empty
     my ($doc_ext) = $file =~ /\.(\w+)$/;
+    $doc_ext = lc($doc_ext);
     my $file_type = "unknown";
     $file_type = $self->{'file_type'} if defined $self->{'file_type'};
 …
     $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
+    }
     $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
 …
      my $tmp_dir = $self->{'tmp_dir'};
      if (defined $tmp_dir && -d $tmp_dir) {
+##   print STDERR "**** Supressing clean up of tmp dir\n";
      &util::rm_r($tmp_dir);
      $self->{'tmp_dir'} = undef;

main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

-              r23371
+              r23387
 use Encode;
+use Unicode::Normalize 'normalize';
 use ReadTextFile;
 …
     my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
     if(!defined $self->{'utf8_to_original_filename'}) {
+    if(!defined $self->{'unicode_to_original_filename'}) {
     # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
     $self->{'utf8_to_original_filename'} = {};
+    }
     foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
+    $self->{'unicode_to_original_filename'} = {};
+    }
+    foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
     # remove quotes from link at start and end if necessary
     if ($link =~ m/^\"/) {
         $link =~ s/^\"//;
         $link =~ s/\"$//;
+    if ($raw_link =~ m/^\"/) {
+        $raw_link =~ s/^\"//;
+        $raw_link =~ s/\"$//;
+    }
     # remove any anchor names, e.g. foo.html#name becomes foo.html
     # but watch out for any #'s that are part of entities, such as &#x3B1;
     $link =~ s/([^&])\#.*$/$1/s;
+    $raw_link =~ s/([^&])\#.*$/$1/s;
     # some links may just be anchor names
     next unless ($link =~ /\S+/);
     if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
+    next unless ($raw_link =~ /\S+/);
+    if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/) {
         # Turn relative file path into full path
         my $dirname = &File::Basename::dirname($filename_full_path);
         $link = &util::filename_cat($dirname, $link);
+    }
     $link = $self->eval_dir_dots($link);
+        $raw_link = &util::filename_cat($dirname, $raw_link);
+    }
+    $raw_link = $self->eval_dir_dots($raw_link);
     # this is the actual filename on the filesystem (that the link refers to)
+    my $url_original_filename = $self->opt_url_decode($link);
+    # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
+    my $utf8_link = "";
+    $self->decode_text($link,$content_encoding,$language,\$utf8_link);
+    $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
+#   print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
+    if ($url_original_filename ne $utf8_link) {
+    my $url_original_filename = $self->opt_url_decode($raw_link);
+    my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
+    if ($exceeds_bytecodes) {
+        # We have a link to a file name that is more complicated than a raw byte filename
+        # What we do next depends on the operating system we are on
+        if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
+        # Assume we're dealing with a UTF-8 encoded filename
+        $url_original_filename = encode("utf8", $url_original_filename);
+        }
+        elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
+        # HFS+ is UTF8 with decompostion
+        $url_original_filename = encode("utf8", $url_original_filename);
+        $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)
+        }
+        elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
+        # Don't need to do anything as later code maps Windows
+        # unicode filenames to DOS short filenames when needed
+        }
+        else {
+        my $outhandle = $self->{'outhandle'};
+        print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
+        print $outhandle "         in raw file system encoding of: $raw_link\n";
+        print $outhandle "         Assuming filesystem is UTF-8 based.\n";
+        $url_original_filename = encode("utf8", $url_original_filename);
+        }
+    }
+    # Convert the (currently raw) link into its Unicode version.
+    # Store the Unicode link along with the url_original_filename
+    my $unicode_url_original_filename = "";
+    $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
+    $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
+    if ($url_original_filename ne $unicode_url_original_filename) {
         my $outhandle = $self->{'outhandle'};
         print $outhandle "URL Encoding $url_original_filename\n";
+        print $outhandle " ->$utf8_link\n";
+    }
+    my $unicode_url_original_filename = decode("utf8",$url_original_filename);
+##  print STDERR "*****!!! Blocking url original filename = $unicode_url_original_filename\n";
+    # Allow for possibility of raw byte version (UTF8) and Unicode versions of file
+        print $outhandle " ->$unicode_url_original_filename\n";
+        # Allow for possibility of raw byte version and Unicode versions of file
+        $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1;
+    }
     $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
-    $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1;
+    }
+}
 …
 # string already exists in the local folder.
+#
-# Is the following still true??
-# Return the original filename corresponding to the parameter URL-encoded filename, and
-# a decoded flag that is set to true iff URL-decoding had to be applied.
 sub opt_url_decode {
     my $self = shift (@_);
+    my ($link) = @_;
+    my ($raw_link) = @_;
     # Replace %XX's in URL with decoded value if required.
     # Note that the filename may include the %XX in some situations
+    if ($link =~ m/\%[A-F0-9]{2}/i) {
+    if (!-e $link) {
+        $link = &unicode::url_decode($link);
+##    if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
+    if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
+    if (!-e $raw_link) {
+        $raw_link = &unicode::url_decode($raw_link,1);
+    }
+    }
     return $link;
+    return $raw_link;
+}
 …
     # Need to make sure there is a '/' on the end of upgraded_base_dir
     if ($upgraded_base_dir !~ m/\/$/) {
+    if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
         $upgraded_base_dir .= "/";
+    }
 …
 #    my $utf8_file = $self->filename_to_utf8_metadata($file);
 #    $utf8_file =~ s/&\#095;/_/g;
+    my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
+#    variable below used to be utf8_file
+    my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
+    my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
     my $web_url = "http://";
+    my $utf8_web_url = "http://";
     if(defined $dirname) { # local directory
         # Check for "ftp" in the domain name of the directory
 …
+    {
       $web_url = "ftp://";
+      $utf8_web_url = "ftp://";
+    }
     $dirname = $self->eval_dir_dots($dirname);
     $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
+    $web_url = $web_url.$dirname.$utf8_file;
+    $web_url = $web_url.$dirname.$url_encoded_file;
+    $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
     } else {
+    $web_url = $web_url.$utf8_file;
+    $web_url = $web_url.$url_encoded_file;
+    $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
+    }
     $web_url =~ s/\\/\//g;
+    $utf8_web_url =~ s/\\/\//g;
     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     print STDERR "******* upgraded_file:       $upgraded_file\n";
     print STDERR "******* adding URL metadata: $utf8_file\n";
+    print STDERR "*******DEBUG: upgraded_file:       $upgraded_file\n";
+    print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
+    }
     $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
+    $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
     if ($self->{'file_is_url'}) {
 …
+    }
     # single section document
     $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
 …
 ##  $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
+    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
+    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
     $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
         $self->replace_usemap_links($1, $2, $3)/isge;
 ##  $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
     $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
         $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
+    $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
+        $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
+    }
 …
 sub replace_href_links {
     my $self = shift (@_);
+    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
+    my ($opt_open_comm, $front, $link, $back, $opt_close_comm, $base_dir, $file, $doc_obj, $section) = @_;
+    if ((defined $opt_open_comm) && (defined $opt_close_comm)) {
+    # href link was embedded in <!-- comments -->
+### print STDERR "****** Link in comment, skipping $link\n";
+    return $opt_open_comm . $front . $link . $back . $opt_close_comm;
+    }
     # remove quotes from link at start and end if necessary
 …
+    }
     $href = &unicode::raw_filename_to_url_encoded($href);
+    $href = &unicode::raw_filename_to_utf8_url_encoded($href);
     $href = &unicode::filename_to_url($href);
 …
     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
         print STDERR "****** href=$href\n";
+        print STDERR "******DEBUG: href=$href\n";
+    }
 …
     my $filename = $href;
     if ($base_dir eq "") {
+    # remove http:/ thereby leaving one slash at the start
+    $filename =~ s/^[^:]*:\///;
+    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
+        # remove http://
+        $filename =~ s/^[^:]*:\/\///;
+    }
+    else {
+        # remove http:/ thereby leaving one slash at the start as
+        # part of full pathname
+        $filename =~ s/^[^:]*:\///;
+    }
+    }
     else {
 …
     # file's name was in URL encoding, the following method will not decode
     # it.
+    my $utf8_filename = $filename;
+    my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename);
+    my $unicode_filename = $filename;
+    my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
+    # wvWare can generate <img src="StrangeNoGraphicData"> tags, but with no
+    # (it seems) accompanying file
+    if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
     my $content_encoding= $self->{'content_encoding'} || "utf8";
+    # The filenames that come through the HTML file have been decoded
+    # into Unicode aware Perl strings.  Need to convert them back
+    # to their initial raw-byte encoding to match the file that
+    # exists on the file system
+    $filename = encode($content_encoding, $opt_decode_utf8_filename);
+    if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
+    # The filenames that come through the HTML file have been decoded
+    # into Unicode aware Perl strings.  Need to convert them back
+    # to their initial raw-byte encoding to match the file that
+    # exists on the file system
+    $filename = encode($content_encoding, $opt_decode_unicode_filename);
+    }
+    elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
+    # HFS+ is UTF8 with decompostion
+    $filename = encode($content_encoding, $opt_decode_unicode_filename);
+    $filename = normalize('D', $filename); # Normalization Form D (decomposition)
+    }
+    elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
+    my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
+    if (defined $long_filename) {
+        my $short_filename = Win32::GetLongPathName($long_filename);
+        $filename = $short_filename;
+    }
+#   else {
+#       print STDERR "***** failed to map href to real file:\n";
+#       print STDERR "****** $href -> $opt_decode_unicode_filename\n";
+#   }
+    }
+    else {
+    my $outhandle = $self->{'outhandle'};
+    print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
+    print $outhandle "         in file system encoding of href: $href\n";
+    print $outhandle "         No character encoding done.\n";
+    }
     # some special processing if the intended filename was converted to utf8, but
 …
+    }
 ##  print STDERR "**** trying to look up utf8_filename: $utf8_filename\n";
     my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};
+##  print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
+    my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
         print STDERR "******   From lookup utf8_filename, now trying for: $original_filename\n";
+        print STDERR "******   From lookup unicode_filename, now trying for: $original_filename\n";
+    }
 …
     return "_httpdocimg_/$newname";
     } else {
     if(&unicode::is_url_encoded($utf8_filename)) {
+    if(&unicode::is_url_encoded($unicode_filename)) {
         # use the possibly-decoded filename instead to avoid double URL encoding
         ($newname) = $filename =~ m/([^\/\\]*)$/;
     } else {
         ($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
+        ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
+    }
 …
     my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
+#   print STDERR "**** linkfilename = $linkfilename\n";
+#   if (!&util::fd_exists($linkfilename)) {
+#       print STDERR "***** Warning: Could not find $linkfilename\n";
+#   }
     # make sure there's a slash on the end if it's a directory
     if ($before_hash !~ m/\/$/) {
         $before_hash .= "/" if (-d $linkfilename);
+    }
+#   print STDERR "*** returning: $before_hash\n";
     return ("http://" . $before_hash, $hash_part, 1);
     } else {

main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm

-              r23363
+              r23387
     $reader->set_encoding($encoding);
     $reader->decode_text($raw_text,$textref);
+    # At this point $$textref is a binary byte string
+    # => turn it into a Unicode aware string, so full
+    # Unicode aware pattern matching can be used.
+    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
+    $$textref = decode("utf8",$$textref);
+    }
+}

main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm

-              r23363
+              r23387
     $utf8_real_filename =~ s/^\\(.*)/$1/i;
+    my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
+##  my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
+    my $real_filename = $utf8_real_filename;
+    $real_filename = &util::downgrade_if_dos_filename($real_filename);
     if (-e $real_filename) {
 …
     $real_filename =~ s/^\\(.*)/$1/i;
+    my $raw_filename;
+    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
+        # Need to generate DOS version of filename to test in '-e $raw_filename' below
+        require Win32;
+        my $unicode_filename = Encode::decode("utf8",$real_filename);
+###     print STDERR "***### files to field: ", &unicode::debug_unicode_string($unicode_filename),"\n";
+        $raw_filename = Win32::GetShortPathName($unicode_filename);
+    }
+    else {
+        $raw_filename = $real_filename;
+    }
+    my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
     if (-e $raw_filename) {

main/trunk/greenstone2/perllib/unicode.pm

-              r23371
+              r23387
 sub url_decode {
     my ($text) = @_;
+    my ($text,$and_numeric_entities) = @_;
     $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
+    $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
+    $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige;
+    if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
+    $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
+    $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige;
+    }
     return $text;
 …
+}
 sub url_encoded_to_raw_filename
+{
 …
+}
+sub raw_filename_to_utf8_url_encoded
+{
+    my ($str_in) = @_;
+    $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
+    my @url_encoded_chars
+    = map { $_ > 128 ?                  # Representable in %XX form
+            sprintf("%%%2X", $_) :
+            chr($_)                 # otherwise, Ascii char
+        } unpack("U*", $str_in); # Unpack utf8 characters
+    my $str_out = join("", @url_encoded_chars);
+    return $str_out;
+}
+sub utf8_url_encoded_to_raw_filename
+{
+    my ($str_in) = @_;
+    my $utf8_str_out = $str_in;
+    $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
+    my $unicode_str_out = decode("utf8",$utf8_str_out);
+    my $raw_str_out = utf8::downgrade($unicode_str_out);
+    return $raw_str_out;
+}
+sub analyze_raw_string
+{
+    my ($str_in) = @_;
+    my $uses_bytecodes = 0;
+    my $exceeds_bytecodes = 0;
+    map { $exceeds_bytecodes = 1 if ($_ >= 256);
+      $uses_bytecodes    = 1 if (($_ >= 128) && ($_ < 256));
+    } unpack("U*", $str_in); # Unpack Unicode characters
+    return ($uses_bytecodes,$exceeds_bytecodes);
+}
 ;

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: