Changeset 23387


Ignore:
Timestamp:
12/06/10 13:15:10 (11 years ago)
Author:
davidb
Message:

Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document look up. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF8 encoded). Storing of filenames seen by HTMLPlug when scanning for files to block on is now done in Unicode aware strings rather than utf8 but unware strings.

Location:
main/trunk/greenstone2/perllib
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/basebuildproc.pm

    r23371 r23387  
    515515        $field =~ s/^ex\.//;
    516516
    517         # special case for URL metadata
    518         if ($field =~ m/^URL$/i) {
     517        # special case for UTF8URL metadata
     518        if ($field =~ m/^UTF8URL$/i) {
    519519            &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle,
    520520                        $value, { 'section' => [ $section_OID ] });
  • main/trunk/greenstone2/perllib/doc.pm

    r23362 r23387  
    10891089    $self->delete_assoc_file ($assoc_filename);
    10901090   
     1091    if (!&util::fd_exists($real_filename)) {
     1092    print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
     1093    exit -1;
     1094    }
     1095#    print STDERR "**** is the following a UTF8 rep of *real* filename?\n   $real_filename\n";
     1096#    print STDERR "****##### so, ensure it is before storing?!?!?\n";
     1097##    my $utf8_filename = Encode::encode("utf8",$filename);
     1098
    10911099    push (@{$self->{'associated_files'}},
    10921100      [$real_filename, $assoc_filename, $mime_type, $section]);
  • main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

    r23364 r23387  
    927927
    928928    # UTF-8 version of filename
    929     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    930     print STDERR "**** Setting Source Metadata given: $octet_file\n";
    931     }
     929#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     930#   print STDERR "**** Setting Source Metadata given: $octet_file\n";
     931#    }
    932932   
    933933    # Deal with (on Windows) raw filenames that are in their
     
    937937    if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) {
    938938        if (-e $raw_filename) {
    939         require Win32;
    940        
    941 ##      print STDERR "**** raw filename before LPN: $raw_filename\n";
    942939        my $unicode_filename = Win32::GetLongPathName($raw_filename);
    943940       
    944941        my $unused_full_uf;
    945942        ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename);
    946 
    947 ##      print STDERR "**** raw filename after LPN: $raw_filename\n";       
    948943        }
    949944    }
     
    962957    }
    963958   
    964     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    965     print STDERR "****** saving Source as:             $url_encoded_filename\n";
    966     }
     959#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     960#   print STDERR "****** saving Source as:             $url_encoded_filename\n";
     961#    }
    967962   
    968963   
     
    978973                    $renamed_raw_url);
    979974
    980     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    981     print STDERR "****** saving SourceFile as:         $renamed_raw_url\n";
    982     }
     975#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     976#   print STDERR "****** saving SourceFile as:         $renamed_raw_url\n";
     977#    }
    983978}
    984979   
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r23363 r23387  
    226226    my $convert_to_ext = $self->{'convert_to_ext'};
    227227   
     228
     229    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename);
     230
    228231    # derive tmp filename from input filename
    229232    my ($tailname, $dirname, $suffix)
    230     = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
     233    = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$");
    231234
    232235    # softlink to collection tmp dir
     
    238241    }
    239242   
    240     # convert to utf-8 otherwise we have problems with the doc.xml file later on
    241 #    print STDERR "**** filename $tailname$suffix is already UTF8\n" if &unicode::check_is_utf8($tailname);
    242     $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
     243#    # convert to utf-8 otherwise we have problems with the doc.xml file later on
     244#    my $utf8_tailname = (&unicode::check_is_utf8($tailname)) ? $tailname : $self->filepath_to_utf8($tailname);
     245
     246    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file
     247     my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname);
     248
    243249
    244250    # URLEncode this since htmls with images where the html filename is utf8 don't seem
    245251    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
    246252    # files on the filesystem.
    247     $tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
    248 
    249     $suffix = lc($suffix);
    250     my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix");
     253    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix");
     254
     255    my $lc_suffix = lc($suffix);
     256    my $tmp_filename = &util::filename_cat($tmp_dirname, "$utf8_tailname$lc_suffix");
    251257   
    252258    # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix
     
    324330    }
    325331    } else {
    326     $output_filename =~ s/$suffix$/.$output_type/;
     332    $output_filename =~ s/$lc_suffix$/.$output_type/;
    327333    }
    328334   
     
    446452    # need to check that not empty
    447453    my ($doc_ext) = $file =~ /\.(\w+)$/;
     454    $doc_ext = lc($doc_ext);
    448455    my $file_type = "unknown";
    449456    $file_type = $self->{'file_type'} if defined $self->{'file_type'};
     
    458465    $assocfilename = $doc_obj->get_assocfile_from_sourcefile();
    459466    }
     467
    460468    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection);
    461469
     
    476484     my $tmp_dir = $self->{'tmp_dir'};
    477485     if (defined $tmp_dir && -d $tmp_dir) {
     486##   print STDERR "**** Supressing clean up of tmp dir\n";
    478487     &util::rm_r($tmp_dir);
    479488     $self->{'tmp_dir'} = undef;
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r23371 r23387  
    3737
    3838use Encode;
     39use Unicode::Normalize 'normalize';
    3940
    4041use ReadTextFile;
     
    206207    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    207208
    208     if(!defined $self->{'utf8_to_original_filename'}) {
     209    if(!defined $self->{'unicode_to_original_filename'}) {
    209210    # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
    210     $self->{'utf8_to_original_filename'} = {};
    211     }
    212 
    213     foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
     211    $self->{'unicode_to_original_filename'} = {};
     212    }
     213
     214    foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
    214215
    215216    # remove quotes from link at start and end if necessary
    216     if ($link =~ m/^\"/) {
    217         $link =~ s/^\"//;
    218         $link =~ s/\"$//;
     217    if ($raw_link =~ m/^\"/) {
     218        $raw_link =~ s/^\"//;
     219        $raw_link =~ s/\"$//;
    219220    }
    220221
    221222    # remove any anchor names, e.g. foo.html#name becomes foo.html
    222223    # but watch out for any #'s that are part of entities, such as &#x3B1;
    223     $link =~ s/([^&])\#.*$/$1/s;
     224    $raw_link =~ s/([^&])\#.*$/$1/s;
    224225
    225226    # some links may just be anchor names
    226     next unless ($link =~ /\S+/);
    227 
    228     if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
     227    next unless ($raw_link =~ /\S+/);
     228
     229    if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/) {
    229230        # Turn relative file path into full path
    230231        my $dirname = &File::Basename::dirname($filename_full_path);
    231         $link = &util::filename_cat($dirname, $link);
    232     }
    233     $link = $self->eval_dir_dots($link);
     232        $raw_link = &util::filename_cat($dirname, $raw_link);
     233    }
     234    $raw_link = $self->eval_dir_dots($raw_link);
    234235
    235236    # this is the actual filename on the filesystem (that the link refers to)
    236     my $url_original_filename = $self->opt_url_decode($link);
    237 
    238     # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
    239     my $utf8_link = "";
    240     $self->decode_text($link,$content_encoding,$language,\$utf8_link);
    241 
    242     $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
    243 #   print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
    244 
    245     if ($url_original_filename ne $utf8_link) {
     237    my $url_original_filename = $self->opt_url_decode($raw_link);
     238
     239    my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
     240
     241    if ($exceeds_bytecodes) {
     242        # We have a link to a file name that is more complicated than a raw byte filename
     243        # What we do next depends on the operating system we are on
     244
     245        if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
     246        # Assume we're dealing with a UTF-8 encoded filename
     247        $url_original_filename = encode("utf8", $url_original_filename);
     248        }
     249        elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
     250        # HFS+ is UTF8 with decompostion
     251        $url_original_filename = encode("utf8", $url_original_filename);
     252        $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)     
     253        }
     254        elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
     255        # Don't need to do anything as later code maps Windows
     256        # unicode filenames to DOS short filenames when needed     
     257        }
     258        else {
     259        my $outhandle = $self->{'outhandle'};
     260        print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
     261        print $outhandle "         in raw file system encoding of: $raw_link\n";
     262        print $outhandle "         Assuming filesystem is UTF-8 based.\n";
     263        $url_original_filename = encode("utf8", $url_original_filename);
     264        }
     265    }
     266
     267    # Convert the (currently raw) link into its Unicode version.
     268    # Store the Unicode link along with the url_original_filename
     269    my $unicode_url_original_filename = "";
     270    $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
     271
     272
     273    $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
     274
     275
     276    if ($url_original_filename ne $unicode_url_original_filename) {
    246277        my $outhandle = $self->{'outhandle'};
    247 
     278       
    248279        print $outhandle "URL Encoding $url_original_filename\n";
    249         print $outhandle " ->$utf8_link\n";
    250     }
    251 
    252    
    253     my $unicode_url_original_filename = decode("utf8",$url_original_filename);
    254 
    255 ##  print STDERR "*****!!! Blocking url original filename = $unicode_url_original_filename\n";
    256 
    257     # Allow for possibility of raw byte version (UTF8) and Unicode versions of file
     280        print $outhandle " ->$unicode_url_original_filename\n";
     281
     282        # Allow for possibility of raw byte version and Unicode versions of file
     283        $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1;
     284
     285    }
     286
    258287    $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
    259     $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1;
    260288    }
    261289}
     
    266294# string already exists in the local folder.
    267295#
    268 # Is the following still true??
    269 # Return the original filename corresponding to the parameter URL-encoded filename, and
    270 # a decoded flag that is set to true iff URL-decoding had to be applied.
    271296sub opt_url_decode {
    272297    my $self = shift (@_);
    273     my ($link) = @_;
     298    my ($raw_link) = @_;
     299
    274300
    275301    # Replace %XX's in URL with decoded value if required.
    276302    # Note that the filename may include the %XX in some situations
    277     if ($link =~ m/\%[A-F0-9]{2}/i) {
    278     if (!-e $link) {
    279         $link = &unicode::url_decode($link);
     303
     304##    if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
     305
     306    if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
     307    if (!-e $raw_link) {
     308        $raw_link = &unicode::url_decode($raw_link,1);
    280309    }
    281310    }
    282 
    283     return $link;
     311   
     312    return $raw_link;
    284313}
    285314
     
    367396   
    368397    # Need to make sure there is a '/' on the end of upgraded_base_dir
    369     if ($upgraded_base_dir !~ m/\/$/) {
     398    if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
    370399        $upgraded_base_dir .= "/";
    371400    }
     
    418447#    my $utf8_file = $self->filename_to_utf8_metadata($file);
    419448#    $utf8_file =~ s/&\#095;/_/g;
    420     my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
    421 
     449#    variable below used to be utf8_file   
     450    my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
     451    my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
    422452
    423453    my $web_url = "http://";
     454    my $utf8_web_url = "http://";
    424455    if(defined $dirname) { # local directory
    425456        # Check for "ftp" in the domain name of the directory
     
    432463    {
    433464      $web_url = "ftp://";
     465      $utf8_web_url = "ftp://";
    434466    }
    435467    $dirname = $self->eval_dir_dots($dirname);
    436468    $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
    437     $web_url = $web_url.$dirname.$utf8_file;
     469
     470    $web_url = $web_url.$dirname.$url_encoded_file;
     471    $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
    438472    } else {
    439     $web_url = $web_url.$utf8_file;
     473    $web_url = $web_url.$url_encoded_file;
     474    $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
    440475    }
    441476    $web_url =~ s/\\/\//g;
     477    $utf8_web_url =~ s/\\/\//g;
    442478
    443479    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    444     print STDERR "******* upgraded_file:       $upgraded_file\n";
    445     print STDERR "******* adding URL metadata: $utf8_file\n";
     480    print STDERR "*******DEBUG: upgraded_file:       $upgraded_file\n";
     481    print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
    446482    }
    447483
    448484
    449485    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
     486    $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
    450487
    451488    if ($self->{'file_is_url'}) {
     
    590627    }
    591628
     629
    592630    # single section document
    593631    $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
     
    664702##  $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
    665703
     704    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
     705    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
     706
    666707    $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
    667708        $self->replace_usemap_links($1, $2, $3)/isge;
    668709
    669710##  $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
    670 
    671     $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
    672         $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
     711   
     712    $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
     713        $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
    673714    }
    674715
     
    753794sub replace_href_links {
    754795    my $self = shift (@_);
    755     my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
     796    my ($opt_open_comm, $front, $link, $back, $opt_close_comm, $base_dir, $file, $doc_obj, $section) = @_;
     797
     798    if ((defined $opt_open_comm) && (defined $opt_close_comm)) {
     799    # href link was embedded in <!-- comments -->
     800### print STDERR "****** Link in comment, skipping $link\n";
     801    return $opt_open_comm . $front . $link . $back . $opt_close_comm;
     802    }
    756803
    757804    # remove quotes from link at start and end if necessary
     
    819866    }
    820867
    821     $href = &unicode::raw_filename_to_url_encoded($href);
     868    $href = &unicode::raw_filename_to_utf8_url_encoded($href);
    822869    $href = &unicode::filename_to_url($href);
    823870
     
    825872
    826873    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    827         print STDERR "****** href=$href\n";   
     874        print STDERR "******DEBUG: href=$href\n";   
    828875    }
    829876
     
    844891    my $filename = $href;
    845892    if ($base_dir eq "") {
    846     # remove http:/ thereby leaving one slash at the start
    847     $filename =~ s/^[^:]*:\///;
     893    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
     894        # remove http://
     895        $filename =~ s/^[^:]*:\/\///;
     896    }
     897    else {
     898        # remove http:/ thereby leaving one slash at the start as
     899        # part of full pathname
     900        $filename =~ s/^[^:]*:\///;
     901    }
    848902    }
    849903    else {
     
    863917    # file's name was in URL encoding, the following method will not decode
    864918    # it.
    865     my $utf8_filename = $filename;
    866     my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename);
     919    my $unicode_filename = $filename;
     920    my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
     921
     922    # wvWare can generate <img src="StrangeNoGraphicData"> tags, but with no
     923    # (it seems) accompanying file
     924    if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
    867925
    868926    my $content_encoding= $self->{'content_encoding'} || "utf8";
    869927
    870     # The filenames that come through the HTML file have been decoded
    871     # into Unicode aware Perl strings.  Need to convert them back
    872     # to their initial raw-byte encoding to match the file that
    873     # exists on the file system
    874     $filename = encode($content_encoding, $opt_decode_utf8_filename);
     928    if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
     929    # The filenames that come through the HTML file have been decoded
     930    # into Unicode aware Perl strings.  Need to convert them back
     931    # to their initial raw-byte encoding to match the file that
     932    # exists on the file system
     933    $filename = encode($content_encoding, $opt_decode_unicode_filename);
     934    }
     935    elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
     936    # HFS+ is UTF8 with decompostion
     937    $filename = encode($content_encoding, $opt_decode_unicode_filename);
     938    $filename = normalize('D', $filename); # Normalization Form D (decomposition)
     939
     940    }
     941    elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
     942    my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
     943
     944    if (defined $long_filename) {
     945        my $short_filename = Win32::GetLongPathName($long_filename);
     946        $filename = $short_filename;
     947    }
     948#   else {
     949#       print STDERR "***** failed to map href to real file:\n";
     950#       print STDERR "****** $href -> $opt_decode_unicode_filename\n";
     951#   }
     952    }
     953    else {
     954    my $outhandle = $self->{'outhandle'};
     955    print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
     956    print $outhandle "         in file system encoding of href: $href\n";
     957    print $outhandle "         No character encoding done.\n";
     958    }
     959
    875960
    876961    # some special processing if the intended filename was converted to utf8, but
     
    882967    }
    883968
    884 ##  print STDERR "**** trying to look up utf8_filename: $utf8_filename\n";
    885 
    886     my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};
     969##  print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
     970
     971    my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
    887972
    888973    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    889         print STDERR "******   From lookup utf8_filename, now trying for: $original_filename\n";
     974        print STDERR "******   From lookup unicode_filename, now trying for: $original_filename\n";
    890975    }
    891976
     
    9281013    return "_httpdocimg_/$newname";
    9291014    } else {
    930     if(&unicode::is_url_encoded($utf8_filename)) {
     1015    if(&unicode::is_url_encoded($unicode_filename)) {
    9311016        # use the possibly-decoded filename instead to avoid double URL encoding
    9321017        ($newname) = $filename =~ m/([^\/\\]*)$/;
    9331018    } else {
    934         ($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
     1019        ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
    9351020    }
    9361021
     
    10341119
    10351120    my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
     1121
     1122
     1123#   print STDERR "**** linkfilename = $linkfilename\n";
     1124#   if (!&util::fd_exists($linkfilename)) {
     1125#       print STDERR "***** Warning: Could not find $linkfilename\n";
     1126#   }
     1127
     1128
    10361129    # make sure there's a slash on the end if it's a directory
    10371130    if ($before_hash !~ m/\/$/) {
    10381131        $before_hash .= "/" if (-d $linkfilename);
    10391132    }
     1133
     1134#   print STDERR "*** returning: $before_hash\n";
     1135
    10401136    return ("http://" . $before_hash, $hash_part, 1);
    10411137    } else {
  • main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm

    r23363 r23387  
    307307    $reader->set_encoding($encoding);
    308308    $reader->decode_text($raw_text,$textref);
     309
     310    # At this point $$textref is a binary byte string
     311    # => turn it into a Unicode aware string, so full
     312    # Unicode aware pattern matching can be used.
     313    # For instance: 's/\x{0101}//g' or '[[:upper:]]'   
     314   
     315    $$textref = decode("utf8",$$textref);
    309316    }
    310317}
  • main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm

    r23363 r23387  
    645645    $utf8_real_filename =~ s/^\\(.*)/$1/i;
    646646
    647     my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
     647##  my $real_filename = &util::utf8_to_real_filename($utf8_real_filename);
     648    my $real_filename = $utf8_real_filename;
     649    $real_filename = &util::downgrade_if_dos_filename($real_filename);
    648650
    649651    if (-e $real_filename) {
     
    692694    $real_filename =~ s/^\\(.*)/$1/i;
    693695
    694     my $raw_filename;
    695     if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
    696         # Need to generate DOS version of filename to test in '-e $raw_filename' below
    697         require Win32;
    698         my $unicode_filename = Encode::decode("utf8",$real_filename);
    699 ###     print STDERR "***### files to field: ", &unicode::debug_unicode_string($unicode_filename),"\n";
    700 
    701         $raw_filename = Win32::GetShortPathName($unicode_filename);
    702     }
    703     else {
    704         $raw_filename = $real_filename;
    705     }
     696    my $raw_filename = &util::downgrade_if_dos_filename($real_filename);
    706697
    707698    if (-e $raw_filename) {
  • main/trunk/greenstone2/perllib/unicode.pm

    r23371 r23387  
    627627
    628628sub url_decode {
    629     my ($text) = @_;
     629    my ($text,$and_numeric_entities) = @_;
    630630
    631631    $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
    632     $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
    633     $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige;
     632
     633    if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
     634    $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
     635    $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige;
     636    }
    634637
    635638    return $text;
     
    773776}
    774777
    775 
    776778sub url_encoded_to_raw_filename
    777779{
     
    787789}
    788790
     791
     792sub raw_filename_to_utf8_url_encoded
     793{
     794    my ($str_in) = @_;
     795
     796    $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
     797
     798    my @url_encoded_chars
     799    = map { $_ > 128 ?                  # Representable in %XX form
     800            sprintf("%%%2X", $_) : 
     801            chr($_)                 # otherwise, Ascii char
     802        } unpack("U*", $str_in); # Unpack utf8 characters
     803
     804   
     805    my $str_out = join("", @url_encoded_chars);
     806
     807    return $str_out;
     808
     809}
     810
     811sub utf8_url_encoded_to_raw_filename
     812{
     813    my ($str_in) = @_;
     814
     815    my $utf8_str_out = $str_in;
     816
     817    $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
     818
     819    my $unicode_str_out = decode("utf8",$utf8_str_out);
     820    my $raw_str_out = utf8::downgrade($unicode_str_out);
     821   
     822    return $raw_str_out;
     823}
     824
     825sub analyze_raw_string
     826{
     827    my ($str_in) = @_;
     828
     829    my $uses_bytecodes = 0;
     830    my $exceeds_bytecodes = 0;
     831
     832    map { $exceeds_bytecodes = 1 if ($_ >= 256);
     833      $uses_bytecodes    = 1 if (($_ >= 128) && ($_ < 256));
     834    } unpack("U*", $str_in); # Unpack Unicode characters
     835
     836    return ($uses_bytecodes,$exceeds_bytecodes);
     837}
     838
     839
    7898401;
Note: See TracChangeset for help on using the changeset viewer.