Changeset 23387

Show
Ignore:
Timestamp:
06.12.2010 13:15:10 (9 years ago)
Author:
davidb
Message:

Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document look up. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF8 encoded). Storing of filenames seen by HTMLPlug when scanning for files to block on is now done in Unicode aware strings rather than utf8 but unware strings.

Location:
main/trunk/greenstone2/perllib
Files:
8 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/basebuildproc.pm

    r23371 r23387  
    515515        $field =~ s/^ex\.//; 
    516516 
    517         # special case for URL metadata 
    518         if ($field =~ m/^URL$/i) { 
     517        # special case for UTF8URL metadata 
     518        if ($field =~ m/^UTF8URL$/i) { 
    519519            &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle,  
    520520                        $value, { 'section' => [ $section_OID ] }); 
  • main/trunk/greenstone2/perllib/doc.pm

    r23362 r23387  
    10891089    $self->delete_assoc_file ($assoc_filename); 
    10901090     
     1091    if (!&util::fd_exists($real_filename)) { 
     1092    print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n"; 
     1093    exit -1; 
     1094    } 
     1095#    print STDERR "**** is the following a UTF8 rep of *real* filename?\n   $real_filename\n"; 
     1096#    print STDERR "****##### so, ensure it is before storing?!?!?\n"; 
     1097##    my $utf8_filename = Encode::encode("utf8",$filename); 
     1098 
    10911099    push (@{$self->{'associated_files'}},  
    10921100      [$real_filename, $assoc_filename, $mime_type, $section]); 
  • main/trunk/greenstone2/perllib/plugins/BasePlugin.pm

    r23364 r23387  
    927927 
    928928    # UTF-8 version of filename 
    929     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    930     print STDERR "**** Setting Source Metadata given: $octet_file\n"; 
    931     } 
     929#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
     930#   print STDERR "**** Setting Source Metadata given: $octet_file\n"; 
     931#    } 
    932932     
    933933    # Deal with (on Windows) raw filenames that are in their 
     
    937937    if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) { 
    938938        if (-e $raw_filename) { 
    939         require Win32; 
    940          
    941 ##      print STDERR "**** raw filename before LPN: $raw_filename\n"; 
    942939        my $unicode_filename = Win32::GetLongPathName($raw_filename); 
    943940         
    944941        my $unused_full_uf; 
    945942        ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename); 
    946  
    947 ##      print STDERR "**** raw filename after LPN: $raw_filename\n";         
    948943        } 
    949944    } 
     
    962957    } 
    963958     
    964     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    965     print STDERR "****** saving Source as:             $url_encoded_filename\n"; 
    966     } 
     959#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
     960#   print STDERR "****** saving Source as:             $url_encoded_filename\n"; 
     961#    } 
    967962     
    968963     
     
    978973                    $renamed_raw_url); 
    979974 
    980     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    981     print STDERR "****** saving SourceFile as:         $renamed_raw_url\n"; 
    982     } 
     975#    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
     976#   print STDERR "****** saving SourceFile as:         $renamed_raw_url\n"; 
     977#    } 
    983978} 
    984979    
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r23363 r23387  
    226226    my $convert_to_ext = $self->{'convert_to_ext'}; 
    227227     
     228 
     229    my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename); 
     230 
    228231    # derive tmp filename from input filename 
    229232    my ($tailname, $dirname, $suffix) 
    230     = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"); 
     233    = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$"); 
    231234 
    232235    # softlink to collection tmp dir 
     
    238241    } 
    239242     
    240     # convert to utf-8 otherwise we have problems with the doc.xml file later on 
    241 #    print STDERR "**** filename $tailname$suffix is already UTF8\n" if &unicode::check_is_utf8($tailname); 
    242     $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname); 
     243#    # convert to utf-8 otherwise we have problems with the doc.xml file later on 
     244#    my $utf8_tailname = (&unicode::check_is_utf8($tailname)) ? $tailname : $self->filepath_to_utf8($tailname); 
     245 
     246    # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file 
     247     my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname); 
     248 
    243249 
    244250    # URLEncode this since htmls with images where the html filename is utf8 don't seem 
    245251    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded 
    246252    # files on the filesystem. 
    247     $tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix"); 
    248  
    249     $suffix = lc($suffix); 
    250     my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$suffix"); 
     253    $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix"); 
     254 
     255    my $lc_suffix = lc($suffix); 
     256    my $tmp_filename = &util::filename_cat($tmp_dirname, "$utf8_tailname$lc_suffix"); 
    251257     
    252258    # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix 
     
    324330    } 
    325331    } else { 
    326     $output_filename =~ s/$suffix$/.$output_type/; 
     332    $output_filename =~ s/$lc_suffix$/.$output_type/; 
    327333    } 
    328334     
     
    446452    # need to check that not empty 
    447453    my ($doc_ext) = $file =~ /\.(\w+)$/; 
     454    $doc_ext = lc($doc_ext); 
    448455    my $file_type = "unknown"; 
    449456    $file_type = $self->{'file_type'} if defined $self->{'file_type'}; 
     
    458465    $assocfilename = $doc_obj->get_assocfile_from_sourcefile(); 
    459466    } 
     467 
    460468    $doc_obj->associate_file($filename, $assocfilename, undef, $cursection); 
    461469 
     
    476484     my $tmp_dir = $self->{'tmp_dir'}; 
    477485     if (defined $tmp_dir && -d $tmp_dir) { 
     486##   print STDERR "**** Supressing clean up of tmp dir\n"; 
    478487     &util::rm_r($tmp_dir); 
    479488     $self->{'tmp_dir'} = undef; 
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r23371 r23387  
    3737 
    3838use Encode; 
     39use Unicode::Normalize 'normalize'; 
    3940 
    4041use ReadTextFile; 
     
    206207    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs); 
    207208 
    208     if(!defined $self->{'utf8_to_original_filename'}) {  
     209    if(!defined $self->{'unicode_to_original_filename'}) {  
    209210    # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url 
    210     $self->{'utf8_to_original_filename'} = {}; 
    211     } 
    212  
    213     foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) { 
     211    $self->{'unicode_to_original_filename'} = {}; 
     212    } 
     213 
     214    foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) { 
    214215 
    215216    # remove quotes from link at start and end if necessary 
    216     if ($link =~ m/^\"/) { 
    217         $link =~ s/^\"//; 
    218         $link =~ s/\"$//; 
     217    if ($raw_link =~ m/^\"/) { 
     218        $raw_link =~ s/^\"//; 
     219        $raw_link =~ s/\"$//; 
    219220    } 
    220221 
    221222    # remove any anchor names, e.g. foo.html#name becomes foo.html  
    222223    # but watch out for any #'s that are part of entities, such as &#x3B1; 
    223     $link =~ s/([^&])\#.*$/$1/s;  
     224    $raw_link =~ s/([^&])\#.*$/$1/s;  
    224225 
    225226    # some links may just be anchor names 
    226     next unless ($link =~ /\S+/); 
    227  
    228     if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) { 
     227    next unless ($raw_link =~ /\S+/); 
     228 
     229    if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/) { 
    229230        # Turn relative file path into full path 
    230231        my $dirname = &File::Basename::dirname($filename_full_path); 
    231         $link = &util::filename_cat($dirname, $link); 
    232     } 
    233     $link = $self->eval_dir_dots($link); 
     232        $raw_link = &util::filename_cat($dirname, $raw_link); 
     233    } 
     234    $raw_link = $self->eval_dir_dots($raw_link); 
    234235 
    235236    # this is the actual filename on the filesystem (that the link refers to) 
    236     my $url_original_filename = $self->opt_url_decode($link); 
    237  
    238     # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename 
    239     my $utf8_link = ""; 
    240     $self->decode_text($link,$content_encoding,$language,\$utf8_link); 
    241  
    242     $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename; 
    243 #   print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n"; 
    244  
    245     if ($url_original_filename ne $utf8_link) { 
     237    my $url_original_filename = $self->opt_url_decode($raw_link); 
     238 
     239    my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename); 
     240 
     241    if ($exceeds_bytecodes) { 
     242        # We have a link to a file name that is more complicated than a raw byte filename 
     243        # What we do next depends on the operating system we are on 
     244 
     245        if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) { 
     246        # Assume we're dealing with a UTF-8 encoded filename 
     247        $url_original_filename = encode("utf8", $url_original_filename); 
     248        } 
     249        elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) { 
     250        # HFS+ is UTF8 with decompostion 
     251        $url_original_filename = encode("utf8", $url_original_filename); 
     252        $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)      
     253        } 
     254        elsif ($ENV{'GSDLOS'} =~ /^windows$/i) { 
     255        # Don't need to do anything as later code maps Windows 
     256        # unicode filenames to DOS short filenames when needed       
     257        } 
     258        else { 
     259        my $outhandle = $self->{'outhandle'}; 
     260        print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n"; 
     261        print $outhandle "         in raw file system encoding of: $raw_link\n"; 
     262        print $outhandle "         Assuming filesystem is UTF-8 based.\n"; 
     263        $url_original_filename = encode("utf8", $url_original_filename); 
     264        } 
     265    } 
     266 
     267    # Convert the (currently raw) link into its Unicode version.  
     268    # Store the Unicode link along with the url_original_filename 
     269    my $unicode_url_original_filename = ""; 
     270    $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename); 
     271 
     272 
     273    $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename; 
     274 
     275 
     276    if ($url_original_filename ne $unicode_url_original_filename) { 
    246277        my $outhandle = $self->{'outhandle'}; 
    247  
     278         
    248279        print $outhandle "URL Encoding $url_original_filename\n"; 
    249         print $outhandle " ->$utf8_link\n"; 
    250     } 
    251  
    252      
    253     my $unicode_url_original_filename = decode("utf8",$url_original_filename); 
    254  
    255 ##  print STDERR "*****!!! Blocking url original filename = $unicode_url_original_filename\n"; 
    256  
    257     # Allow for possibility of raw byte version (UTF8) and Unicode versions of file 
     280        print $outhandle " ->$unicode_url_original_filename\n"; 
     281 
     282        # Allow for possibility of raw byte version and Unicode versions of file 
     283        $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1; 
     284 
     285    } 
     286 
    258287    $block_hash->{'file_blocks'}->{$url_original_filename} = 1; 
    259     $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1; 
    260288    } 
    261289} 
     
    266294# string already exists in the local folder. 
    267295# 
    268 # Is the following still true?? 
    269 # Return the original filename corresponding to the parameter URL-encoded filename, and 
    270 # a decoded flag that is set to true iff URL-decoding had to be applied. 
    271296sub opt_url_decode { 
    272297    my $self = shift (@_); 
    273     my ($link) = @_; 
     298    my ($raw_link) = @_; 
     299 
    274300 
    275301    # Replace %XX's in URL with decoded value if required. 
    276302    # Note that the filename may include the %XX in some situations 
    277     if ($link =~ m/\%[A-F0-9]{2}/i) { 
    278     if (!-e $link) { 
    279         $link = &unicode::url_decode($link); 
     303 
     304##    if ($raw_link =~ m/\%[A-F0-9]{2}/i) { 
     305 
     306    if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) { 
     307    if (!-e $raw_link) { 
     308        $raw_link = &unicode::url_decode($raw_link,1); 
    280309    } 
    281310    }  
    282  
    283     return $link; 
     311     
     312    return $raw_link; 
    284313} 
    285314 
     
    367396     
    368397    # Need to make sure there is a '/' on the end of upgraded_base_dir 
    369     if ($upgraded_base_dir !~ m/\/$/) { 
     398    if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) { 
    370399        $upgraded_base_dir .= "/"; 
    371400    } 
     
    418447#    my $utf8_file = $self->filename_to_utf8_metadata($file); 
    419448#    $utf8_file =~ s/&\#095;/_/g; 
    420     my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname); 
    421  
     449#    variable below used to be utf8_file     
     450    my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname); 
     451    my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname); 
    422452 
    423453    my $web_url = "http://"; 
     454    my $utf8_web_url = "http://"; 
    424455    if(defined $dirname) { # local directory 
    425456        # Check for "ftp" in the domain name of the directory 
     
    432463    { 
    433464      $web_url = "ftp://"; 
     465      $utf8_web_url = "ftp://"; 
    434466    } 
    435467    $dirname = $self->eval_dir_dots($dirname); 
    436468    $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/" 
    437     $web_url = $web_url.$dirname.$utf8_file;  
     469 
     470    $web_url = $web_url.$dirname.$url_encoded_file;  
     471    $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;  
    438472    } else { 
    439     $web_url = $web_url.$utf8_file; 
     473    $web_url = $web_url.$url_encoded_file; 
     474    $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file; 
    440475    } 
    441476    $web_url =~ s/\\/\//g; 
     477    $utf8_web_url =~ s/\\/\//g; 
    442478 
    443479    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    444     print STDERR "******* upgraded_file:       $upgraded_file\n"; 
    445     print STDERR "******* adding URL metadata: $utf8_file\n"; 
     480    print STDERR "*******DEBUG: upgraded_file:       $upgraded_file\n"; 
     481    print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n"; 
    446482    } 
    447483 
    448484 
    449485    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url); 
     486    $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url); 
    450487 
    451488    if ($self->{'file_is_url'}) { 
     
    590627    } 
    591628 
     629 
    592630    # single section document 
    593631    $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection); 
     
    664702##  $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ 
    665703 
     704    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))'; 
     705    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)'; 
     706 
    666707    $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/ 
    667708        $self->replace_usemap_links($1, $2, $3)/isge; 
    668709 
    669710##  $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ 
    670  
    671     $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/ 
    672         $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge; 
     711     
     712    $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/ 
     713        $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge; 
    673714    } 
    674715 
     
    753794sub replace_href_links { 
    754795    my $self = shift (@_); 
    755     my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_; 
     796    my ($opt_open_comm, $front, $link, $back, $opt_close_comm, $base_dir, $file, $doc_obj, $section) = @_; 
     797 
     798    if ((defined $opt_open_comm) && (defined $opt_close_comm)) { 
     799    # href link was embedded in <!-- comments --> 
     800### print STDERR "****** Link in comment, skipping $link\n"; 
     801    return $opt_open_comm . $front . $link . $back . $opt_close_comm; 
     802    } 
    756803 
    757804    # remove quotes from link at start and end if necessary 
     
    819866    } 
    820867 
    821     $href = &unicode::raw_filename_to_url_encoded($href); 
     868    $href = &unicode::raw_filename_to_utf8_url_encoded($href); 
    822869    $href = &unicode::filename_to_url($href); 
    823870 
     
    825872 
    826873    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    827         print STDERR "****** href=$href\n";     
     874        print STDERR "******DEBUG: href=$href\n";     
    828875    } 
    829876 
     
    844891    my $filename = $href; 
    845892    if ($base_dir eq "") { 
    846     # remove http:/ thereby leaving one slash at the start 
    847     $filename =~ s/^[^:]*:\///; 
     893    if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 
     894        # remove http:// 
     895        $filename =~ s/^[^:]*:\/\///; 
     896    } 
     897    else { 
     898        # remove http:/ thereby leaving one slash at the start as 
     899        # part of full pathname 
     900        $filename =~ s/^[^:]*:\///; 
     901    } 
    848902    } 
    849903    else { 
     
    863917    # file's name was in URL encoding, the following method will not decode 
    864918    # it. 
    865     my $utf8_filename = $filename; 
    866     my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename); 
     919    my $unicode_filename = $filename; 
     920    my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename); 
     921 
     922    # wvWare can generate <img src="StrangeNoGraphicData"> tags, but with no 
     923    # (it seems) accompanying file 
     924    if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; } 
    867925 
    868926    my $content_encoding= $self->{'content_encoding'} || "utf8"; 
    869927 
    870     # The filenames that come through the HTML file have been decoded 
    871     # into Unicode aware Perl strings.  Need to convert them back 
    872     # to their initial raw-byte encoding to match the file that 
    873     # exists on the file system 
    874     $filename = encode($content_encoding, $opt_decode_utf8_filename); 
     928    if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) { 
     929    # The filenames that come through the HTML file have been decoded 
     930    # into Unicode aware Perl strings.  Need to convert them back 
     931    # to their initial raw-byte encoding to match the file that 
     932    # exists on the file system 
     933    $filename = encode($content_encoding, $opt_decode_unicode_filename); 
     934    } 
     935    elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) { 
     936    # HFS+ is UTF8 with decompostion 
     937    $filename = encode($content_encoding, $opt_decode_unicode_filename); 
     938    $filename = normalize('D', $filename); # Normalization Form D (decomposition) 
     939 
     940    } 
     941    elsif ($ENV{'GSDLOS'} =~ /^windows$/i) { 
     942    my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename); 
     943 
     944    if (defined $long_filename) { 
     945        my $short_filename = Win32::GetLongPathName($long_filename); 
     946        $filename = $short_filename; 
     947    } 
     948#   else { 
     949#       print STDERR "***** failed to map href to real file:\n"; 
     950#       print STDERR "****** $href -> $opt_decode_unicode_filename\n"; 
     951#   } 
     952    } 
     953    else { 
     954    my $outhandle = $self->{'outhandle'}; 
     955    print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n"; 
     956    print $outhandle "         in file system encoding of href: $href\n"; 
     957    print $outhandle "         No character encoding done.\n"; 
     958    } 
     959 
    875960 
    876961    # some special processing if the intended filename was converted to utf8, but 
     
    882967    } 
    883968 
    884 ##  print STDERR "**** trying to look up utf8_filename: $utf8_filename\n"; 
    885  
    886     my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename}; 
     969##  print STDERR "**** trying to look up unicode_filename: $unicode_filename\n"; 
     970 
     971    my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename}; 
    887972 
    888973    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    889         print STDERR "******   From lookup utf8_filename, now trying for: $original_filename\n"; 
     974        print STDERR "******   From lookup unicode_filename, now trying for: $original_filename\n"; 
    890975    } 
    891976 
     
    9281013    return "_httpdocimg_/$newname"; 
    9291014    } else { 
    930     if(&unicode::is_url_encoded($utf8_filename)) { 
     1015    if(&unicode::is_url_encoded($unicode_filename)) { 
    9311016        # use the possibly-decoded filename instead to avoid double URL encoding 
    9321017        ($newname) = $filename =~ m/([^\/\\]*)$/; 
    9331018    } else { 
    934         ($newname) = $utf8_filename =~ m/([^\/\\]*)$/; 
     1019        ($newname) = $unicode_filename =~ m/([^\/\\]*)$/; 
    9351020    } 
    9361021 
     
    10341119 
    10351120    my $linkfilename = &util::filename_cat ($base_dir, $before_hash);  
     1121 
     1122 
     1123#   print STDERR "**** linkfilename = $linkfilename\n"; 
     1124#   if (!&util::fd_exists($linkfilename)) { 
     1125#       print STDERR "***** Warning: Could not find $linkfilename\n"; 
     1126#   } 
     1127 
     1128 
    10361129    # make sure there's a slash on the end if it's a directory 
    10371130    if ($before_hash !~ m/\/$/) { 
    10381131        $before_hash .= "/" if (-d $linkfilename); 
    10391132    } 
     1133 
     1134#   print STDERR "*** returning: $before_hash\n"; 
     1135 
    10401136    return ("http://" . $before_hash, $hash_part, 1); 
    10411137    } else { 
  • main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm

    r23363 r23387  
    307307    $reader->set_encoding($encoding); 
    308308    $reader->decode_text($raw_text,$textref); 
     309 
     310    # At this point $$textref is a binary byte string 
     311    # => turn it into a Unicode aware string, so full 
     312    # Unicode aware pattern matching can be used. 
     313    # For instance: 's/\x{0101}//g' or '[[:upper:]]'     
     314     
     315    $$textref = decode("utf8",$$textref); 
    309316    } 
    310317} 
  • main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm

    r23363 r23387  
    645645    $utf8_real_filename =~ s/^\\(.*)/$1/i; 
    646646 
    647     my $real_filename = &util::utf8_to_real_filename($utf8_real_filename); 
     647##  my $real_filename = &util::utf8_to_real_filename($utf8_real_filename); 
     648    my $real_filename = $utf8_real_filename; 
     649    $real_filename = &util::downgrade_if_dos_filename($real_filename); 
    648650 
    649651    if (-e $real_filename) { 
     
    692694    $real_filename =~ s/^\\(.*)/$1/i; 
    693695 
    694     my $raw_filename; 
    695     if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 
    696         # Need to generate DOS version of filename to test in '-e $raw_filename' below 
    697         require Win32; 
    698         my $unicode_filename = Encode::decode("utf8",$real_filename); 
    699 ###     print STDERR "***### files to field: ", &unicode::debug_unicode_string($unicode_filename),"\n"; 
    700  
    701         $raw_filename = Win32::GetShortPathName($unicode_filename); 
    702     } 
    703     else { 
    704         $raw_filename = $real_filename; 
    705     } 
     696    my $raw_filename = &util::downgrade_if_dos_filename($real_filename); 
    706697 
    707698    if (-e $raw_filename) { 
  • main/trunk/greenstone2/perllib/unicode.pm

    r23371 r23387  
    627627 
    628628sub url_decode { 
    629     my ($text) = @_; 
     629    my ($text,$and_numeric_entities) = @_; 
    630630 
    631631    $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige; 
    632     $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige; 
    633     $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige; 
     632 
     633    if ((defined $and_numeric_entities) && ($and_numeric_entities)) { 
     634    $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige; 
     635    $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige; 
     636    } 
    634637 
    635638    return $text; 
     
    773776} 
    774777 
    775  
    776778sub url_encoded_to_raw_filename 
    777779{ 
     
    787789} 
    788790 
     791 
     792sub raw_filename_to_utf8_url_encoded 
     793{ 
     794    my ($str_in) = @_; 
     795 
     796    $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in); 
     797 
     798    my @url_encoded_chars 
     799    = map { $_ > 128 ?                  # Representable in %XX form 
     800            sprintf("%%%2X", $_) :   
     801            chr($_)                 # otherwise, Ascii char 
     802        } unpack("U*", $str_in); # Unpack utf8 characters 
     803 
     804     
     805    my $str_out = join("", @url_encoded_chars); 
     806 
     807    return $str_out; 
     808 
     809} 
     810 
     811sub utf8_url_encoded_to_raw_filename 
     812{ 
     813    my ($str_in) = @_; 
     814 
     815    my $utf8_str_out = $str_in; 
     816 
     817    $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig; 
     818 
     819    my $unicode_str_out = decode("utf8",$utf8_str_out); 
     820    my $raw_str_out = utf8::downgrade($unicode_str_out); 
     821     
     822    return $raw_str_out; 
     823} 
     824 
     825sub analyze_raw_string 
     826{ 
     827    my ($str_in) = @_; 
     828 
     829    my $uses_bytecodes = 0; 
     830    my $exceeds_bytecodes = 0; 
     831 
     832    map { $exceeds_bytecodes = 1 if ($_ >= 256); 
     833      $uses_bytecodes    = 1 if (($_ >= 128) && ($_ < 256)); 
     834    } unpack("U*", $str_in); # Unpack Unicode characters 
     835 
     836    return ($uses_bytecodes,$exceeds_bytecodes); 
     837} 
     838 
     839 
    7898401;