Ignore:
Timestamp:
2010-12-06T13:15:10+13:00 (13 years ago)
Author:
davidb
Message:

Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document look up. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF8 encoded). Storing of filenames seen by HTMLPlug when scanning for files to block on is now done in Unicode aware strings rather than utf8 but unware strings.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r23371 r23387  
    3737
    3838use Encode;
     39use Unicode::Normalize 'normalize';
    3940
    4041use ReadTextFile;
     
    206207    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    207208
    208     if(!defined $self->{'utf8_to_original_filename'}) {
     209    if(!defined $self->{'unicode_to_original_filename'}) {
    209210    # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
    210     $self->{'utf8_to_original_filename'} = {};
    211     }
    212 
    213     foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
     211    $self->{'unicode_to_original_filename'} = {};
     212    }
     213
     214    foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
    214215
    215216    # remove quotes from link at start and end if necessary
    216     if ($link =~ m/^\"/) {
    217         $link =~ s/^\"//;
    218         $link =~ s/\"$//;
     217    if ($raw_link =~ m/^\"/) {
     218        $raw_link =~ s/^\"//;
     219        $raw_link =~ s/\"$//;
    219220    }
    220221
    221222    # remove any anchor names, e.g. foo.html#name becomes foo.html
    222223    # but watch out for any #'s that are part of entities, such as &#x3B1;
    223     $link =~ s/([^&])\#.*$/$1/s;
     224    $raw_link =~ s/([^&])\#.*$/$1/s;
    224225
    225226    # some links may just be anchor names
    226     next unless ($link =~ /\S+/);
    227 
    228     if ($link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {
     227    next unless ($raw_link =~ /\S+/);
     228
     229    if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/) {
    229230        # Turn relative file path into full path
    230231        my $dirname = &File::Basename::dirname($filename_full_path);
    231         $link = &util::filename_cat($dirname, $link);
    232     }
    233     $link = $self->eval_dir_dots($link);
     232        $raw_link = &util::filename_cat($dirname, $raw_link);
     233    }
     234    $raw_link = $self->eval_dir_dots($raw_link);
    234235
    235236    # this is the actual filename on the filesystem (that the link refers to)
    236     my $url_original_filename = $self->opt_url_decode($link);
    237 
    238     # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
    239     my $utf8_link = "";
    240     $self->decode_text($link,$content_encoding,$language,\$utf8_link);
    241 
    242     $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
    243 #   print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
    244 
    245     if ($url_original_filename ne $utf8_link) {
     237    my $url_original_filename = $self->opt_url_decode($raw_link);
     238
     239    my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename);
     240
     241    if ($exceeds_bytecodes) {
     242        # We have a link to a file name that is more complicated than a raw byte filename
     243        # What we do next depends on the operating system we are on
     244
     245        if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
     246        # Assume we're dealing with a UTF-8 encoded filename
     247        $url_original_filename = encode("utf8", $url_original_filename);
     248        }
     249        elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
     250        # HFS+ is UTF8 with decompostion
     251        $url_original_filename = encode("utf8", $url_original_filename);
     252        $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition)     
     253        }
     254        elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
     255        # Don't need to do anything as later code maps Windows
     256        # unicode filenames to DOS short filenames when needed     
     257        }
     258        else {
     259        my $outhandle = $self->{'outhandle'};
     260        print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
     261        print $outhandle "         in raw file system encoding of: $raw_link\n";
     262        print $outhandle "         Assuming filesystem is UTF-8 based.\n";
     263        $url_original_filename = encode("utf8", $url_original_filename);
     264        }
     265    }
     266
     267    # Convert the (currently raw) link into its Unicode version.
     268    # Store the Unicode link along with the url_original_filename
     269    my $unicode_url_original_filename = "";
     270    $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename);
     271
     272
     273    $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename;
     274
     275
     276    if ($url_original_filename ne $unicode_url_original_filename) {
    246277        my $outhandle = $self->{'outhandle'};
    247 
     278       
    248279        print $outhandle "URL Encoding $url_original_filename\n";
    249         print $outhandle " ->$utf8_link\n";
    250     }
    251 
    252    
    253     my $unicode_url_original_filename = decode("utf8",$url_original_filename);
    254 
    255 ##  print STDERR "*****!!! Blocking url original filename = $unicode_url_original_filename\n";
    256 
    257     # Allow for possibility of raw byte version (UTF8) and Unicode versions of file
     280        print $outhandle " ->$unicode_url_original_filename\n";
     281
     282        # Allow for possibility of raw byte version and Unicode versions of file
     283        $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1;
     284
     285    }
     286
    258287    $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
    259     $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1;
    260288    }
    261289}
     
    266294# string already exists in the local folder.
    267295#
    268 # Is the following still true??
    269 # Return the original filename corresponding to the parameter URL-encoded filename, and
    270 # a decoded flag that is set to true iff URL-decoding had to be applied.
    271296sub opt_url_decode {
    272297    my $self = shift (@_);
    273     my ($link) = @_;
     298    my ($raw_link) = @_;
     299
    274300
    275301    # Replace %XX's in URL with decoded value if required.
    276302    # Note that the filename may include the %XX in some situations
    277     if ($link =~ m/\%[A-F0-9]{2}/i) {
    278     if (!-e $link) {
    279         $link = &unicode::url_decode($link);
     303
     304##    if ($raw_link =~ m/\%[A-F0-9]{2}/i) {
     305
     306    if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) {
     307    if (!-e $raw_link) {
     308        $raw_link = &unicode::url_decode($raw_link,1);
    280309    }
    281310    }
    282 
    283     return $link;
     311   
     312    return $raw_link;
    284313}
    285314
     
    367396   
    368397    # Need to make sure there is a '/' on the end of upgraded_base_dir
    369     if ($upgraded_base_dir !~ m/\/$/) {
     398    if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) {
    370399        $upgraded_base_dir .= "/";
    371400    }
     
    418447#    my $utf8_file = $self->filename_to_utf8_metadata($file);
    419448#    $utf8_file =~ s/&\#095;/_/g;
    420     my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
    421 
     449#    variable below used to be utf8_file   
     450    my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
     451    my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
    422452
    423453    my $web_url = "http://";
     454    my $utf8_web_url = "http://";
    424455    if(defined $dirname) { # local directory
    425456        # Check for "ftp" in the domain name of the directory
     
    432463    {
    433464      $web_url = "ftp://";
     465      $utf8_web_url = "ftp://";
    434466    }
    435467    $dirname = $self->eval_dir_dots($dirname);
    436468    $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
    437     $web_url = $web_url.$dirname.$utf8_file;
     469
     470    $web_url = $web_url.$dirname.$url_encoded_file;
     471    $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
    438472    } else {
    439     $web_url = $web_url.$utf8_file;
     473    $web_url = $web_url.$url_encoded_file;
     474    $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file;
    440475    }
    441476    $web_url =~ s/\\/\//g;
     477    $utf8_web_url =~ s/\\/\//g;
    442478
    443479    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    444     print STDERR "******* upgraded_file:       $upgraded_file\n";
    445     print STDERR "******* adding URL metadata: $utf8_file\n";
     480    print STDERR "*******DEBUG: upgraded_file:       $upgraded_file\n";
     481    print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
    446482    }
    447483
    448484
    449485    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
     486    $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url);
    450487
    451488    if ($self->{'file_is_url'}) {
     
    590627    }
    591628
     629
    592630    # single section document
    593631    $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
     
    664702##  $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
    665703
     704    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
     705    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
     706
    666707    $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
    667708        $self->replace_usemap_links($1, $2, $3)/isge;
    668709
    669710##  $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
    670 
    671     $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
    672         $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
     711   
     712    $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/
     713        $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge;
    673714    }
    674715
     
    753794sub replace_href_links {
    754795    my $self = shift (@_);
    755     my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
     796    my ($opt_open_comm, $front, $link, $back, $opt_close_comm, $base_dir, $file, $doc_obj, $section) = @_;
     797
     798    if ((defined $opt_open_comm) && (defined $opt_close_comm)) {
     799    # href link was embedded in <!-- comments -->
     800### print STDERR "****** Link in comment, skipping $link\n";
     801    return $opt_open_comm . $front . $link . $back . $opt_close_comm;
     802    }
    756803
    757804    # remove quotes from link at start and end if necessary
     
    819866    }
    820867
    821     $href = &unicode::raw_filename_to_url_encoded($href);
     868    $href = &unicode::raw_filename_to_utf8_url_encoded($href);
    822869    $href = &unicode::filename_to_url($href);
    823870
     
    825872
    826873    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    827         print STDERR "****** href=$href\n";   
     874        print STDERR "******DEBUG: href=$href\n";   
    828875    }
    829876
     
    844891    my $filename = $href;
    845892    if ($base_dir eq "") {
    846     # remove http:/ thereby leaving one slash at the start
    847     $filename =~ s/^[^:]*:\///;
     893    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
     894        # remove http://
     895        $filename =~ s/^[^:]*:\/\///;
     896    }
     897    else {
     898        # remove http:/ thereby leaving one slash at the start as
     899        # part of full pathname
     900        $filename =~ s/^[^:]*:\///;
     901    }
    848902    }
    849903    else {
     
    863917    # file's name was in URL encoding, the following method will not decode
    864918    # it.
    865     my $utf8_filename = $filename;
    866     my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename);
     919    my $unicode_filename = $filename;
     920    my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename);
     921
     922    # wvWare can generate <img src="StrangeNoGraphicData"> tags, but with no
     923    # (it seems) accompanying file
     924    if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; }
    867925
    868926    my $content_encoding= $self->{'content_encoding'} || "utf8";
    869927
    870     # The filenames that come through the HTML file have been decoded
    871     # into Unicode aware Perl strings.  Need to convert them back
    872     # to their initial raw-byte encoding to match the file that
    873     # exists on the file system
    874     $filename = encode($content_encoding, $opt_decode_utf8_filename);
     928    if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) {
     929    # The filenames that come through the HTML file have been decoded
     930    # into Unicode aware Perl strings.  Need to convert them back
     931    # to their initial raw-byte encoding to match the file that
     932    # exists on the file system
     933    $filename = encode($content_encoding, $opt_decode_unicode_filename);
     934    }
     935    elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
     936    # HFS+ is UTF8 with decompostion
     937    $filename = encode($content_encoding, $opt_decode_unicode_filename);
     938    $filename = normalize('D', $filename); # Normalization Form D (decomposition)
     939
     940    }
     941    elsif ($ENV{'GSDLOS'} =~ /^windows$/i) {
     942    my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename);
     943
     944    if (defined $long_filename) {
     945        my $short_filename = Win32::GetLongPathName($long_filename);
     946        $filename = $short_filename;
     947    }
     948#   else {
     949#       print STDERR "***** failed to map href to real file:\n";
     950#       print STDERR "****** $href -> $opt_decode_unicode_filename\n";
     951#   }
     952    }
     953    else {
     954    my $outhandle = $self->{'outhandle'};
     955    print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n";
     956    print $outhandle "         in file system encoding of href: $href\n";
     957    print $outhandle "         No character encoding done.\n";
     958    }
     959
    875960
    876961    # some special processing if the intended filename was converted to utf8, but
     
    882967    }
    883968
    884 ##  print STDERR "**** trying to look up utf8_filename: $utf8_filename\n";
    885 
    886     my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};
     969##  print STDERR "**** trying to look up unicode_filename: $unicode_filename\n";
     970
     971    my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename};
    887972
    888973    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    889         print STDERR "******   From lookup utf8_filename, now trying for: $original_filename\n";
     974        print STDERR "******   From lookup unicode_filename, now trying for: $original_filename\n";
    890975    }
    891976
     
    9281013    return "_httpdocimg_/$newname";
    9291014    } else {
    930     if(&unicode::is_url_encoded($utf8_filename)) {
     1015    if(&unicode::is_url_encoded($unicode_filename)) {
    9311016        # use the possibly-decoded filename instead to avoid double URL encoding
    9321017        ($newname) = $filename =~ m/([^\/\\]*)$/;
    9331018    } else {
    934         ($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
     1019        ($newname) = $unicode_filename =~ m/([^\/\\]*)$/;
    9351020    }
    9361021
     
    10341119
    10351120    my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
     1121
     1122
     1123#   print STDERR "**** linkfilename = $linkfilename\n";
     1124#   if (!&util::fd_exists($linkfilename)) {
     1125#       print STDERR "***** Warning: Could not find $linkfilename\n";
     1126#   }
     1127
     1128
    10361129    # make sure there's a slash on the end if it's a directory
    10371130    if ($before_hash !~ m/\/$/) {
    10381131        $before_hash .= "/" if (-d $linkfilename);
    10391132    }
     1133
     1134#   print STDERR "*** returning: $before_hash\n";
     1135
    10401136    return ("http://" . $before_hash, $hash_part, 1);
    10411137    } else {
Note: See TracChangeset for help on using the changeset viewer.