Ignore:
Timestamp:
2010-12-01T11:42:27+13:00 (13 years ago)
Author:
davidb
Message:

Plugin code upgrade to support Greenstone working with filenames under Windows when they go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r23352 r23363  
    186186    # read in file ($text will be in utf8)
    187187    my $raw_text = "";
    188     $self->read_file_no_decoding ($filename_full_path, \$raw_text);
     188    $self->read_file_no_decoding($filename_full_path, \$raw_text);
    189189
    190190    my $textref = \$raw_text;
     
    192192    my $closecom = '(?:-->|(?:—|—|--)>)';
    193193    $$textref =~ s/$opencom(.*?)$closecom//gs;
     194
     195    # Convert entities to their UTF8 equivalents
     196    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
     197    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occasion, want it left as utf8
     198    $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go;
    194199
    195200    my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+";
     
    209214
    210215    # remove quotes from link at start and end if necessary
    211     if ($link=~/^\"/) {
    212         $link=~s/^\"//;
    213         $link=~s/\"$//;
    214     }
    215 
    216     $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
     216    if ($link =~ m/^\"/) {
     217        $link =~ s/^\"//;
     218        $link =~ s/\"$//;
     219    }
     220
     221    $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html 
    217222    # some links may just be anchor names
    218223    next unless ($link =~ /\S+/);
     
    242247    }
    243248
     249   
     250    my $unicode_url_original_filename = decode("utf8",$url_original_filename);
     251
     252##  print STDERR "*****!!! Blocking url original filename = $unicode_url_original_filename\n";
     253
     254    # Allow for possibility of raw byte version (UTF8) and Unicode versions of file
    244255    $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
     256    $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1;
    245257    }
    246258}
     
    250262# filename*, it does not URL decode any filename if a file by the name of the *URL-encoded*
    251263# string already exists in the local folder.
     264#
     265# Is the following still true??
    252266# Return the original filename corresponding to the parameter URL-encoded filename, and
    253267# a decoded flag that is set to true iff URL-decoding had to be applied.
     
    312326    $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'});
    313327    ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental
    314     # build. so set it manually.
    315     $doc_obj->{'source_path'} = $filename_full_path;
     328    # build. So set it manually.
     329    $doc_obj->set_source_path($filename_full_path);
    316330    my $collect_conv_file = &util::filename_within_collection($tidy_filename);
    317331    $doc_obj->set_converted_filename($collect_conv_file);
     
    387401    my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
    388402
    389     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    390         print STDERR "***!! file = $file\n";
    391         print STDERR "***!! utf8_file = $utf8_file\n";
    392     }
     403#   if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     404#       print STDERR "***!! file = $file\n";
     405#       print STDERR "***!! utf8_file = $utf8_file\n";
     406#   }
    393407
    394408
     
    764778    ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
    765779
    766 
    767780    # If web page didn't give encoding, then default to utf8
     781    my $content_encoding= $self->{'content_encoding'} || "utf8";
     782
    768783    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    769         print STDERR "*** Web page didn't give encoding, defaulting to UTF8!\n";
    770         print STDERR "*****  looking up $file\n";
    771     }
    772 
    773     my $content_encoding= $self->{'content_encoding'} || "utf8";
     784        print STDERR "*** Encoding with $content_encoding href: $href\n";
     785    }
     786
    774787    $href = encode($content_encoding,$href);
    775788
     
    807820
    808821    $filename = &util::filename_cat($base_dir, $filename);
     822
    809823    if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) {
    810824    # we are processing a tidytmp file - want paths to be in import
     
    827841    $filename = encode($content_encoding, $opt_decode_utf8_filename);
    828842
    829 
    830843    # some special processing if the intended filename was converted to utf8, but
    831844    # the actual file still needs to be renamed
    832     if (!-e $filename) {
     845    if (!&util::fd_exists($filename)) {
    833846    # try the original filename stored in map
    834847    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    835         print STDERR "***###!! orig filename did not exist: $filename\n";
    836     }
     848        print STDERR "******!! orig filename did not exist: $filename\n";
     849    }
     850
     851##  print STDERR "**** trying to look up utf8_filename: $utf8_filename\n";
    837852
    838853    my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename};
    839854
    840855    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    841         print STDERR "**** Trying for $original_filename\n";
     856        print STDERR "******   From lookup utf8_filename, now trying for: $original_filename\n";
    842857    }
    843858
    844859    if (defined $original_filename && -e $original_filename) {
    845860        if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    846             print STDERR "*** found match\n";
     861            print STDERR "******   Found match!\n";
    847862        }
    848863        $filename = $original_filename;
     
    891906    $newname = &util::rename_file($newname, $self->{'file_rename_method'});
    892907
     908### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n";
    893909    $doc_obj->associate_file($filename, $newname, undef, $section);
    894910
     
    12961312    $self->SUPER::read_file($filename, $encoding, $language, $textref);
    12971313
    1298     # Convert entities to their UTF8 equivalents
     1314    # Convert entities to their Unicode code-point equivalents
    12991315    $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go;
    13001316    $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
Note: See TracChangeset for help on using the changeset viewer.