Changeset 31440

Show
Ignore:
Timestamp:
23.02.2017 14:49:41 (3 years ago)
Author:
kjdon
Message:

Nearly there for handling Russian (etc.) subfolders in import. Still need to test on Windows, though. One part still needs to be worked out.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r31415 r31440  
    186186     
    187187    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path); 
    188     $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding; 
    189  
    190     # read in file ($text will be in utf8) 
     188    $self->{'store_content_encoding'}->{$filename_full_path} = [$content_encoding, $language]; 
     189     
     190 
     191    # read in file ($text will be in the filesystem encoding) 
    191192    my $raw_text = ""; 
    192193    $self->read_file_no_decoding($filename_full_path, \$raw_text); 
     
    296297    &util::block_filename($block_hash,$url_original_filename) if $url_original_filename ne $html_fname; 
    297298 
    298             # but only add the linked file to the blocklist if the current html file does not link to itself 
     299    # but only add the linked file to the blocklist if the current html file does not link to itself 
    299300         
    300301    } 
     
    332333    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 
    333334 
    334     # Lookup content_encoding worked out in file_block pass for this file 
    335     # Store it under the local name 'content_encoding' so its nice and  
    336     # easy to access 
    337     $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path}; 
     335    # Lookup content_encoding and language worked out in file_block pass for this file 
     336    # Store them under local names so they are nice and easy to access 
     337    $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path}[0]; 
     338    $self->{'language'} = $self->{'store_content_encoding'}->{$filename_full_path}[1]; 
    338339 
    339340    # get the input file 
     
    463464    my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname); 
    464465    my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname); 
    465  
     466     
    466467    my $web_url = "http://"; 
    467468    my $utf8_web_url = "http://"; 
     469     
    468470    if(defined $dirname) { # local directory 
     471 
    469472        # Check for "ftp" in the domain name of the directory 
    470473        #  structure to determine if this URL should be a ftp:// URL 
     
    481484    $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/" 
    482485 
    483     $web_url = $web_url.$dirname.$url_encoded_file;  
    484     $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;  
     486    # this local directory in import may need to be URL encoded like the file 
     487    my $url_encoded_dir = &unicode::raw_filename_to_url_encoded($dirname); 
     488    my $utf8_url_encoded_dir =  &unicode::raw_filename_to_utf8_url_encoded($dirname); 
     489     
     490    # changed here 
     491    $web_url = $web_url.$url_encoded_dir.$url_encoded_file;  
     492    $utf8_web_url = $utf8_web_url.$utf8_url_encoded_dir.$utf8_url_encoded_file;  
    485493    } else { 
    486494    $web_url = $web_url.$url_encoded_file; 
     
    493501    print STDERR "*******DEBUG: upgraded_file:       $upgraded_file\n"; 
    494502    print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n"; 
     503    print STDERR "*******DEBUG: web url:             $web_url\n"; 
     504    print STDERR "*******DEBUG: utf8 web url:        $utf8_web_url\n"; 
    495505    } 
    496506 
     
    847857    } 
    848858 
     859    # can't remember adding this :-( must have had a reason though... 
     860    if ($link =~ /^\_http/ || $link =~ /^\_libraryname\_/) { 
     861    # assume it is a greenstone one and leave alone 
     862    return $front . $link . $back; 
     863    } 
     864 
    849865    # attempt to sort out targets - frames are not handled  
    850866    # well in this plugin and some cases will screw things 
     
    912928        $href = encode($content_encoding,$href); 
    913929    } 
    914  
    915     $href = &unicode::raw_filename_to_utf8_url_encoded($href);   
     930     
     931    $href = &unicode::raw_filename_to_utf8_url_encoded($href);  
    916932    $href = &unicode::filename_to_url($href); 
    917  
     933     
    918934    &ghtml::urlsafe ($href); 
    919  
     935     
    920936    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    921937        print STDERR "******DEBUG: href=$href\n";     
    922938    } 
    923  
     939     
     940    #TODO here 
     941#   if ($rl ==1) { 
     942        # have a relative link, we need to do URL encoding etc so it matches what has happened for that file 
     943        #$href = &util::rename_file($href, $self->{'file_rename_method'}); 
     944#       $href = &unicode::raw_filename_to_url_encoded($href); 
     945        # then, this might be url encoded, so we replace % with %25 
     946#       $href = &unicode::filename_to_url($href); 
     947#       print STDERR "DEBUG: url encoded href = $href\n"; 
     948#   } 
    924949 
    925950    return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back; 
     
    10221047    # exists on the file system 
    10231048    $filename = encode($content_encoding, $opt_decode_unicode_filename); 
     1049     
    10241050    } 
    10251051    elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) { 
     
    10511077    # some special processing if the intended filename was converted to utf8, but 
    10521078    # the actual file still needs to be renamed 
    1053     #if (!&util::fd_exists($filename)) { 
    10541079    if (!&FileUtils::fileExists($filename)) { 
    10551080    # try the original filename stored in map 
     
    10711096        } 
    10721097        $filename = $original_filename; 
    1073     } 
     1098    }  
    10741099    } 
    10751100     
     
    11601185    my $self = shift (@_); 
    11611186    my ($link, $base_dir, $file) = @_; 
    1162  
     1187  
    11631188    # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1; 
    11641189    my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/; 
     
    11681193        my $outhandle = $self->{'outhandle'}; 
    11691194        print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n" 
    1170             if $self->{'verbosity'}; 
     1195            if $self->{'verbosity'}; 
    11711196        return ($link, "", 0); 
    11721197    } 
    11731198     
     1199#    my $dirname; 
    11741200    if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) { 
    1175         my $type = $1; 
    1176         my $before_hash_file = $before_hash; 
     1201    my $type = $1; 
     1202    my $before_hash_file = $before_hash; 
    11771203         
    11781204    if ($link =~ m/^(http|ftp):/i) { 
     
    11871213    my $before_hash_url = $before_hash_file; 
    11881214    if ($ENV{'GSDLOS'} =~ /^windows$/i) { 
    1189         $before_hash_url =~ s@\\@\/@g; 
    1190     } 
    1191      
     1215        $before_hash_url =~ s@\\@\/@g; 
     1216    } 
     1217     
     1218    ######## TODO need to check this for encoding stuff 
    11921219    my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash_file); 
    1193      
     1220    print STDERR "chekcing for existence whether relative link or not $linkfilename\n"; 
    11941221    my $rl = 0; 
    11951222    $rl = 1 if (-e $linkfilename); 
    1196  
     1223    if (-e $linkfilename) { 
     1224         
     1225        print STDERR "DOES exist $linkfilename\n"; 
     1226    } else { 
     1227        print STDERR "DOESN'T exist $linkfilename\n"; 
     1228    } 
    11971229    # make sure there's a slash on the end if it's a directory 
    11981230    if ($before_hash_url !~ m/\/$/) { 
     
    12031235    } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) { 
    12041236 
     1237    #### TODO what is this test doing??? 
    12051238    if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) { 
    12061239 
     
    12401273        } 
    12411274    } else { 
    1242         # Turn relative file path into full path 
     1275 
     1276        # Turn relative file path into full path (inside import dir) 
    12431277        my $dirname = &File::Basename::dirname($file); 
    1244         $before_hash = &FileUtils::filenameConcatenate($dirname, $before_hash); 
     1278 
     1279        # we want to add dirname (which is raw filesystem path) onto $before_hash, (which is perl unicode aware string). Convert dirname to perl string 
     1280 
     1281        my $unicode_dirname =""; 
     1282        #my $content_encoding = $self->{'content_encoding'}; 
     1283        #my $language = $self->{'language'}; 
     1284 
     1285        # actually I think this is wrong. why should we use content encoding? 
     1286        #$self->decode_text($dirname, $content_encoding, $language, \$unicode_dirname); 
     1287        #my $filename_encoding = $self->{'filename_encoding'}; 
     1288        # filename_encoding might be auto... 
     1289 
     1290        # TODO what is the best thing to do here????? 
     1291        # try and guess default filesystem encoding, similar to deduce_filename_encoding, but without a file? 
     1292        my $filename_encoding = "utf8"; 
     1293        # copied this from set_Source_metadata in BasePlugin 
     1294        if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) { 
     1295        # Use filename_encoding to map raw filename to a Perl unicode-aware string  
     1296        $unicode_dirname = decode($filename_encoding,$dirname);      
     1297        } 
     1298        else { 
     1299        # otherwise generate %xx encoded version of filename for char > 127 
     1300        $unicode_dirname = &unicode::raw_filename_to_url_encoded($dirname); 
     1301        } 
     1302  
     1303        $before_hash = &FileUtils::filenameConcatenate($unicode_dirname, $before_hash); 
    12451304        $before_hash = $self->eval_dir_dots($before_hash);    
    1246         $before_hash =~ s@\\@/@g; # for windows          
    1247     } 
    1248  
    1249     my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);  
    1250  
    1251  
    1252 #   print STDERR "**** linkfilename = $linkfilename\n"; 
    1253 #   if (!&util::fd_exists($linkfilename)) { 
    1254 #       print STDERR "***** Warning: Could not find $linkfilename\n"; 
    1255 #   } 
    1256  
     1305        $before_hash =~ s@\\@/@g; # for windows          
     1306    } 
     1307 
     1308    my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);     
    12571309 
    12581310    # make sure there's a slash on the end if it's a directory 
     
    12601312        $before_hash .= "/" if (-d $linkfilename); 
    12611313    } 
    1262  
    1263 #   print STDERR "*** returning: $before_hash\n"; 
    1264  
    12651314    return ("http://" . $before_hash, $hash_part, 1); 
    12661315    } else {