Changeset 31440


Ignore:
Timestamp:
2017-02-23T14:49:41+13:00 (5 years ago)
Author:
kjdon
Message:

Nearly there for handling Russian (etc.) subfolders in import. Need to test on Windows though. Still one part to be worked out.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r31415 r31440  
    186186   
    187187    my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path);
    188     $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding;
    189 
    190     # read in file ($text will be in utf8)
     188    $self->{'store_content_encoding'}->{$filename_full_path} = [$content_encoding, $language];
     189   
     190
     191    # read in file ($text will be in the filesystem encoding)
    191192    my $raw_text = "";
    192193    $self->read_file_no_decoding($filename_full_path, \$raw_text);
     
    296297    &util::block_filename($block_hash,$url_original_filename) if $url_original_filename ne $html_fname;
    297298
    298             # but only add the linked file to the blocklist if the current html file does not link to itself
     299    # but only add the linked file to the blocklist if the current html file does not link to itself
    299300       
    300301    }
     
    332333    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    333334
    334     # Lookup content_encoding worked out in file_block pass for this file
    335     # Store it under the local name 'content_encoding' so its nice and
    336     # easy to access
    337     $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};
     335    # Lookup content_encoding and language worked out in file_block pass for this file
     336    # Store them under the local names so they are nice and easy to access
     337    $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path}[0];
     338    $self->{'language'} = $self->{'store_content_encoding'}->{$filename_full_path}[1];
    338339
    339340    # get the input file
     
    463464    my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname);
    464465    my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname);
    465 
     466   
    466467    my $web_url = "http://";
    467468    my $utf8_web_url = "http://";
     469   
    468470    if(defined $dirname) { # local directory
     471
    469472        # Check for "ftp" in the domain name of the directory
    470473        #  structure to determine if this URL should be a ftp:// URL
     
    481484    $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/"
    482485
    483     $web_url = $web_url.$dirname.$url_encoded_file;
    484     $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file;
     486    # this local directory in import may need to be URL encoded like the file
     487    my $url_encoded_dir = &unicode::raw_filename_to_url_encoded($dirname);
     488    my $utf8_url_encoded_dir =  &unicode::raw_filename_to_utf8_url_encoded($dirname);
     489   
     490    # changed here
     491    $web_url = $web_url.$url_encoded_dir.$url_encoded_file;
     492    $utf8_web_url = $utf8_web_url.$utf8_url_encoded_dir.$utf8_url_encoded_file;
    485493    } else {
    486494    $web_url = $web_url.$url_encoded_file;
     
    493501    print STDERR "*******DEBUG: upgraded_file:       $upgraded_file\n";
    494502    print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n";
     503    print STDERR "*******DEBUG: web url:             $web_url\n";
     504    print STDERR "*******DEBUG: utf8 web url:        $utf8_web_url\n";
    495505    }
    496506
     
    847857    }
    848858
     859    # can't remember adding this :-( must have had a reason though...
     860    if ($link =~ /^\_http/ || $link =~ /^\_libraryname\_/) {
     861    # assume it is a greenstone one and leave alone
     862    return $front . $link . $back;
     863    }
     864
    849865    # attempt to sort out targets - frames are not handled
    850866    # well in this plugin and some cases will screw things
     
    912928        $href = encode($content_encoding,$href);
    913929    }
    914 
    915     $href = &unicode::raw_filename_to_utf8_url_encoded($href); 
     930   
     931    $href = &unicode::raw_filename_to_utf8_url_encoded($href); 
    916932    $href = &unicode::filename_to_url($href);
    917 
     933   
    918934    &ghtml::urlsafe ($href);
    919 
     935   
    920936    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    921937        print STDERR "******DEBUG: href=$href\n";   
    922938    }
    923 
     939   
     940    #TODO here
     941#   if ($rl ==1) {
     942        # have a relative link, we need to do URL encoding etc so it matches what has happened for that file
     943        #$href = &util::rename_file($href, $self->{'file_rename_method'});
     944#       $href = &unicode::raw_filename_to_url_encoded($href);
     945        # then, this might be url encoded, so we replace % with %25
     946#       $href = &unicode::filename_to_url($href);
     947#       print STDERR "DEBUG: url encoded href = $href\n";
     948#   }
    924949
    925950    return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
     
    10221047    # exists on the file system
    10231048    $filename = encode($content_encoding, $opt_decode_unicode_filename);
     1049   
    10241050    }
    10251051    elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) {
     
    10511077    # some special processing if the intended filename was converted to utf8, but
    10521078    # the actual file still needs to be renamed
    1053     #if (!&util::fd_exists($filename)) {
    10541079    if (!&FileUtils::fileExists($filename)) {
    10551080    # try the original filename stored in map
     
    10711096        }
    10721097        $filename = $original_filename;
    1073     }
     1098    } 
    10741099    }
    10751100   
     
    11601185    my $self = shift (@_);
    11611186    my ($link, $base_dir, $file) = @_;
    1162 
     1187 
    11631188    # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
    11641189    my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
     
    11681193        my $outhandle = $self->{'outhandle'};
    11691194        print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
    1170             if $self->{'verbosity'};
     1195            if $self->{'verbosity'};
    11711196        return ($link, "", 0);
    11721197    }
    11731198   
     1199#    my $dirname;
    11741200    if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) {
    1175         my $type = $1;
    1176         my $before_hash_file = $before_hash;
     1201    my $type = $1;
     1202    my $before_hash_file = $before_hash;
    11771203       
    11781204    if ($link =~ m/^(http|ftp):/i) {
     
    11871213    my $before_hash_url = $before_hash_file;
    11881214    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
    1189         $before_hash_url =~ s@\\@\/@g;
    1190     }
    1191    
     1215        $before_hash_url =~ s@\\@\/@g;
     1216    }
     1217   
     1218    ######## TODO need to check this for encoding stuff
    11921219    my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash_file);
    1193    
     1220    print STDERR "chekcing for existence whether relative link or not $linkfilename\n";
    11941221    my $rl = 0;
    11951222    $rl = 1 if (-e $linkfilename);
    1196 
     1223    if (-e $linkfilename) {
     1224       
     1225        print STDERR "DOES exist $linkfilename\n";
     1226    } else {
     1227        print STDERR "DOESN'T exist $linkfilename\n";
     1228    }
    11971229    # make sure there's a slash on the end if it's a directory
    11981230    if ($before_hash_url !~ m/\/$/) {
     
    12031235    } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
    12041236
     1237    #### TODO what is this test doing???
    12051238    if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
    12061239
     
    12401273        }
    12411274    } else {
    1242         # Turn relative file path into full path
     1275
     1276        # Turn relative file path into full path (inside import dir)
    12431277        my $dirname = &File::Basename::dirname($file);
    1244         $before_hash = &FileUtils::filenameConcatenate($dirname, $before_hash);
     1278
     1279        # we want to add dirname (which is raw filesystem path) onto $before_hash, (which is perl unicode aware string). Convert dirname to perl string
     1280
     1281        my $unicode_dirname ="";
     1282        #my $content_encoding = $self->{'content_encoding'};
     1283        #my $language = $self->{'language'};
     1284
     1285        # actually I think this is wrong. why should we use content encoding?
     1286        #$self->decode_text($dirname, $content_encoding, $language, \$unicode_dirname);
     1287        #my $filename_encoding = $self->{'filename_encoding'};
     1288        # filename_encoding might be auto...
     1289
     1290        # TODO what is the best thing to do here?????
     1291        # try and guess default filesystem encoding, similar to deduce_filename_encoding, but without a file?
     1292        my $filename_encoding = "utf8";
     1293        # copied this from set_Source_metadata in BasePlugin
     1294        if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) {
     1295        # Use filename_encoding to map raw filename to a Perl unicode-aware string
     1296        $unicode_dirname = decode($filename_encoding,$dirname);     
     1297        }
     1298        else {
     1299        # otherwise generate %xx encoded version of filename for char > 127
     1300        $unicode_dirname = &unicode::raw_filename_to_url_encoded($dirname);
     1301        }
     1302 
     1303        $before_hash = &FileUtils::filenameConcatenate($unicode_dirname, $before_hash);
    12451304        $before_hash = $self->eval_dir_dots($before_hash);   
    1246         $before_hash =~ s@\\@/@g; # for windows         
    1247     }
    1248 
    1249     my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);
    1250 
    1251 
    1252 #   print STDERR "**** linkfilename = $linkfilename\n";
    1253 #   if (!&util::fd_exists($linkfilename)) {
    1254 #       print STDERR "***** Warning: Could not find $linkfilename\n";
    1255 #   }
    1256 
     1305        $before_hash =~ s@\\@/@g; # for windows         
     1306    }
     1307
     1308    my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash);   
    12571309
    12581310    # make sure there's a slash on the end if it's a directory
     
    12601312        $before_hash .= "/" if (-d $linkfilename);
    12611313    }
    1262 
    1263 #   print STDERR "*** returning: $before_hash\n";
    1264 
    12651314    return ("http://" . $before_hash, $hash_part, 1);
    12661315    } else {
Note: See TracChangeset for help on using the changeset viewer.