Changeset 23335 for main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm
- Timestamp: 2010-11-19T13:29:29+13:00 (13 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm
r22951 r23335 181 181 my @file_blocks; 182 182 183 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename_full_path); 183 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path); 184 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding; 184 185 185 186 # read in file ($text will be in utf8) … … 229 230 # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename 230 231 my $utf8_link = ""; 231 $self->decode_text($link,$ encoding,$language,\$utf8_link);232 $self->decode_text($link,$content_encoding,$language,\$utf8_link); 232 233 233 234 $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename; … … 272 273 273 274 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 275 276 # Lookup content_encoding worked out in file_block pass for this file 277 # Store it under the local name 'content_encoding' so its nice and 278 # easy to access 279 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path}; 280 274 281 # get the input file 275 282 my $input_filename = $file; … … 311 318 $self->set_Source_metadata($doc_obj, $filename_no_path); 312 319 } 320 321 delete $self->{'store_content_encoding'}->{$filename_full_path}; 322 $self->{'content_encoding'} = undef; 323 313 324 return ($process_status,$doc_obj); 314 325 } … … 367 378 # links, so even if 'file_is_url' is off, still need to store info 368 379 369 my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$"); 370 my $utf8_file = $self->filename_to_utf8_metadata($file); 371 $utf8_file =~ s/&\#095;/_/g; 380 my ($tailname,$dirname) = &File::Basename::fileparse($file); 381 print STDERR "***!! 
file = $file\n"; 382 # my $utf8_file = $self->filename_to_utf8_metadata($file); 383 # $utf8_file =~ s/&\#095;/_/g; 384 my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname); 385 print STDERR "***!! utf8_file = $utf8_file\n"; 386 372 387 my $web_url = "http://"; 373 388 if(defined $dirname) { # local directory … … 535 550 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection); 536 551 } 552 537 553 return 1; 538 554 } … … 737 753 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is; 738 754 739 740 755 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) || 741 756 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) { 757 758 759 # If web page didn't give encoding, then default to utf8 760 print "*************** looking up $file\n"; 761 762 my $content_encoding= $self->{'content_encoding'} || "utf8"; 763 $href = encode($content_encoding,$href); 764 765 $href = &unicode::raw_filename_to_url_encoded($href); 766 $href = &unicode::filename_to_url($href); 767 742 768 &ghtml::urlsafe ($href); 769 print STDERR "***!!! href=$href\n"; 770 743 771 return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back; 744 772 } else { 745 # link is to some other type of file (e gimage) so we'll773 # link is to some other type of file (e.g., an image) so we'll 746 774 # need to associate that file 747 775 return $front . $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section) . $back; … … 769 797 $filename =~ s/([\\\/])tidytmp([\\\/])/$1import$2/; 770 798 } 771 # Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some 772 # situations. If the *original* file's name was in URL encoding, the following method will not decode it. 799 800 # Replace %XX's in URL with decoded value if required. Note that the 801 # filename may include the %XX in some situations. 
If the *original* 802 # file's name was in URL encoding, the following method will not decode 803 # it. 773 804 my $utf8_filename = $filename; 774 $filename = $self->opt_url_decode($utf8_filename); 805 my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename); 806 807 my $content_encoding= $self->{'content_encoding'} || "utf8"; 808 809 # The filenames that come through the HTML file have been decoded 810 # into Unicode aware Perl strings. Need to convert them back 811 # to their initial raw-byte encoding to match the file that 812 # exists on the file system 813 $filename = encode($content_encoding, $opt_decode_utf8_filename); 814 775 815 776 816 # some special processing if the intended filename was converted to utf8, but … … 778 818 if (!-e $filename) { 779 819 # try the original filename stored in map 780 my $original_filename = $self->{'utf8_to_original_filename'}->{$filename}; 820 print STDERR "***###!! orig filename did not exist: $filename\n"; 821 822 my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename}; 823 824 print STDERR "**** Trying for $original_filename\n"; 825 781 826 if (defined $original_filename && -e $original_filename) { 827 print STDERR "*** found match\n"; 782 828 $filename = $original_filename; 783 829 } … … 1118 1164 $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'}); 1119 1165 $title =~ s/^\s+//s; # in case title_sub introduced any... 1120 $doc_obj->add_utf8_metadata ($section, 'Title', $title); 1166 print STDERR "**** adding Title: ", Encode::encode("utf8",$title), "\n"; 1167 $doc_obj->add_utf8_metadata ($section, "Title", $title); 1121 1168 print $outhandle " extracted Title metadata \"$title\" from $from\n" 1122 1169 if ($self->{'verbosity'} > 2);
Note: See TracChangeset for help on using the changeset viewer.