Changeset 23363 for main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm
- Timestamp: 2010-12-01T11:42:27+13:00 (13 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm
r23352 r23363 186 186 # read in file ($text will be in utf8) 187 187 my $raw_text = ""; 188 $self->read_file_no_decoding 188 $self->read_file_no_decoding($filename_full_path, \$raw_text); 189 189 190 190 my $textref = \$raw_text; … … 192 192 my $closecom = '(?:-->|(?:—|—|--)>)'; 193 193 $$textref =~ s/$opencom(.*?)$closecom//gs; 194 195 # Convert entities to their UTF8 equivalents 196 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go; 197 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occassion, want it left as utf8 198 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go; 194 199 195 200 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+"; … … 209 214 210 215 # remove quotes from link at start and end if necessary 211 if ($link =~/^\"/) {212 $link =~s/^\"//;213 $link =~s/\"$//;214 } 215 216 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html 216 if ($link =~ m/^\"/) { 217 $link =~ s/^\"//; 218 $link =~ s/\"$//; 219 } 220 221 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html 217 222 # some links may just be anchor names 218 223 next unless ($link =~ /\S+/); … … 242 247 } 243 248 249 250 my $unicode_url_original_filename = decode("utf8",$url_original_filename); 251 252 ## print STDERR "*****!!! Blocking url original filename = $unicode_url_original_filename\n"; 253 254 # Allow for possibility of raw byte version (UTF8) and Unicode versions of file 244 255 $block_hash->{'file_blocks'}->{$url_original_filename} = 1; 256 $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1; 245 257 } 246 258 } … … 250 262 # filename*, it does not URL decode any filename if a file by the name of the *URL-encoded* 251 263 # string already exists in the local folder. 264 # 265 # Is the following still true?? 252 266 # Return the original filename corresponding to the parameter URL-encoded filename, and 253 267 # a decoded flag that is set to true iff URL-decoding had to be applied. 
… … 312 326 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'}); 313 327 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental 314 # build. so set it manually.315 $doc_obj-> {'source_path'} = $filename_full_path;328 # build. So set it manually. 329 $doc_obj->set_source_path($filename_full_path); 316 330 my $collect_conv_file = &util::filename_within_collection($tidy_filename); 317 331 $doc_obj->set_converted_filename($collect_conv_file); … … 387 401 my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname); 388 402 389 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {390 print STDERR "***!! file = $file\n";391 print STDERR "***!! utf8_file = $utf8_file\n";392 }403 # if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 404 # print STDERR "***!! file = $file\n"; 405 # print STDERR "***!! utf8_file = $utf8_file\n"; 406 # } 393 407 394 408 … … 764 778 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) { 765 779 766 767 780 # If web page didn't give encoding, then default to utf8 781 my $content_encoding= $self->{'content_encoding'} || "utf8"; 782 768 783 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 769 print STDERR "*** Web page didn't give encoding, defaulting to UTF8!\n"; 770 print STDERR "***** looking up $file\n"; 771 } 772 773 my $content_encoding= $self->{'content_encoding'} || "utf8"; 784 print STDERR "*** Encoding with $content_encoding href: $href\n"; 785 } 786 774 787 $href = encode($content_encoding,$href); 775 788 … … 807 820 808 821 $filename = &util::filename_cat($base_dir, $filename); 822 809 823 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) { 810 824 # we are processing a tidytmp file - want paths to be in import … … 827 841 $filename = encode($content_encoding, $opt_decode_utf8_filename); 828 842 829 830 843 # some special processing if the intended filename was 
converted to utf8, but 831 844 # the actual file still needs to be renamed 832 if (! -e $filename) {845 if (!&util::fd_exists($filename)) { 833 846 # try the original filename stored in map 834 847 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 835 print STDERR "***###!! orig filename did not exist: $filename\n"; 836 } 848 print STDERR "******!! orig filename did not exist: $filename\n"; 849 } 850 851 ## print STDERR "**** trying to look up utf8_filename: $utf8_filename\n"; 837 852 838 853 my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename}; 839 854 840 855 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 841 print STDERR "**** Trying for$original_filename\n";856 print STDERR "****** From lookup utf8_filename, now trying for: $original_filename\n"; 842 857 } 843 858 844 859 if (defined $original_filename && -e $original_filename) { 845 860 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 846 print STDERR "*** found match\n";861 print STDERR "****** Found match!\n"; 847 862 } 848 863 $filename = $original_filename; … … 891 906 $newname = &util::rename_file($newname, $self->{'file_rename_method'}); 892 907 908 ### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n"; 893 909 $doc_obj->associate_file($filename, $newname, undef, $section); 894 910 … … 1296 1312 $self->SUPER::read_file($filename, $encoding, $language, $textref); 1297 1313 1298 # Convert entities to their U TF8equivalents1314 # Convert entities to their Unicode code-point equivalents 1299 1315 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go; 1300 1316 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo;
Note: See TracChangeset for help on using the changeset viewer.