Changeset 23371 for main/trunk
- Timestamp:
- 2010-12-02T00:36:37+13:00 (13 years ago)
- Location:
- main/trunk/greenstone2/perllib
- Files:
-
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/basebuildproc.pm
r23182 r23371 516 516 517 517 # special case for URL metadata 518 if ($field =~ /^URL$/i) { 519 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] }); 518 if ($field =~ m/^URL$/i) { 519 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, 520 $value, { 'section' => [ $section_OID ] }); 520 521 } 521 522 -
main/trunk/greenstone2/perllib/ghtml.pm
r23362 r23371 65 65 sub urlsafe 66 66 { 67 # protect any hash's that are part of an entity, e.g. a 68 $_[0] =~ s/&#(.*?);/&%23$1;/g; 69 70 # and the usual suspects 67 71 $_[0] =~ s/[\x09\x20\x22\x3c\x3e\x5b\x5c\x5d\x5e\x60\x7b\x7c\x7d\x7e\?\=\&\+_\/]/sprintf("%%%2x", ord($&))/gse; 68 72 } -
main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm
r23363 r23371 219 219 } 220 220 221 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html 221 # remove any anchor names, e.g. foo.html#name becomes foo.html 222 # but watch out for any #'s that are part of entities, such as α 223 $link =~ s/([^&])\#.*$/$1/s; 224 222 225 # some links may just be anchor names 223 226 next unless ($link =~ /\S+/); … … 350 353 if ($ENV{'GSDLOS'} =~ m/^windows/i) { 351 354 # this makes life so much easier... perl can cope with unix-style '/'s. 352 $base_dir =~ s@(\\)+@/@g; 353 $file =~ s@(\\)+@/@g; 354 } 355 $base_dir =~ s@(\\)+@/@g; 356 $file =~ s@(\\)+@/@g; 357 } 358 359 my $filename = &util::filename_cat($base_dir,$file); 360 my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir); 361 my $upgraded_filename = &util::upgrade_if_dos_filename($filename); 362 363 if ($ENV{'GSDLOS'} =~ m/^windows/i) { 364 # And again 365 $upgraded_base_dir =~ s@(\\)+@/@g; 366 $upgraded_filename =~ s@(\\)+@/@g; 367 368 # Need to make sure there is a '/' on the end of upgraded_base_dir 369 if ($upgraded_base_dir !~ m/\/$/) { 370 $upgraded_base_dir .= "/"; 371 } 372 } 373 my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir); 355 374 356 375 # reset per-doc stuff... … … 371 390 372 391 my $arrSections = []; 373 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $ file)/isge;392 $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge; 374 393 375 394 if (scalar(@$arrSections)) { … … 395 414 # links, so even if 'file_is_url' is off, still need to store info 396 415 397 my ($tailname,$dirname) = &File::Basename::fileparse($ file);416 my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file); 398 417 399 418 # my $utf8_file = $self->filename_to_utf8_metadata($file); 400 419 # $utf8_file =~ s/&\#095;/_/g; 401 420 my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname); 402 403 # if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {404 # print STDERR "***!! file = $file\n";405 # print STDERR "***!! utf8_file = $utf8_file\n";406 # }407 421 408 422 … … 426 440 } 427 441 $web_url =~ s/\\/\//g; 442 443 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 444 print STDERR "******* upgraded_file: $upgraded_file\n"; 445 print STDERR "******* adding URL metadata: $utf8_file\n"; 446 } 447 448 428 449 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url); 429 450 … … 463 484 # doesn't necessarily mean there are Section tags in 464 485 # the document 465 $self->process_section(\$text, $ base_dir, $file, $doc_obj, $cursection);486 $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection); 466 487 } 467 488 while ($comment =~ s/$lt(.*?)$gt//s) { … … 509 530 } 510 531 if ($cursection ne "") { 511 print $outhandle "HTMLPlugin: WARNING: $ file contains unmatched <Section></Section> tags\n";532 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n"; 512 533 } 513 534 … … 517 538 if (!$found_something) { 518 539 if ($self->{'verbosity'} > 2) { 519 print $outhandle "HTMLPlugin: WARNING: $ file appears to contain no Section tags so\n";540 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n"; 520 541 print $outhandle " will be processed as a single section document\n"; 521 542 } 522 543 523 544 # go ahead and process single-section document 524 $self->process_section($textref, $ base_dir, $file, $doc_obj, $cursection);545 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection); 525 546 526 547 # if document contains no Section tags we'll go ahead … … 532 553 533 554 } else { 534 print $outhandle "HTMLPlugin: WARNING: $ file contains the following text outside\n";555 print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n"; 535 556 print $outhandle " of the final closing </Section> tag. This text will\n"; 536 557 print $outhandle " be ignored."; … … 552 573 # been processed already but we should print the warning 553 574 # as above and extract metadata 554 print $outhandle "HTMLPlugin: WARNING: $ file appears to contain no Section tags and\n";575 print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n"; 555 576 print $outhandle " is blank or empty. Metadata will be assigned if present.\n"; 556 577 } … … 570 591 571 592 # single section document 572 $self->process_section($textref, $ base_dir, $file, $doc_obj, $cursection);593 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection); 573 594 } 574 595 … … 778 799 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) { 779 800 780 # If web page didn't give encoding, then default to utf8 781 my $content_encoding= $self->{'content_encoding'} || "utf8"; 782 783 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 784 print STDERR "*** Encoding with $content_encoding href: $href\n"; 785 } 786 787 $href = encode($content_encoding,$href); 801 if ($ENV{'GSDLOS'} =~ m/^windows$/) { 802 803 # Don't do any encoding for now, as not clear what 804 # the right thing to do is to support filename 805 # encoding on Windows when they are not UTF16 806 # 807 } 808 else { 809 # => Unix-based system 810 811 # If web page didn't give encoding, then default to utf8 812 my $content_encoding= $self->{'content_encoding'} || "utf8"; 813 814 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 815 print STDERR "**** Encoding with '$content_encoding', href: $href\n"; 816 } 817 818 $href = encode($content_encoding,$href); 819 } 788 820 789 821 $href = &unicode::raw_filename_to_url_encoded($href); … … 791 823 792 824 &ghtml::urlsafe ($href); 825 793 826 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 794 print STDERR "*** !!!href=$href\n";827 print STDERR "****** href=$href\n"; 795 828 } 796 829 … … 922 955 my ($link, $base_dir, $file) = @_; 923 956 924 my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/; 957 # strip off hash part, e.g. #foo, but watch out for any entities, e.g. α 958 my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/; 925 959 926 960 $hash_part = "" if !defined $hash_part; -
main/trunk/greenstone2/perllib/unicode.pm
r23362 r23371 780 780 my $str_out = $str_in; 781 781 782 $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig; 782 783 $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig; 783 784 $str_out =~ s/&#([0-9]+);/chr($1)/eig; 784 $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;785 785 786 786 return $str_out; -
main/trunk/greenstone2/perllib/util.pm
r23362 r23371 1030 1030 my ($filename,$within_dir) = @_; 1031 1031 1032 my $dirsep = &util::get_dirsep();1033 if ($within_dir !~ m/$dirsep$/) { 1032 if ($within_dir !~ m/[\/\\]$/) { 1033 my $dirsep = &util::get_dirsep(); 1034 1034 $within_dir .= $dirsep; 1035 1035 } 1036 1036 1037 1037 $within_dir =~ s/\\/\\\\/g; # escape DOS style file separator 1038 1038 1039 1039 if ($filename =~ m/^$within_dir(.*)$/) { 1040 1040 $filename = $1; … … 1093 1093 sub upgrade_if_dos_filename 1094 1094 { 1095 my ($filename_full_path ) = @_;1095 my ($filename_full_path,$and_encode) = @_; 1096 1096 1097 1097 if ($ENV{'GSDLOS'} =~ m/^windows$/i) { … … 1102 1102 # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone) 1103 1103 $filename_full_path =~ s/^(.)/\l$1/; 1104 if ((defined $and_encode) && ($and_encode)) { 1105 $filename_full_path = encode("utf8",$filename_full_path); 1106 } 1104 1107 } 1105 1108
Note:
See TracChangeset
for help on using the changeset viewer.