Changeset 8509
- Timestamp:
- 2004-11-11T13:53:21+13:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r8366 r8509 42 42 use util; 43 43 use parsargv; 44 use XMLParser; 44 45 45 46 sub BEGIN { … … 135 136 $self->{'dir_num'} = 0; 136 137 $self->{'file_num'} = 0; 138 137 139 return bless $self, $class; 138 140 } … … 143 145 my $self = shift (@_); 144 146 145 return q^(?i)\.(gif|jpe?g|jpe| png|css|js)$^;147 return q^(?i)\.(gif|jpe?g|jpe|jpg|png|css)$^; 146 148 } 147 149 … … 153 155 } 154 156 155 157 sub metadata_read { 158 my $self = shift (@_); 159 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_; 160 161 my $outhandle = $self->{'outhandle'}; 162 163 my $filename = $file; 164 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/; 165 166 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/; 167 168 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) { 169 return undef; # can't recognise 170 } 171 172 # Do encoding stuff 173 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); 174 175 # read in file ($text will be in utf8) 176 my $text = ""; 177 $self->read_file ($filename, $encoding, $language, \$text); 178 179 $self->store_block_files (\$text, $filename); 180 181 return 1; 182 } 183 184 sub store_block_files 185 { 186 my $self =shift (@_); 187 my ($textref, $filename) = @_; 188 189 my $html_fname = $filename; 190 my @file_blocks; 191 192 my $opencom = '(?:<!--|<!(?:—|—|--))'; 193 my $closecom = '(?:-->|(?:—|—|--)>)'; 194 $$textref =~ s/$opencom(.*?)$closecom//gs; 195 196 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+"; 197 my @img_matches = ($$textref =~ m/<img[^>]*?src\s*=\s*($attval)[^>]*>/igs); 198 my @usemap_matches = ($$textref =~ m/<img[^>]*?usemap\s*=\s*($attval)[^>]*>/igs); 199 my @link_matches = ($$textref =~ m/<link[^>]*?href\s*=\s*($attval)[^>]*>/igs); 200 201 202 foreach my $link (@img_matches, @usemap_matches, @link_matches) { 203 204 # remove quotes from link at start and end if necessary 205 if ($link=~/^\"/) { 206 $link=~s/^\"//; 207 $link=~s/\"$//; 208 } 209 210 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html 211 212 if ($link !~ s@^/@@ && $link !~ /^([A-Z]:?)\\/) { 213 # Turn relative file path into full path 214 my $dirname = &File::Basename::dirname($filename); 215 $link = &util::filename_cat($dirname, $link); 216 } 217 $link = $self->eval_dir_dots($link); 218 $self->{'file_blocks'}->{$link} = 1; 219 } 220 } 221 156 222 # do plugin specific processing of doc_obj 157 223 sub process { … … 178 244 my $cursection = $doc_obj->get_top_section(); 179 245 180 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection) 246 $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection) 181 247 unless $self->{'no_metadata'} || $self->{'description_tags'}; 182 248 … … 189 255 190 256 if ($self->{'description_tags'}) { 191 192 257 # remove the html header - note that doing this here means any 193 258 # sections defined within the header will be lost (so all <Section> 194 259 # tags must appear within the body of the HTML) 260 my ($head_keep) = ($$textref =~ m/^(.*?)<body[^>]*>/is); 261 195 262 $$textref =~ s/^.*?<body[^>]*>//is; 196 263 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg; … … 198 265 my $opencom = '(?:<!--|<!(?:—|—|--))'; 199 266 my $closecom = '(?:-->|(?:—|—|--)>)'; 267 200 268 my $lt = '(?:<|<)'; 201 269 my $gt = '(?:>|>)'; … … 213 281 } 214 282 while ($comment =~ s/$lt(.*?)$gt//s) { 215 216 283 my $tag = $1; 217 284 if ($tag eq "Section") { … … 253 320 if ($$textref =~ /\S/) { 254 321 if (!$found_something) { 255 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n"; 256 print $outhandle " will be processed as a single section document\n"; 257 322 if ($self->{'verbosity'} > 2) { 323 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n"; 324 print $outhandle " will be processed as a single section document\n"; 325 } 326 258 327 # go ahead and process single-section document 259 328 $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection); … … 262 331 # and extract metadata (this won't have been done 263 332 # above as the -description_tags option prevents it) 264 $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection) 333 my $complete_text = $head_keep.$doc_obj->get_text($cursection); 334 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection) 265 335 unless $self->{'no_metadata'}; 266 336 … … 269 339 print $outhandle " of the final closing </Section> tag. This text will\n"; 270 340 print $outhandle " be ignored."; 341 271 342 my ($text); 272 343 if (length($$textref) > 30) { … … 280 351 } elsif (!$found_something) { 281 352 282 # may get to here if document contained no valid Section 283 # tags but did contain some comments. The text will have 284 # been processed already but we should print the warning 285 # as above and extract metadata 286 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags so\n"; 287 print $outhandle " will be processed as a single section document\n"; 288 289 $self->extract_metadata (\$doc_obj->get_text($cursection), $metadata, $doc_obj, $cursection) 353 if ($self->{'verbosity'} > 2) { 354 # may get to here if document contained no valid Section 355 # tags but did contain some comments. The text will have 356 # been processed already but we should print the warning 357 # as above and extract metadata 358 print $outhandle "HTMLPlug: WARNING: $file appears to contain no Section tags and\n"; 359 print $outhandle " is blank or empty. Metadata will be assigned if present.\n"; 360 } 361 362 my $complete_text = $head_keep.$doc_obj->get_text($cursection); 363 $self->extract_metadata (\$complete_text, $metadata, $doc_obj, $cursection) 290 364 unless $self->{'no_metadata'}; 291 365 } … … 339 413 my ($front, $link, $back, $base_dir, 340 414 $file, $doc_obj, $section) = @_; 415 341 416 # remove quotes from link at start and end if necessary 342 417 if ($link=~/^\"/) { … … 357 432 358 433 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file); 359 434 360 435 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section); 361 436 my $anchor_name = $img_file; … … 387 462 388 463 my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/; 389 464 390 465 ##### leave all these links alone (they won't be picked up by intermediate 391 466 ##### pages). I think that's safest when dealing with frames, targets etc. … … 404 479 &ghtml::urlsafe ($href); 405 480 return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back; 406 407 481 } else { 408 482 # link is to some other type of file (eg image) so we'll … … 434 508 return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part; 435 509 } 436 437 510 if ($self->{'rename_assoc_files'}) { 438 511 if (defined $self->{'aux_files'}->{$href}) { … … 446 519 $doc_obj->associate_file($filename, $newname, undef, $section); 447 520 return "_httpdocimg_/$newname"; 448 449 521 } else { 450 522 ($newname) = $filename =~ /([^\/\\]*)$/; … … 460 532 461 533 my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/; 462 534 463 535 $hash_part = "" if !defined $hash_part; 464 536 if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) { … … 468 540 return ($link, "", 0); 469 541 } 470 542 471 543 if ($before_hash =~ s@^((?:http|ftp|file)://)@@i) { 472 544 my $type = $1; … … 481 553 482 554 my $linkfilename = &util::filename_cat ($base_dir, $before_hash); 483 555 484 556 my $rl = 0; 485 557 $rl = 1 if (-e $linkfilename); … … 519 591 $before_hash=$win_before_hash; 520 592 } 521 522 593 } 523 594 else { … … 525 596 $before_hash =~ s@^$base_dir/@@; 526 597 } 527 528 598 } 529 530 599 } else { 531 600 # Turn relative file path into full path … … 536 605 537 606 my $linkfilename = &util::filename_cat ($base_dir, $before_hash); 538 539 607 # make sure there's a slash on the end if it's a directory 540 608 if ($before_hash !~ /\/$/) { … … 543 611 544 612 return ("http://" . $before_hash, $hash_part, 1); 545 546 613 } else { 547 614 # mailto, news, nntp, telnet, javascript or gopher link … … 611 678 } 612 679 680 613 681 # find the header in the html file, which has the meta tags 614 682 $$textref =~ m@<head>(.*?)</head>@si; 615 683 616 684 my $html_header=$1; 617 618 685 # go through every <meta... tag defined in the html and see if it is 619 686 # one of the tags we want to match. … … 623 690 # this assumes that ">" won't appear. (I don't think it's allowed to...) 624 691 $html_header =~ /^/; # match the start of the string, for \G assertion 625 692 626 693 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) { 627 694 my $metatag=$1; … … 688 755 $from = "<title> tags"; 689 756 } 757 690 758 if (!defined $title) { 691 759 $from = "first 100 chars"; … … 748 816 my $self = shift (@_); 749 817 my ($filename) = @_; 750 751 818 my $dirsep_os = &util::get_os_dirsep(); 752 819 my @dirsep = split(/$dirsep_os/,$filename); … … 756 823 if ($d eq "..") { 757 824 pop(@eval_dirs); 758 825 759 826 } elsif ($d eq ".") { 760 827 # do nothing! … … 765 832 } 766 833 834 # Need to fiddle with number of elements in @eval_dirs if the 835 # first one is the empty string. This is because of a 836 # modification to util::filename_cat that supresses the addition 837 # of a leading '/' character (or \ if windows) (intended to help 838 # filename cat with relative paths) if the first entry in the 839 # array is the empty string. Making the array start with *two* 840 # empty strings is a way to defeat this "smart" option. 841 # 842 if (scalar(@eval_dirs) > 0) { 843 if ($eval_dirs[0] eq ""){ 844 unshift(@eval_dirs,""); 845 } 846 } 767 847 return &util::filename_cat(@eval_dirs); 768 848 }
Note:
See TracChangeset
for help on using the changeset viewer.