Changeset 7235
- Timestamp:
- 2004-04-28T13:18:45+12:00 (20 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r7202 r7235 604 604 # one of the tags we want to match. 605 605 606 # special case for title - we want to remember if its been found 607 my $found_title = 0; 606 608 # this assumes that ">" won't appear. (I don't think it's allowed to...) 607 609 $html_header =~ /^/; # match the start of the string, for \G assertion 610 608 611 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) { 609 612 my $metatag=$1; … … 651 654 $tag = $find_fields{lc($tag)}; 652 655 } 656 if (lc($tag) eq "title") { 657 $found_title = 1; 658 } 653 659 print $outhandle " extracted \"$tag\" metadata \"$value\"\n" 654 660 if ($self->{'verbosity'} > 2); … … 658 664 659 665 # TITLE: extract the document title 660 if (exists $find_fields{'title'} && $find_fields{'title'} == 0) { 666 if (exists $find_fields{'title'} && !$found_title) { 661 667 # we want a title, and didn't find one in the meta tags 662 668 # see if there's a <title> tag 663 669 my $title; 664 if ($html_header =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) { 670 my $from = ""; 671 if ($html_header =~ /<title[^>]*>([^<]+)<\/title[^>]*>/is) { 665 672 $title = $1; 673 $from = "<title> tags"; 666 674 } 667 675 if (!defined $title) { 676 $from = "first 100 chars"; 668 677 # if no title use first 100 or so characters 669 678 $title = $$textref; … … 685 694 $title =~ s/^\s+//s; # in case title_sub introduced any... 686 695 $doc_obj->add_utf8_metadata ($section, 'Title', $title); 687 print $outhandle " extracted Title metadata \"$title\" \n" 696 print $outhandle " extracted Title metadata \"$title\" from $from\n" 688 697 if ($self->{'verbosity'} > 2); 689 } 698 } 690 699 691 700 # Special, for metadata names such as tagH1 - extracts
Note: See TracChangeset for help on using the changeset viewer.