Changeset 7235


Ignore:
Timestamp:
2004-04-28T13:18:45+12:00 (20 years ago)
Author:
kjdon
Message:

fixed a couple of bugs and added a bit of output to do with extracting titles

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r7202 r7235  
    604604    # one of the tags we want to match.
    605605
     606    # special case for title - we want to remember if it's been found
     607    my $found_title = 0;
    606608    # this assumes that ">" won't appear. (I don't think it's allowed to...)
    607609    $html_header =~ /^/; # match the start of the string, for \G assertion
     610   
    608611    while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
    609612    my $metatag=$1;
     
    651654        $tag = $find_fields{lc($tag)};
    652655    }
     656    if (lc($tag) eq "title") {
     657        $found_title = 1;
     658    }
    653659    print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
    654660        if ($self->{'verbosity'} > 2);
     
    658664   
    659665    # TITLE: extract the document title
    660     if (exists $find_fields{'title'} && $find_fields{'title'} == 0) {
     666    if (exists $find_fields{'title'} && !$found_title) {
    661667    # we want a title, and didn't find one in the meta tags
    662668    # see if there's a <title> tag
    663669    my $title;
    664     if ($html_header =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
     670    my $from = "";
     671    if ($html_header =~ /<title[^>]*>([^<]+)<\/title[^>]*>/is) {
    665672        $title = $1;
     673        $from = "<title> tags";
    666674    }
    667675    if (!defined $title) {
     676        $from = "first 100 chars";
    668677        # if no title use first 100 or so characters
    669678        $title = $$textref;
     
    685694    $title =~ s/^\s+//s; # in case title_sub introduced any...
    686695    $doc_obj->add_utf8_metadata ($section, 'Title', $title);
    687     print $outhandle " extracted Title metadata \"$title\"\n"
     696    print $outhandle " extracted Title metadata \"$title\" from $from\n"
    688697        if ($self->{'verbosity'} > 2);
    689     }
     698    } 
    690699
    691700    # Special, for metadata names such as tagH1 - extracts
Note: See TracChangeset for help on using the changeset viewer.