Changeset 1230


Ignore:
Timestamp:
2000-06-23T11:51:50+12:00 (24 years ago)
Author:
gwp
Message:

Added an additional H1 metadata field that extracts the text
between the first <H1> and </H1> tags. Tidied up the
other metadata fields a little.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r1220 r1230  
    6969    print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to extract.\n";
    7070    print STDERR "                          Defaults to 'Title'.\n";
    71     print STDERR "                          Use `first200` to get the first 100 characters of the body.\n";
     71    print STDERR "                          Use `first200` to get the first 200 characters of the body.\n";
     72    print STDERR "                          Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n";
    7273    print STDERR "   -w3mir                 Set if w3mir was used to generate input file structure.\n";
    7374    print STDERR "                          w3mir \n";
     
    350351
    351352    foreach my $field (split /,/, $self->{'metadata_fields'}) {
    352    
     353
    353354    # don't need to extract field if it was passed in from a previous
    354355    # (recursive) plugin
     
    368369    }
    369370   
    370     # special case for Title metadata - try <title> tags
    371     # then first 100 characters of text
     371    # TITLE: extract the document title
    372372   
    373373    if ($field =~ /^title$/i) {
     
    379379            if ($title =~ /\w/) {
    380380            $title =~ s/\s+/ /gs;
     381            $title =~ s/^\s+//;
     382            $title =~ s/\s+$//;
    381383            $doc_obj->add_utf8_metadata ($section, $field, $title);
    382384            next;
     
    389391        $tmptext =~ s/\s+/ /gs;
    390392        $tmptext =~ s/<[^>]*>//g;
    391         my $title = substr ($tmptext, 0, 100);
    392         $doc_obj->add_utf8_metadata ($section, $field, $title);
    393     }
    394 
    395     # if the user requests the first chars as metadata the extract it
     393        $tmptext = substr ($tmptext, 0, 100);
     394        $tmptext =~ s/^\s+//;
     395        $tmptext =~ s/\s+$//;
     396        $tmptext =~ s/\s\S*$/.../;
     397        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
     398        next;
     399    }
     400
     401    # FIRST200: extract the first 200 characters as metadata
    396402
    397403    if ($field =~ /^first200$/i) {
     
    401407        $tmptext =~ s/<[^>]*>//g;
    402408        $tmptext = substr ($tmptext, 0, 200);
     409        $tmptext =~ s/^\s+//;
     410        $tmptext =~ s/\s+$//;
    403411        $tmptext =~ s/\s\S*$/.../;
    404412        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
     413        next;
     414    }
     415
     416    # H1: extract the text between the first <H1> and </H1> tags
     417    if ($field =~ /^H1$/i) {
     418        my $tmptext = $$textref;
     419        $tmptext =~ s/\s+/ /gs;
     420        $tmptext =~ s/.*<H1[^>]*>//i;
     421        $tmptext =~ s/<\/H1[^>]*>.*//i;
     422        $tmptext =~ s/^\s+//;
     423        $tmptext =~ s/\s+$//;
     424        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
     425        next;
    405426    }
    406427    }
Note: See TracChangeset for help on using the changeset viewer.