Changeset 1190


Ignore:
Timestamp:
2000-05-25T09:27:45+12:00 (24 years ago)
Author:
gwp
Message:

The first 200 chars of body text can now be extracted as metadata
by adding 'first200' to the -metadata_fields argument. A potential
problem extracting the title was resolved.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r1020 r1190  
    6868    print STDERR "   -no_metadata           Don't attempt to extract any metadata from files.\n";
    6969    print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to extract.\n";
    70     print STDERR "                          Defaults to 'Title'\n";
     70    print STDERR "                          Defaults to 'Title'.\n";
     71    print STDERR "                          Use `first200` to get the first 200 characters of the body.\n";
    7172    print STDERR "   -w3mir                 Set if w3mir was used to generate input file structure.\n";
    7273    print STDERR "                          w3mir \n";
     
    386387        # if no title use first 100 characters
    387388        my $tmptext = $$textref;
     389        $tmptext =~ s/\s+/ /gs;
    388390        $tmptext =~ s/<[^>]*>//g;
    389391        my $title = substr ($tmptext, 0, 100);
    390         $title =~ s/\s+/ /gs;
    391392        $doc_obj->add_metadata ($section, $field, $title);
    392393    }
    393     }
    394 }
     394
     395    # if the user requests the first chars as metadata then extract it
     396
     397    if ($field =~ /^first200$/i) {
     398        my $tmptext = $$textref;
     399        $tmptext =~ s/\s+/ /gs;
     400        $tmptext =~ s/.*<body[^>]*>//i;
     401        $tmptext =~ s/<[^>]*>//g;
     402        $tmptext = substr ($tmptext, 0, 200);
     403        $tmptext =~ s/\s\S*$/.../;
     404        $doc_obj->add_metadata ($section, $field, $tmptext);
     405    }
     406    }
     407}
     408
    395409
    396410# evaluate any "../" to next directory up
Note: See TracChangeset for help on using the changeset viewer.