Changeset 1230
- Timestamp: 2000-06-23T11:51:50+12:00 (24 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r1220 r1230 69 69 print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to extract.\n"; 70 70 print STDERR " Defaults to 'Title'.\n"; 71 print STDERR " Use `first200` to get the first 100 characters of the body.\n"; 71 print STDERR " Use `first200` to get the first 200 characters of the body.\n"; 72 print STDERR " Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n"; 72 73 print STDERR " -w3mir Set if w3mir was used to generate input file structure.\n"; 73 74 print STDERR " w3mir \n"; … … 350 351 351 352 foreach my $field (split /,/, $self->{'metadata_fields'}) { 352 353 353 354 # don't need to extract field if it was passed in from a previous 354 355 # (recursive) plugin … … 368 369 } 369 370 370 # special case for Title metadata - try <title> tags 371 # then first 100 characters of text 371 # TITLE: extract the document title 372 372 373 373 if ($field =~ /^title$/i) { … … 379 379 if ($title =~ /\w/) { 380 380 $title =~ s/\s+/ /gs; 381 $title =~ s/^\s+//; 382 $title =~ s/\s+$//; 381 383 $doc_obj->add_utf8_metadata ($section, $field, $title); 382 384 next; … … 389 391 $tmptext =~ s/\s+/ /gs; 390 392 $tmptext =~ s/<[^>]*>//g; 391 my $title = substr ($tmptext, 0, 100); 392 $doc_obj->add_utf8_metadata ($section, $field, $title); 393 } 394 395 # if the user requests the first chars as metadata the extract it 393 $tmptext = substr ($tmptext, 0, 100); 394 $tmptext =~ s/^\s+//; 395 $tmptext =~ s/\s+$//; 396 $tmptext =~ s/\s\S*$/.../; 397 $doc_obj->add_utf8_metadata ($section, $field, $tmptext); 398 next; 399 } 400 401 # FIRST200: extract the first 200 characters as metadata 396 402 397 403 if ($field =~ /^first200$/i) { … … 401 407 $tmptext =~ s/<[^>]*>//g; 402 408 $tmptext = substr ($tmptext, 0, 200); 409 $tmptext =~ s/^\s+//; 410 $tmptext =~ s/\s+$//; 403 411 $tmptext =~ s/\s\S*$/.../; 404 412 $doc_obj->add_utf8_metadata ($section, $field, $tmptext); 413 next; 414 } 415 416 # H1: extract the text between the first 
<H1> and </H1> tags 417 if ($field =~ /^H1$/i) { 418 my $tmptext = $$textref; 419 $tmptext =~ s/\s+/ /gs; 420 $tmptext =~ s/.*<H1[^>]*>//i; 421 $tmptext =~ s/<\/H1[^>]*>.*//i; 422 $tmptext =~ s/^\s+//; 423 $tmptext =~ s/\s+$//; 424 $doc_obj->add_utf8_metadata ($section, $field, $tmptext); 425 next; 405 426 } 406 427 }
Note: See TracChangeset for help on using the changeset viewer.