Changeset 1602
- Timestamp:
- 2000-10-14T20:38:53+13:00 (24 years ago)
- Location:
- trunk/gsdl/perllib/plugins
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r1424 r1602 69 69 print STDERR " -markup_acronyms Added acronym metadata into document text\n\n"; 70 70 print STDERR " -extract_langauge Identify the language of the text and set as metadata\n\n"; 71 print STDERR " -first Comma seperated list of first sizes to extract from the text \n"; 72 print STDERR " into a metadata field. The fields are called 'FirstNNN'.\n"; 73 print STDERR " Defualts to '-first 200'. '-first 1000' also useful.\n"; 74 print STDERR " -extract_email Extract email addresses as metadata\n\n"; 71 75 } 72 76 … … 93 97 q^block_exp/.*/^, \$self->{'block_exp'}, 94 98 q^extract_acronyms^, \$self->{'extract_acronyms'}, 99 q^extract_email^, \$self->{'extract_email'}, 95 100 q^markup_acronyms^, \$self->{'markup_acronyms'}, 96 101 q^extract_language^, \$self->{'extract_language'}, 102 q^first/.*/200^, \$self->{'first'}, 97 103 q^date_extract^, \$self->{'date_extract'}, 98 104 "maximum_date/\\d{4}/$year", \$self->{'max_year'}, … … 316 322 } 317 323 318 # extract acronyms (and hopefully other stuff soon too). 324 # FIRSTNNN: extract the first NNN characters as metadata 325 sub extract_first_NNNN_characters { 326 my $self = shift (@_); 327 my ($textref, $doc_obj, $thissection) = @_; 328 329 foreach my $size (split /,/, $self->{'first'}) { 330 my $tmptext = $$textref; 331 $tmptext =~ s/^\s+//; 332 $tmptext =~ s/\s+$//; 333 $tmptext =~ s/\s+/ /gs; 334 $tmptext = substr ($tmptext, 0, $size); 335 $tmptext =~ s/\s\S*$/…/; 336 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext); 337 } 338 } 339 340 sub extract_email { 341 my $self = shift (@_); 342 my ($textref, $doc_obj, $thissection) = @_; 343 my $outhandle = $self->{'outhandle'}; 344 345 print $outhandle " extracting email addresses ...\n" 346 if ($self->{'verbosity'} >= 2); 347 348 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|[a-z][a-z]))/g); 349 @email = sort @email; 350 351 my @email2 = (); 352 foreach my $address (@email) { 353 if (!(join(" ",@email2) =~ m/$address/ )) { 354 push @email2, $address; 355 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address); 356 print $outhandle " extracting $address\n" 357 if ($self->{'verbosity'} >= 3); 358 } 359 } 360 print $outhandle " done extracting email addresses.\n" 361 if ($self->{'verbosity'} >= 2); 362 363 } 364 365 # extract metadata 319 366 sub auto_extract_metadata { 320 367 my $self = shift (@_); 321 368 my ($doc_obj) = @_; 322 369 370 if ($self->{'extract_email'}) { 371 my $thissection = $doc_obj->get_top_section(); 372 while (defined $thissection) { 373 my $text = $doc_obj->get_text($thissection); 374 $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./; 375 $thissection = $doc_obj->get_next_section ($thissection); 376 } 377 } 378 if ($self->{'first'}) { 379 my $thissection = $doc_obj->get_top_section(); 380 while (defined $thissection) { 381 my $text = $doc_obj->get_text($thissection); 382 $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./; 383 $thissection = $doc_obj->get_next_section ($thissection); 384 } 385 } 386 323 387 if ($self->{'extract_acronyms'}) { 324 388 my $thissection = $doc_obj->get_top_section(); … … 329 393 } 330 394 } 331 395 332 396 if ($self->{'markup_acronyms'}) { 333 397 my $thissection = $doc_obj->get_top_section(); … … 411 475 my $previous_data = $doc_obj->get_metadata($thissection, "Acronym"); 412 476 foreach my $thisAcro (@$previous_data) { 413 if ($thisAcro eq $acro->to_string()) 414 { 477 if ($thisAcro eq $acro->to_string()) { 415 478 $seen_before = "true"; 416 479 print $outhandle " already seen ". $acro->to_string() . "\n" 417 if ($self->{'verbosity'} >= 2);480 if ($self->{'verbosity'} >= 4); 418 481 } 419 482 } 420 483 421 if ($seen_before eq "false") 422 { 484 if ($seen_before eq "false") { 423 485 #write it to the file ... 424 486 $acro->write_to_file(); … … 426 488 #do the normal acronym 427 489 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string()); 428 429 if ($self->{'verbosity'} >= 1);490 print $outhandle " adding ". $acro->to_string() . "\n" 491 if ($self->{'verbosity'} >= 3); 430 492 431 # # do the KWIC (Key Word In Context) acronym432 # my @kwic = $acro->to_string_kwic();433 # foreach my $kwic (@kwic) {434 # $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC", $kwic);435 # print STDERR " adding ". $kwic . "\n"436 # if ($self->{'verbosity'} >= 2);437 # }438 493 } 439 494 } … … 460 515 461 516 1; 517 518 519 -
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r1448 r1602 57 57 print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to extract.\n"; 58 58 print STDERR " Defaults to 'Title'.\n"; 59 print STDERR " Use `first200` to get the first 200 characters of the body.\n"; 60 print STDERR " Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n"; 59 print STDERR " Use 'tag<tagname>' to have the contents of the first <tagname>\n"; 60 print STDERR " pair put in a metadata element called 'tagname' Capitalise \n"; 61 print STDERR " 'tagname' as you want the metadata capitalised in the GML \n"; 62 print STDERR " file, since the tag extraction is case insensitive.\n"; 63 print STDERR " -hunt_creator_metadata Find as much metadata as possible on authorship and place it \n"; 64 print STDERR " in the 'Creator' field. Requires the -metadata_fields flag.\n "; 61 65 print STDERR " -w3mir Set if w3mir was used to generate input file structure.\n"; 62 66 print STDERR " -assoc_files Perl regular expression of file extensions to associate with\n"; … … 79 83 q^no_metadata^, \$self->{'no_metadata'}, 80 84 q^metadata_fields/.*/Title^, \$self->{'metadata_fields'}, 85 q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'}, 81 86 q^w3mir^, \$self->{'w3mir'}, 82 87 q^assoc_files/.*/(?i)\.(jpe?g|gif|png|css|pdf)$^, \$self->{'assoc_files'}, … … 204 209 ##### possible - the following line should probably be deleted if that can be done 205 210 return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is; 206 207 211 208 212 if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) || … … 334 338 } 335 339 } 340 sub extract_first_NNNN_characters { 341 my $self = shift (@_); 342 my ($textref, $doc_obj, $thissection) = @_; 343 344 foreach my $size (split /,/, $self->{'first'}) { 345 my $tmptext = $$textref; 346 $tmptext =~ s/.*<body[^>]*>//i; 347 $tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'}); 348 $tmptext =~ s/<[^>]*>/ /g; 349 $tmptext =~ s/ / /g; 350 $tmptext =~ s/^\s+//; 351 $tmptext =~ s/\s+$//; 352 $tmptext =~ s/\s+/ /gs; 353 $tmptext = substr ($tmptext, 0, $size); 354 $tmptext =~ s/\s\S*$/…/; 355 $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext); 356 } 357 } 336 358 337 359 sub extract_metadata { … … 339 361 my ($textref, $metadata, $doc_obj, $section) = @_; 340 362 363 # if we don't want metadata, we may as well not be here ... 341 364 return if (!defined $self->{'metadata_fields'}); 365 366 # hunt for an author 367 if (defined $self->{'hunt_creator_metadata'}) { 368 for my $name (split /,/, "AUTHOR,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") { 369 if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) { 370 my $content = $1 . $2; 371 if ($content =~ /content\s*=\s*\"?(.*)\"?/is) { 372 if (defined $1) { 373 my $value = $1; 374 $value =~ s/\"$//; 375 $value =~ s/\s+/ /gs; 376 print "adding Creator of $value\n"; 377 $doc_obj->add_utf8_metadata($section, "Creator", $value); 378 } 379 } 380 } 381 } 382 } 342 383 343 384 foreach my $field (split /,/, $self->{'metadata_fields'}) { … … 355 396 $value =~ s/\"$//; 356 397 $value =~ s/\s+/ /gs; 398 $value =~ s/\".*//gs; 357 399 $doc_obj->add_utf8_metadata($section, $field, $value); 358 400 next; … … 361 403 } 362 404 363 # TITLE: extract the document title 364 405 # TITLE: extract the document title 365 406 if ($field =~ /^title$/i) { 366 367 407 # see if there's a <title> tag 368 408 if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) { … … 381 421 # if no title use first 100 characters 382 422 my $tmptext = $$textref; 383 $tmptext =~ s/\s+/ /gs;384 423 $tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'}); 385 $tmptext =~ s/<[^>]*>//g; 386 $tmptext = substr ($tmptext, 0, 100); 424 $tmptext =~ s/<[^>]*>/ /g; 387 425 $tmptext =~ s/^\s+//; 388 426 $tmptext =~ s/\s+$//; 427 $tmptext =~ s/\s+/ /gs; 428 $tmptext = substr ($tmptext, 0, 100); 389 429 $tmptext =~ s/\s\S*$/.../; 390 430 $doc_obj->add_utf8_metadata ($section, $field, $tmptext); … … 392 432 } 393 433 394 # FIRST200: extract the first 200 characters as metadata 395 396 if ($field =~ /^first200$/i) { 397 my $tmptext = $$textref; 398 $tmptext =~ s/\s+/ /gs; 399 $tmptext =~ s/.*<body[^>]*>//i; 400 $tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'}); 401 $tmptext =~ s/<[^>]*>//g; 402 $tmptext = substr ($tmptext, 0, 200); 403 $tmptext =~ s/^\s+//; 404 $tmptext =~ s/\s+$//; 405 $tmptext =~ s/\s\S*$/.../; 406 $doc_obj->add_utf8_metadata ($section, $field, $tmptext); 407 next; 408 } 409 410 # H1: extract the text between the first <H1> and </H1> tags 411 if ($field =~ /^H1$/i) { 412 my $tmptext = $$textref; 413 $tmptext =~ s/\s+/ /gs; 414 if ($tmptext =~ /<H1[^>]*>/i) { 415 $tmptext =~ s/.*<H1[^>]*>//i; 416 $tmptext =~ s/<\/H1[^>]*>.*//i; 417 $tmptext =~ s/^\s+//; 418 $tmptext =~ s/\s+$//; 419 $doc_obj->add_utf8_metadata ($section, $field, $tmptext); 420 } 421 next; 422 } 434 # tag: extract the text between the first <H1> and </H1> tags 435 if ($field =~ /^tag[a-z0-9]+$/i) { 436 my $tag = $field; 437 $tag =~ s/^tag//i; 438 my $tmptext = $$textref; 439 $tmptext =~ s/\s+/ /gs; 440 if ($tmptext =~ /<$tag[^>]*>/i) { 441 $tmptext =~ s/.*<$tag[^>]*>//i; 442 $tmptext =~ s/<\/tag[^>]*>.*//i; 443 $tmptext =~ s/<[^>]*>/ /g; 444 $tmptext =~ s/^\s+//; 445 $tmptext =~ s/\s+$//; 446 $tmptext =~ s/\s+/ /gs; 447 $doc_obj->add_utf8_metadata ($section, $tag, $tmptext); 448 } 449 next; 450 } 423 451 } 424 452 }
Note:
See TracChangeset
for help on using the changeset viewer.