Changeset 1602


Ignore:
Timestamp:
2000-10-14T20:38:53+13:00 (24 years ago)
Author:
say1
Message:

metadata extraction work. (email addresses, generalised HTML tags, first N characters etc)

Location:
trunk/gsdl/perllib/plugins
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1424 r1602  
    6969    print STDERR "   -markup_acronyms  Added acronym metadata into document text\n\n";
    7070    print STDERR "   -extract_langauge Identify the language of the text and set as metadata\n\n";
     71    print STDERR "   -first            Comma seperated list of first sizes to extract from the text \n";
     72    print STDERR "                     into a metadata field. The fields are called 'FirstNNN'.\n";
     73    print STDERR "                     Defualts to '-first 200'. '-first 1000' also useful.\n";
     74    print STDERR "   -extract_email    Extract email addresses as metadata\n\n";
    7175}
    7276
     
    9397             q^block_exp/.*/^, \$self->{'block_exp'},
    9498             q^extract_acronyms^, \$self->{'extract_acronyms'},
     99             q^extract_email^, \$self->{'extract_email'},
    95100             q^markup_acronyms^, \$self->{'markup_acronyms'},
    96101             q^extract_language^, \$self->{'extract_language'},
     102             q^first/.*/200^, \$self->{'first'},
    97103             q^date_extract^, \$self->{'date_extract'},
    98104             "maximum_date/\\d{4}/$year", \$self->{'max_year'},
     
    316322}
    317323
    318 # extract acronyms (and hopefully other stuff soon too).
     324# FIRSTNNN: extract the first NNN characters as metadata
     325sub extract_first_NNNN_characters {
     326    my $self = shift (@_);
     327    my ($textref, $doc_obj, $thissection) = @_;
     328   
     329    foreach my $size (split /,/, $self->{'first'}) {
     330    my $tmptext =  $$textref;
     331    $tmptext =~ s/^\s+//;
     332    $tmptext =~ s/\s+$//;
     333    $tmptext =~ s/\s+/ /gs;
     334    $tmptext = substr ($tmptext, 0, $size);
     335    $tmptext =~ s/\s\S*$/…/;
     336    $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
     337    }
     338}
     339
     340sub extract_email {
     341    my $self = shift (@_);
     342    my ($textref, $doc_obj, $thissection) = @_;
     343    my $outhandle = $self->{'outhandle'};
     344
     345    print $outhandle " extracting email addresses ...\n"
     346    if ($self->{'verbosity'} >= 2);
     347   
     348    my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|[a-z][a-z]))/g);
     349    @email = sort @email;
     350   
     351    my @email2 = ();
     352    foreach my $address (@email) {
     353    if (!(join(" ",@email2) =~ m/$address/ )) {
     354        push @email2, $address;
     355        $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
     356        print $outhandle "  extracting $address\n"
     357        if ($self->{'verbosity'} >= 3);
     358    }
     359    }
     360    print $outhandle " done extracting email addresses.\n"
     361    if ($self->{'verbosity'} >= 2);
     362
     363}
     364
     365# extract metadata
    319366sub auto_extract_metadata {
    320367    my $self = shift (@_);
    321368    my ($doc_obj) = @_;
    322 
     369   
     370    if ($self->{'extract_email'}) {
     371    my $thissection = $doc_obj->get_top_section();
     372    while (defined $thissection) {
     373        my $text = $doc_obj->get_text($thissection);
     374        $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
     375        $thissection = $doc_obj->get_next_section ($thissection);
     376    }
     377    }   
     378    if ($self->{'first'}) {
     379    my $thissection = $doc_obj->get_top_section();
     380    while (defined $thissection) {
     381        my $text = $doc_obj->get_text($thissection);
     382        $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
     383        $thissection = $doc_obj->get_next_section ($thissection);
     384    }
     385    }   
     386   
    323387    if ($self->{'extract_acronyms'}) {
    324388    my $thissection = $doc_obj->get_top_section();
     
    329393    }
    330394    }
    331 
     395   
    332396    if ($self->{'markup_acronyms'}) {
    333397    my $thissection = $doc_obj->get_top_section();
     
    411475    my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
    412476    foreach my $thisAcro (@$previous_data) {
    413         if ($thisAcro eq $acro->to_string())
    414         {
     477        if ($thisAcro eq $acro->to_string()) {
    415478        $seen_before = "true";
    416479        print $outhandle "  already seen ". $acro->to_string() . "\n"
    417             if ($self->{'verbosity'} >= 2);
     480            if ($self->{'verbosity'} >= 4);
    418481        }       
    419482    }
    420483
    421     if ($seen_before eq "false")
    422     {
     484    if ($seen_before eq "false") {
    423485        #write it to the file ...
    424486        $acro->write_to_file();
     
    426488        #do the normal acronym
    427489        $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string());
    428         print $outhandle "  adding ". $acro->to_string() . "\n"
    429             if ($self->{'verbosity'} >= 1);
     490        print $outhandle "  adding ". $acro->to_string() . "\n"
     491        if ($self->{'verbosity'} >= 3);
    430492       
    431 #       # do the KWIC (Key Word In Context) acronym
    432 #       my @kwic = $acro->to_string_kwic();
    433 #       foreach my $kwic (@kwic) {
    434 #       $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC",  $kwic);
    435 #       print STDERR "   adding ".  $kwic . "\n"
    436 #           if ($self->{'verbosity'} >= 2);
    437 #       }
    438493    }
    439494    }
     
    460515
    4615161;
     517
     518
     519
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r1448 r1602  
    5757    print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to extract.\n";
    5858    print STDERR "                          Defaults to 'Title'.\n";
    59     print STDERR "                          Use `first200` to get the first 200 characters of the body.\n";
    60     print STDERR "                          Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n";
     59    print STDERR "                          Use 'tag<tagname>' to have the contents of the first <tagname>\n";
     60    print STDERR "                          pair put in a metadata element called 'tagname' Capitalise \n";
     61    print STDERR "                          'tagname' as you want the metadata capitalised in the GML \n";
     62    print STDERR "                          file, since the tag extraction is case insensitive.\n";
     63    print STDERR "   -hunt_creator_metadata Find as much metadata as possible on authorship and place it \n";
     64    print STDERR "                          in the 'Creator' field. Requires the -metadata_fields flag.\n ";
    6165    print STDERR "   -w3mir                 Set if w3mir was used to generate input file structure.\n";
    6266    print STDERR "   -assoc_files           Perl regular expression of file extensions to associate with\n";
     
    7983             q^no_metadata^, \$self->{'no_metadata'},
    8084             q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
     85             q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
    8186             q^w3mir^, \$self->{'w3mir'},
    8287             q^assoc_files/.*/(?i)\.(jpe?g|gif|png|css|pdf)$^, \$self->{'assoc_files'},
     
    204209    ##### possible - the following line should probably be deleted if that can be done
    205210    return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;
    206 
    207211
    208212    if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
     
    334338    }
    335339}
     340sub extract_first_NNNN_characters {
     341    my $self = shift (@_);
     342    my ($textref, $doc_obj, $thissection) = @_;
     343   
     344    foreach my $size (split /,/, $self->{'first'}) {
     345    my $tmptext =  $$textref;
     346    $tmptext =~ s/.*<body[^>]*>//i;
     347    $tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'});
     348    $tmptext =~ s/<[^>]*>/ /g;
     349    $tmptext =~ s/&nbsp;/ /g;
     350    $tmptext =~ s/^\s+//;
     351    $tmptext =~ s/\s+$//;
     352    $tmptext =~ s/\s+/ /gs;
     353    $tmptext = substr ($tmptext, 0, $size);
     354    $tmptext =~ s/\s\S*$/&#8230;/;
     355    $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
     356    }
     357}
    336358
    337359sub extract_metadata {
     
    339361    my ($textref, $metadata, $doc_obj, $section) = @_;
    340362
     363    # if we don't want metadata, we may as well not be here ...
    341364    return if (!defined $self->{'metadata_fields'});
     365
     366    # hunt for an author
     367    if (defined $self->{'hunt_creator_metadata'}) {
     368    for my $name (split /,/, "AUTHOR,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") {
     369        if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) {
     370        my $content = $1 . $2;
     371        if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
     372            if (defined $1) {
     373            my $value = $1;
     374            $value =~ s/\"$//;
     375            $value =~ s/\s+/ /gs;
     376            print "adding Creator of $value\n";
     377            $doc_obj->add_utf8_metadata($section, "Creator", $value);
     378            }
     379        }
     380        }
     381    }
     382    }
    342383
    343384    foreach my $field (split /,/, $self->{'metadata_fields'}) {
     
    355396            $value =~ s/\"$//;
    356397            $value =~ s/\s+/ /gs;
     398            $value =~ s/\".*//gs;
    357399            $doc_obj->add_utf8_metadata($section, $field, $value);
    358400            next;
     
    361403    }
    362404   
    363     # TITLE: extract the document title
    364    
     405    # TITLE: extract the document title
    365406    if ($field =~ /^title$/i) {
    366 
    367407        # see if there's a <title> tag
    368408        if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
     
    381421        # if no title use first 100 characters
    382422        my $tmptext = $$textref;
    383         $tmptext =~ s/\s+/ /gs;
    384423        $tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'});
    385         $tmptext =~ s/<[^>]*>//g;
    386         $tmptext = substr ($tmptext, 0, 100);
     424        $tmptext =~ s/<[^>]*>/ /g;
    387425        $tmptext =~ s/^\s+//;
    388426        $tmptext =~ s/\s+$//;
     427        $tmptext =~ s/\s+/ /gs;
     428        $tmptext = substr ($tmptext, 0, 100);
    389429        $tmptext =~ s/\s\S*$/.../;
    390430        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
     
    392432    }
    393433
    394     # FIRST200: extract the first 200 characters as metadata
    395 
    396     if ($field =~ /^first200$/i) {
    397         my $tmptext = $$textref;
    398         $tmptext =~ s/\s+/ /gs;
    399         $tmptext =~ s/.*<body[^>]*>//i;
    400         $tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'});
    401         $tmptext =~ s/<[^>]*>//g;
    402         $tmptext = substr ($tmptext, 0, 200);
    403         $tmptext =~ s/^\s+//;
    404         $tmptext =~ s/\s+$//;
    405         $tmptext =~ s/\s\S*$/.../;
    406         $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
    407         next;
    408     }
    409 
    410     # H1: extract the text between the first <H1> and </H1> tags
    411     if ($field =~ /^H1$/i) {
    412         my $tmptext = $$textref;
    413         $tmptext =~ s/\s+/ /gs;
    414         if ($tmptext =~ /<H1[^>]*>/i) {
    415         $tmptext =~ s/.*<H1[^>]*>//i;
    416         $tmptext =~ s/<\/H1[^>]*>.*//i;
    417         $tmptext =~ s/^\s+//;
    418         $tmptext =~ s/\s+$//;
    419         $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
    420         }
    421         next;
    422     }
     434        # tag: extract the text between the first <H1> and </H1> tags
     435        if ($field =~ /^tag[a-z0-9]+$/i) {
     436        my $tag = $field;
     437        $tag =~ s/^tag//i;
     438            my $tmptext = $$textref;
     439            $tmptext =~ s/\s+/ /gs;
     440            if ($tmptext =~ /<$tag[^>]*>/i) {
     441                $tmptext =~ s/.*<$tag[^>]*>//i;
     442                $tmptext =~ s/<\/tag[^>]*>.*//i;
     443        $tmptext =~ s/<[^>]*>/ /g;
     444                $tmptext =~ s/^\s+//;
     445                $tmptext =~ s/\s+$//;
     446        $tmptext =~ s/\s+/ /gs;
     447                $doc_obj->add_utf8_metadata ($section, $tag, $tmptext);
     448            }
     449            next;
     450        }
    423451    }
    424452}
Note: See TracChangeset for help on using the changeset viewer.