Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1602

Timestamp:

2000-10-14T20:38:53+13:00 (24 years ago)

Author:

say1

Message:

Metadata extraction work (email addresses, generalised HTML tags, first N characters, etc.)

Location:

trunk/gsdl/perllib/plugins

Files:

: 2 edited

BasPlug.pm (modified) (7 diffs)
HTMLPlug.pm (modified) (9 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/BasPlug.pm

-              r1424
+              r1602
     print STDERR "   -markup_acronyms  Added acronym metadata into document text\n\n";
     print STDERR "   -extract_langauge Identify the language of the text and set as metadata\n\n";
+    print STDERR "   -first            Comma seperated list of first sizes to extract from the text \n";
+    print STDERR "                     into a metadata field. The fields are called 'FirstNNN'.\n";
+    print STDERR "                     Defualts to '-first 200'. '-first 1000' also useful.\n";
+    print STDERR "   -extract_email    Extract email addresses as metadata\n\n";
+}
 …
              q^block_exp/.*/^, \$self->{'block_exp'},
              q^extract_acronyms^, \$self->{'extract_acronyms'},
+             q^extract_email^, \$self->{'extract_email'},
              q^markup_acronyms^, \$self->{'markup_acronyms'},
              q^extract_language^, \$self->{'extract_language'},
+             q^first/.*/200^, \$self->{'first'},
              q^date_extract^, \$self->{'date_extract'},
              "maximum_date/\\d{4}/$year", \$self->{'max_year'},
 …
+}
+# extract acronyms (and hopefully other stuff soon too).
+# FIRSTNNN: extract the first NNN characters of a section as metadata.
+#
+# Arguments:
+#   $textref     - reference to the text of the section
+#   $doc_obj     - document object; receives the metadata through
+#                  add_utf8_metadata()
+#   $thissection - section the metadata is attached to
+#
+# The 'first' option holds a comma separated list of sizes.  For each size
+# a "First<size>" metadata value is added containing the whitespace
+# normalised text.  Text longer than <size> is cut at <size> characters,
+# trimmed back to the last complete word and terminated with an ellipsis
+# entity; shorter text is stored unchanged.
+sub extract_first_NNNN_characters {
+    my $self = shift (@_);
+    my ($textref, $doc_obj, $thissection) = @_;
+
+    return unless defined $self->{'first'};
+
+    # The normalised text is identical for every size, so build it once.
+    my $cleantext = $$textref;
+    $cleantext =~ s/^\s+//;
+    $cleantext =~ s/\s+$//;
+    $cleantext =~ s/\s+/ /gs;
+
+    foreach my $size (split /,/, $self->{'first'}) {
+        # tolerate "200, 1000" and skip empty or non-numeric entries
+        $size =~ s/^\s+//;
+        $size =~ s/\s+$//;
+        next unless $size =~ /^\d+$/;
+
+        my $tmptext = $cleantext;
+        # Only shorten (and mark with an ellipsis) text that really exceeds
+        # the requested size; otherwise the last word of a short text would
+        # be thrown away for no reason.
+        if (length ($tmptext) > $size) {
+            $tmptext = substr ($tmptext, 0, $size);
+            $tmptext =~ s/\s\S*$/&#8230;/;
+        }
+        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
+    }
+}
+# EMAIL: extract the email addresses found in a section as metadata.
+#
+# Arguments:
+#   $textref     - reference to the text of the section
+#   $doc_obj     - document object; receives the metadata through
+#                  add_utf8_metadata()
+#   $thissection - section the metadata is attached to
+#
+# Every distinct address (compared case-insensitively) is added once as
+# "emailAddress" metadata, in sorted order.  Progress messages go to the
+# plugin's 'outhandle' depending on 'verbosity'.
+sub extract_email {
+    my $self = shift (@_);
+    my ($textref, $doc_obj, $thissection) = @_;
+    my $outhandle = $self->{'outhandle'};
+
+    print $outhandle " extracting email addresses ...\n"
+        if ($self->{'verbosity'} >= 2);
+
+    # Match case-insensitively and accept any alphabetic top level domain.
+    # The look-ahead stops a match from ending in the middle of a domain
+    # label (previously "foo@bar.net" was recorded as "foo@bar.ne").
+    my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+[a-z]{2,})(?![-a-z0-9])/gi);
+
+    # De-duplicate with an exact-match lookup.  Interpolating the address
+    # into a pattern would treat '.' and '+' as regex metacharacters and
+    # would drop addresses that are merely substrings of earlier ones.
+    my %seen = ();
+    foreach my $address (sort @email) {
+        next if $seen{lc $address}++;
+        $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address);
+        print $outhandle "  extracting $address\n"
+            if ($self->{'verbosity'} >= 3);
+    }
+
+    print $outhandle " done extracting email addresses.\n"
+        if ($self->{'verbosity'} >= 2);
+}
+# extract metadata
 sub auto_extract_metadata {
     my $self = shift (@_);
     my ($doc_obj) = @_;
+    if ($self->{'extract_email'}) {
+    my $thissection = $doc_obj->get_top_section();
+    while (defined $thissection) {
+        my $text = $doc_obj->get_text($thissection);
+        $self->extract_email (\$text, $doc_obj, $thissection) if $text =~ /./;
+        $thissection = $doc_obj->get_next_section ($thissection);
+    }
+    }
+    if ($self->{'first'}) {
+    my $thissection = $doc_obj->get_top_section();
+    while (defined $thissection) {
+        my $text = $doc_obj->get_text($thissection);
+        $self->extract_first_NNNN_characters (\$text, $doc_obj, $thissection) if $text =~ /./;
+        $thissection = $doc_obj->get_next_section ($thissection);
+    }
+    }
     if ($self->{'extract_acronyms'}) {
     my $thissection = $doc_obj->get_top_section();
 …
+    }
+    }
     if ($self->{'markup_acronyms'}) {
     my $thissection = $doc_obj->get_top_section();
 …
     my $previous_data = $doc_obj->get_metadata($thissection, "Acronym");
     foreach my $thisAcro (@$previous_data) {
+        if ($thisAcro eq $acro->to_string())
+        {
+        if ($thisAcro eq $acro->to_string()) {
         $seen_before = "true";
         print $outhandle "  already seen ". $acro->to_string() . "\n"
             if ($self->{'verbosity'} >= 2);
+            if ($self->{'verbosity'} >= 4);
+        }
+    }
+    if ($seen_before eq "false")
+    {
+    if ($seen_before eq "false") {
         #write it to the file ...
         $acro->write_to_file();
 …
         #do the normal acronym
         $doc_obj->add_utf8_metadata($thissection, "Acronym",  $acro->to_string());
         print $outhandle "  adding ". $acro->to_string() . "\n"
             if ($self->{'verbosity'} >= 1);
+        print $outhandle "  adding ". $acro->to_string() . "\n"
+        if ($self->{'verbosity'} >= 3);
-#       # do the KWIC (Key Word In Context) acronym
-#       my @kwic = $acro->to_string_kwic();
-#       foreach my $kwic (@kwic) {
-#       $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC",  $kwic);
-#       print STDERR "   adding ".  $kwic . "\n"
-#           if ($self->{'verbosity'} >= 2);
-#       }
+    }
+    }
 …
 ;

trunk/gsdl/perllib/plugins/HTMLPlug.pm

-              r1448
+              r1602
     print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to extract.\n";
     print STDERR "                          Defaults to 'Title'.\n";
+    print STDERR "                          Use `first200` to get the first 200 characters of the body.\n";
+    print STDERR "                          Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n";
+    print STDERR "                          Use 'tag<tagname>' to have the contents of the first <tagname>\n";
+    print STDERR "                          pair put in a metadata element called 'tagname' Capitalise \n";
+    print STDERR "                          'tagname' as you want the metadata capitalised in the GML \n";
+    print STDERR "                          file, since the tag extraction is case insensitive.\n";
+    print STDERR "   -hunt_creator_metadata Find as much metadata as possible on authorship and place it \n";
+    print STDERR "                          in the 'Creator' field. Requires the -metadata_fields flag.\n ";
     print STDERR "   -w3mir                 Set if w3mir was used to generate input file structure.\n";
     print STDERR "   -assoc_files           Perl regular expression of file extensions to associate with\n";
 …
              q^no_metadata^, \$self->{'no_metadata'},
              q^metadata_fields/.*/Title^, \$self->{'metadata_fields'},
+             q^hunt_creator_metadata^, \$self->{'hunt_creator_metadata'},
              q^w3mir^, \$self->{'w3mir'},
              q^assoc_files/.*/(?i)\.(jpe?g|gif|png|css|pdf)$^, \$self->{'assoc_files'},
 …
     ##### possible - the following line should probably be deleted if that can be done
     return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;
     if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
 …
+    }
+}
+# FIRSTNNN (HTML): extract the first NNN characters of the document body
+# as metadata.
+#
+# Arguments:
+#   $textref     - reference to the HTML text of the section
+#   $doc_obj     - document object; receives the metadata through
+#                  add_utf8_metadata()
+#   $thissection - section the metadata is attached to
+#
+# Everything up to and including the <body> tag is discarded, the optional
+# 'title_sub' pattern is removed, tags and &nbsp; entities become blanks and
+# whitespace is normalised.  For each size in the comma separated 'first'
+# option a "First<size>" value is added; text longer than <size> is cut at
+# <size> characters, trimmed back to the last complete word and terminated
+# with an ellipsis entity, shorter text is stored unchanged.
+sub extract_first_NNNN_characters {
+    my $self = shift (@_);
+    my ($textref, $doc_obj, $thissection) = @_;
+
+    return unless defined $self->{'first'};
+
+    # The cleaned-up text is identical for every size, so build it once.
+    my $cleantext = $$textref;
+    # /s lets ".*" cross line breaks; without it only the line holding the
+    # <body> tag is stripped and the <head> content leaks into the result.
+    $cleantext =~ s/.*<body[^>]*>//is;
+    $cleantext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'});
+    $cleantext =~ s/<[^>]*>/ /g;
+    $cleantext =~ s/&nbsp;/ /g;
+    $cleantext =~ s/^\s+//;
+    $cleantext =~ s/\s+$//;
+    $cleantext =~ s/\s+/ /gs;
+
+    foreach my $size (split /,/, $self->{'first'}) {
+        # tolerate "200, 1000" and skip empty or non-numeric entries
+        $size =~ s/^\s+//;
+        $size =~ s/\s+$//;
+        next unless $size =~ /^\d+$/;
+
+        my $tmptext = $cleantext;
+        # Only shorten (and mark with an ellipsis) text that really exceeds
+        # the requested size.
+        if (length ($tmptext) > $size) {
+            $tmptext = substr ($tmptext, 0, $size);
+            $tmptext =~ s/\s\S*$/&#8230;/;
+        }
+        $doc_obj->add_utf8_metadata ($thissection, "First$size", $tmptext);
+    }
+}
 sub extract_metadata {
 …
     my ($textref, $metadata, $doc_obj, $section) = @_;
+    # if we don't want metadata, we may as well not be here ...
     return if (!defined $self->{'metadata_fields'});
+    # hunt for an author
+    if (defined $self->{'hunt_creator_metadata'}) {
+    for my $name (split /,/, "AUTHOR,CREATOR,DC.CREATOR,DC.CREATOR.CORPORATENAME") {
+        if ($$textref =~ /<meta(\s*?)(?:name|http-equiv)\s*=\s*\"?$name\"?([^>]*)/is) {
+        my $content = $1 . $2;
+        if ($content =~ /content\s*=\s*\"?(.*)\"?/is) {
+            if (defined $1) {
+            my $value = $1;
+            $value =~ s/\"$//;
+            $value =~ s/\s+/ /gs;
+            print "adding Creator of $value\n";
+            $doc_obj->add_utf8_metadata($section, "Creator", $value);
+            }
+        }
+        }
+    }
+    }
     foreach my $field (split /,/, $self->{'metadata_fields'}) {
 …
             $value =~ s/\"$//;
             $value =~ s/\s+/ /gs;
+            $value =~ s/\".*//gs;
             $doc_obj->add_utf8_metadata($section, $field, $value);
             next;
 …
+    }
+    # TITLE: extract the document title
+    # TITLE: extract the document title
     if ($field =~ /^title$/i) {
         # see if there's a <title> tag
         if ($$textref =~ /<title[^>]*>([^<]*)<\/title[^>]*>/is) {
 …
         # if no title use first 100 characters
         my $tmptext = $$textref;
-        $tmptext =~ s/\s+/ /gs;
         $tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'});
+        $tmptext =~ s/<[^>]*>//g;
+        $tmptext = substr ($tmptext, 0, 100);
+        $tmptext =~ s/<[^>]*>/ /g;
         $tmptext =~ s/^\s+//;
         $tmptext =~ s/\s+$//;
+        $tmptext =~ s/\s+/ /gs;
+        $tmptext = substr ($tmptext, 0, 100);
         $tmptext =~ s/\s\S*$/.../;
         $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
 …
+    }
+    # FIRST200: extract the first 200 characters as metadata
+    if ($field =~ /^first200$/i) {
+        my $tmptext = $$textref;
+        $tmptext =~ s/\s+/ /gs;
+        $tmptext =~ s/.*<body[^>]*>//i;
+        $tmptext =~ s/$self->{'title_sub'}// if (defined $self->{'title_sub'});
+        $tmptext =~ s/<[^>]*>//g;
+        $tmptext = substr ($tmptext, 0, 200);
+        $tmptext =~ s/^\s+//;
+        $tmptext =~ s/\s+$//;
+        $tmptext =~ s/\s\S*$/.../;
+        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
+        next;
+    }
+    # H1: extract the text between the first <H1> and </H1> tags
+    if ($field =~ /^H1$/i) {
+        my $tmptext = $$textref;
+        $tmptext =~ s/\s+/ /gs;
+        if ($tmptext =~ /<H1[^>]*>/i) {
+        $tmptext =~ s/.*<H1[^>]*>//i;
+        $tmptext =~ s/<\/H1[^>]*>.*//i;
+        $tmptext =~ s/^\s+//;
+        $tmptext =~ s/\s+$//;
+        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
+        }
+        next;
+    }
+        # tag<tagname>: extract the text between the first <tagname> and </tagname> tags
+        if ($field =~ /^tag[a-z0-9]+$/i) {
+        my $tag = $field;
+        $tag =~ s/^tag//i;
+            my $tmptext = $$textref;
+            $tmptext =~ s/\s+/ /gs;
+            if ($tmptext =~ /<$tag[^>]*>/i) {
+                $tmptext =~ s/.*<$tag[^>]*>//i;
+                $tmptext =~ s/<\/tag[^>]*>.*//i;
+        $tmptext =~ s/<[^>]*>/ /g;
+                $tmptext =~ s/^\s+//;
+                $tmptext =~ s/\s+$//;
+        $tmptext =~ s/\s+/ /gs;
+                $doc_obj->add_utf8_metadata ($section, $tag, $tmptext);
+            }
+            next;
+        }
+    }
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1602

Legend:

trunk/gsdl/perllib/plugins/BasPlug.pm

trunk/gsdl/perllib/plugins/HTMLPlug.pm

Download in other formats: