Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1230

Timestamp:

2000-06-23T11:51:50+12:00 (24 years ago)

Author:

gwp

Message:

Added an additional H1 metadata field that extracts the text
between the first <H1> and </H1> tags. Tidied up the
other metadata fields a little.

File:

: 1 edited

trunk/gsdl/perllib/plugins/HTMLPlug.pm (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/plugins/HTMLPlug.pm

-              r1220
+              r1230
     print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to extract.\n";
     print STDERR "                          Defaults to 'Title'.\n";
+    print STDERR "                          Use `first200` to get the first 100 characters of the body.\n";
+    print STDERR "                          Use `first200` to get the first 200 characters of the body.\n";
+    print STDERR "                          Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n";
     print STDERR "   -w3mir                 Set if w3mir was used to generate input file structure.\n";
     print STDERR "                          w3mir \n";
 …
     foreach my $field (split /,/, $self->{'metadata_fields'}) {
     # don't need to extract field if it was passed in from a previous
     # (recursive) plugin
 …
+    }
+    # special case for Title metadata - try <title> tags
+    # then first 100 characters of text
+    # TITLE: extract the document title
     if ($field =~ /^title$/i) {
 …
             if ($title =~ /\w/) {
             $title =~ s/\s+/ /gs;
+            $title =~ s/^\s+//;
+            $title =~ s/\s+$//;
             $doc_obj->add_utf8_metadata ($section, $field, $title);
             next;
 …
         $tmptext =~ s/\s+/ /gs;
         $tmptext =~ s/<[^>]*>//g;
+        my $title = substr ($tmptext, 0, 100);
+        $doc_obj->add_utf8_metadata ($section, $field, $title);
+    }
+    # if the user requests the first chars as metadata then extract it
+        $tmptext = substr ($tmptext, 0, 100);
+        $tmptext =~ s/^\s+//;
+        $tmptext =~ s/\s+$//;
+        $tmptext =~ s/\s\S*$/.../;
+        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
+        next;
+    }
+    # FIRST200: extract the first 200 characters as metadata
     if ($field =~ /^first200$/i) {
 …
         $tmptext =~ s/<[^>]*>//g;
         $tmptext = substr ($tmptext, 0, 200);
+        $tmptext =~ s/^\s+//;
+        $tmptext =~ s/\s+$//;
         $tmptext =~ s/\s\S*$/.../;
         $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
+        next;
+    }
+    # H1: extract the text between the first <H1> and </H1> tags
+    if ($field =~ /^H1$/i) {
+        my $tmptext = $$textref;
+        $tmptext =~ s/\s+/ /gs;
+        $tmptext =~ s/.*<H1[^>]*>//i;
+        $tmptext =~ s/<\/H1[^>]*>.*//i;
+        $tmptext =~ s/^\s+//;
+        $tmptext =~ s/\s+$//;
+        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
+        next;
+    }
+    }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1230

Legend:

trunk/gsdl/perllib/plugins/HTMLPlug.pm

Download in other formats: