Context Navigation

← Previous Change
Next Change →

HTMLPlug.pm

Timestamp:

2000-07-13T10:21:53+12:00 (24 years ago)

Author:

sjboddie

Message:

merged changes to trunk into New_Config_Format branch

File:

1 edited

branches/New_Config_Format-branch/gsdl/perllib/plugins/HTMLPlug.pm (modified) (10 diffs)

Legend:

Unmodified
Added
Removed

branches/New_Config_Format-branch/gsdl/perllib/plugins/HTMLPlug.pm

-              r1020
+              r1279
 sub print_usage {
-    print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
     print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
     print STDERR "  options:\n";
-    print STDERR "   -process_exp           A perl regular expression to match against filenames.\n";
-    print STDERR "                          Matching filenames will be processed by this plugin.\n";
-    print STDERR "                          Defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
-    print STDERR "                          .htm or .html (case-insensitive).\n";
     print STDERR "   -nolinks               Don't make any attempt to trap links (setting this flag may\n";
     print STDERR "                          improve speed of building/importing but any relative links within\n";
     print STDERR "                          documents will be broken).\n";
-    print STDERR "   -block_exp             Files matching this regular expression will be blocked from\n";
-    print STDERR "                          being passed to any further plugins in the list. By default\n";
-    print STDERR "                          HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png, .pdf,\n";
-    print STDERR "                          .rtf or .css file extensions.\n";
     print STDERR "   -keep_head             Don't remove headers from html files.\n";
     print STDERR "   -no_metadata           Don't attempt to extract any metadata from files.\n";
     print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to extract.\n";
+    print STDERR "                          Defaults to 'Title'\n";
+    print STDERR "                          Defaults to 'Title'.\n";
+    print STDERR "                          Use `first200` to get the first 200 characters of the body.\n";
+    print STDERR "                          Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n";
     print STDERR "   -w3mir                 Set if w3mir was used to generate input file structure.\n";
-    print STDERR "                          w3mir \n";
     print STDERR "   -assoc_files           Perl regular expression of file extensions to associate with\n";
     print STDERR "                          html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf)$'\n";
+    print STDERR "                          html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf)\$'\n";
     print STDERR "   -rename_assoc_files    Renames files associated with documents (e.g. images). Also\n";
     print STDERR "                          creates much shallower directory structure (useful when creating\n";
 …
 sub new {
     my $class = shift (@_);
     my $self = new BasPlug ();
+    my $self = new BasPlug ("HTMLPlug", @_);
     if (!parsargv::parse(\@_,
-             q^process_exp/.*/(?i)\.html?$^, \$self->{'process_exp'},
              q^nolinks^, \$self->{'nolinks'},
-             q^block_exp/.*/(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^, \$self->{'block_exp'},
              q^keep_head^, \$self->{'keep_head'},
              q^no_metadata^, \$self->{'no_metadata'},
 …
              q^w3mir^, \$self->{'w3mir'},
              q^assoc_files/.*/(?i)\.(jpe?g|gif|png|css|pdf)$^, \$self->{'assoc_files'},
+             q^rename_assoc_files^, \$self->{'rename_assoc_files'})) {
+             q^rename_assoc_files^, \$self->{'rename_assoc_files'},
+             "allow_extra_options")) {
+    print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
     &print_usage();
     die "\n";
+    }
     $self->{'aux_files'} = {};
     $self->{'dir_num'} = 0;
     $self->{'file_num'} = 0;
     return bless $self, $class;
+}
+sub is_recursive {
+    my $self = shift (@_);
+    return 0; # this is not a recursive plugin
+}
+# return number of files processed, undef if can't process
+# Note that $base_dir might be "" and that $file might
+# include directories
+sub read {
+    my $self = shift (@_);
+    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
+    my $filename = &util::filename_cat($base_dir, $file);
+    return 0 if $filename =~ /$self->{'block_exp'}/;
+    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
+    return undef;
+    }
+    $file =~ s/^[\/\\]+//;
+    $self->{'verbosity'} = $processor->{'verbosity'};
+sub get_default_block_exp {
+    my $self = shift (@_);
+    return q^(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^;
+}
+sub get_default_process_exp {
+    my $self = shift (@_);
+    return q^(?i)\.html?$^;
+}
+# do plugin specific processing of doc_obj
+sub process {
+    my $self = shift (@_);
+    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
     print STDERR "HTMLPlug: processing $file\n"
     if $self->{'verbosity'} > 1;
-    # create a new document
-    my $doc_obj = new doc ($file, "indexed_doc");
     my $cursection = $doc_obj->get_top_section();
+    # read in HTML file
+    open (FILE, $filename) || die "HTMLPlug::read - can't open $filename\n";
+    undef $/;
+    my $text = <FILE>;
+    $/ = "\n";
+    close FILE;
+    if (!defined $text || $text !~ /\w/) {
+    print STDERR "HTMLPlug: ERROR: $file contains no text\n" if $self->{'verbosity'};
+    return 0;
+    }
+    $self->extra_metadata ($doc_obj, $cursection, $metadata);
+    $self->extract_metadata (\$text, $metadata, $doc_obj, $cursection)
+    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
     unless $self->{'no_metadata'};
 …
     my $web_url = "http://$file";
     $web_url =~ s/\\/\//g; # for windows
     $doc_obj->add_metadata($cursection, "URL", $web_url);
+    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
     # remove header and footer
     if (!$self->{'keep_head'}) {
     $text =~ s/^.*?<body[^>]*>//is;
     $text =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
+    $$textref =~ s/^.*?<body[^>]*>//is;
+    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
+    }
 …
     # usemap="./#index" not handled correctly => change to "#index"
     $text =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
+    $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
         $self->replace_usemap_links($1, $2, $3)/isge;
     $text =~ s/(<(?:a|area|frame|link)\s+[^>]*?(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
+    $$textref =~ s/(<(?:a|area|frame|link)\s+[^>]*? (?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
         $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
+    }
     # trap images
     $text =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
+    $$textref =~ s/(<img[^>]*? src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
     $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
+    $doc_obj->add_text ($cursection, $text);
+    # add an OID
+    $doc_obj->set_OID();
+    # process the document
+    $processor->process($doc_obj);
+    return 1; # processed the file
+    # add text to document object
+    $doc_obj->add_utf8_text($cursection, "<pre>\n$$textref\n</pre>");
+    return 1;
+}
 …
     foreach my $field (split /,/, $self->{'metadata_fields'}) {
     # don't need to extract field if it was passed in from a previous
     # (recursive) plugin
 …
             my $value = $1;
             $value =~ s/\s+/ /gs;
             $doc_obj->add_metadata($section, $field, $value);
+            $doc_obj->add_utf8_metadata($section, $field, $value);
             next;
+        }
 …
+    }
+    # special case for Title metadata - try <title> tags
+    # then first 100 characters of text
+    # TITLE: extract the document title
     if ($field =~ /^title$/i) {
 …
             if ($title =~ /\w/) {
             $title =~ s/\s+/ /gs;
+            $doc_obj->add_metadata ($section, $field, $title);
+            $title =~ s/^\s+//;
+            $title =~ s/\s+$//;
+            $doc_obj->add_utf8_metadata ($section, $field, $title);
             next;
+            }
 …
         # if no title use first 100 characters
         my $tmptext = $$textref;
+        $tmptext =~ s/\s+/ /gs;
         $tmptext =~ s/<[^>]*>//g;
+        my $title = substr ($tmptext, 0, 100);
+        $title =~ s/\s+/ /gs;
+        $doc_obj->add_metadata ($section, $field, $title);
+    }
+    }
+}
+        $tmptext = substr ($tmptext, 0, 100);
+        $tmptext =~ s/^\s+//;
+        $tmptext =~ s/\s+$//;
+        $tmptext =~ s/\s\S*$/.../;
+        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
+        next;
+    }
+    # FIRST200: extract the first 200 characters as metadata
+    if ($field =~ /^first200$/i) {
+        my $tmptext = $$textref;
+        $tmptext =~ s/\s+/ /gs;
+        $tmptext =~ s/.*<body[^>]*>//i;
+        $tmptext =~ s/<[^>]*>//g;
+        $tmptext = substr ($tmptext, 0, 200);
+        $tmptext =~ s/^\s+//;
+        $tmptext =~ s/\s+$//;
+        $tmptext =~ s/\s\S*$/.../;
+        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
+        next;
+    }
+    # H1: extract the text between the first <H1> and </H1> tags
+    if ($field =~ /^H1$/i) {
+        my $tmptext = $$textref;
+        $tmptext =~ s/\s+/ /gs;
+        if ($tmptext =~ /<H1[^>]*>/i) {
+        $tmptext =~ s/.*<H1[^>]*>//i;
+        $tmptext =~ s/<\/H1[^>]*>.*//i;
+        $tmptext =~ s/^\s+//;
+        $tmptext =~ s/\s+$//;
+        $doc_obj->add_utf8_metadata ($section, $field, $tmptext);
+        }
+        next;
+    }
+    }
+}
 # evaluate any "../" to next directory up

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1279 for branches/New_Config_Format-branch/gsdl/perllib/plugins/HTMLPlug.pm

Legend:

branches/New_Config_Format-branch/gsdl/perllib/plugins/HTMLPlug.pm

Download in other formats: