Context Navigation

← Previous Change
Next Change →

Changeset 1243 for trunk/gsdl

Timestamp:

2000-06-27T09:38:51+12:00 (24 years ago)

Author:

sjboddie

Message:

Caught HTMLPlug up with BasPlug. A few minor changes to some supporting
files (for new BasPlug options).

Location:

trunk/gsdl

Files:

4 edited

bin/script/import.pl (modified) (1 diff)
perllib/mgbuilder.pm (modified) (1 diff)
perllib/plugin.pm (modified) (2 diffs)
perllib/plugins/HTMLPlug.pm (modified) (5 diffs)

Legend:

Unmodified
Added
Removed

trunk/gsdl/bin/script/import.pl

r1031	r1243
135	135
136	136	# load all the plugins
137		$pluginfo = &plugin::load_plugins ($plugins);
	137	$pluginfo = &plugin::load_plugins ($plugins, $verbosity);
138	138	if (scalar(@$pluginfo) == 0) {
139	139	print STDERR "No plugins were loaded.\n";

trunk/gsdl/perllib/mgbuilder.pm

r1072	r1243
99	99
100	100	# load all the plugins
101		$self->{'pluginfo'} = &plugin::load_plugins ($plugins);
	101	$self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity);
102	102	if (scalar(@{$self->{'pluginfo'}}) == 0) {
103	103	print STDERR "No plugins were loaded.\n";

trunk/gsdl/perllib/plugin.pm

-              r835
+              r1243
 sub load_plugins {
     my ($plugin_list) = @_;
+    my ($plugin_list, $verbosity) = @_;
     my @plugin_objects = ();
+    $verbosity = 2 unless defined $verbosity;
     foreach $pluginoptions (@$plugin_list) {
 …
     die "$@" if $@;
+    # initialize plugin
+    $plugobj->init($verbosity);
     # add this object to the list
     push (@plugin_objects, $plugobj);

trunk/gsdl/perllib/plugins/HTMLPlug.pm

-              r1231
+              r1243
     print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
     print STDERR "  options:\n";
-    print STDERR "   -process_exp           A perl regular expression to match against filenames.\n";
-    print STDERR "                          Matching filenames will be processed by this plugin.\n";
-    print STDERR "                          Defaults to '(?i)\.html?\$' i.e. all documents ending in\n";
-    print STDERR "                          .htm or .html (case-insensitive).\n";
     print STDERR "   -nolinks               Don't make any attempt to trap links (setting this flag may\n";
     print STDERR "                          improve speed of building/importing but any relative links within\n";
     print STDERR "                          documents will be broken).\n";
-    print STDERR "   -block_exp             Files matching this regular expression will be blocked from\n";
-    print STDERR "                          being passed to any further plugins in the list. By default\n";
-    print STDERR "                          HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png, .pdf,\n";
-    print STDERR "                          .rtf or .css file extensions.\n";
     print STDERR "   -keep_head             Don't remove headers from html files.\n";
     print STDERR "   -no_metadata           Don't attempt to extract any metadata from files.\n";
 …
 sub new {
     my $class = shift (@_);
     my $self = new BasPlug (@_);
+    my $self = new BasPlug ("HTMLPlug", @_);
     if (!parsargv::parse(\@_,
-             q^process_exp/.*/(?i)\.html?$^, \$self->{'process_exp'},
              q^nolinks^, \$self->{'nolinks'},
-             q^block_exp/.*/(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^, \$self->{'block_exp'},
              q^keep_head^, \$self->{'keep_head'},
              q^no_metadata^, \$self->{'no_metadata'},
 …
              q^rename_assoc_files^, \$self->{'rename_assoc_files'},
              "allow_extra_options")) {
     &print_usage();
     die "\n";
+    }
     $self->{'aux_files'} = {};
     $self->{'dir_num'} = 0;
     $self->{'file_num'} = 0;
     return bless $self, $class;
+}
+sub is_recursive {
+    my $self = shift (@_);
+    return 0; # this is not a recursive plugin
+}
+# return number of files processed, undef if can't process
+# Note that $base_dir might be "" and that $file might
+# include directories
+sub read {
+    my $self = shift (@_);
+    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
+    my $filename = &util::filename_cat($base_dir, $file);
+    return 0 if $filename =~ /$self->{'block_exp'}/;
+    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
+    return undef;
+    }
+    $file =~ s/^[\/\\]+//;
+    $self->{'verbosity'} = $processor->{'verbosity'};
+sub get_default_block_exp {
+    my $self = shift (@_);
+    return q^(?i)\.(gif|jpe?g|png|pdf|rtf|css)$^;
+}
+sub get_default_process_exp {
+    my $self = shift (@_);
+    return q^(?i)\.html?$^;
+}
+# do plugin specific processing of doc_obj
+sub process {
+    my $self = shift (@_);
+    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
     print STDERR "HTMLPlug: processing $file\n"
     if $self->{'verbosity'} > 1;
-    # create a new document
-    my $doc_obj = new doc ($file, "indexed_doc");
     my $cursection = $doc_obj->get_top_section();
+    # read in HTML file ($text will be in utf8)
+    my $text = "";
+    $self->read_file ($filename, \$text);
+    if ($text !~ /\w/) {
+    print STDERR "HTMLPlug: ERROR: $file contains no text\n" if $self->{'verbosity'};
+    return 0;
+    }
+    $self->extra_metadata ($doc_obj, $cursection, $metadata);
+    $self->extract_metadata (\$text, $metadata, $doc_obj, $cursection)
+    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection)
     unless $self->{'no_metadata'};
 …
     # remove header and footer
     if (!$self->{'keep_head'}) {
     $text =~ s/^.*?<body[^>]*>//is;
     $text =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
+    $$textref =~ s/^.*?<body[^>]*>//is;
+    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
+    }
 …
     # usemap="./#index" not handled correctly => change to "#index"
     $text =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
+    $$textref =~ s/(<img[^>]*?usemap\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
         $self->replace_usemap_links($1, $2, $3)/isge;
     $text =~ s/(<(?:a|area|frame|link)\s+[^>]*?(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
+    $$textref =~ s/(<(?:a|area|frame|link)\s+[^>]*?(?:href|src)\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
         $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
+    }
     # trap images
     $text =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
+    $$textref =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
     $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
-    $doc_obj->add_utf8_text ($cursection, $text);
-    # add an OID
-    $doc_obj->set_OID();
-    # process the document
-    $processor->process($doc_obj);
-    return 1; # processed the file
+}

Note: See TracChangeset for help on using the changeset viewer.