Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1244

Timestamp:

2000-06-27T17:10:07+12:00 (24 years ago)

Author:

sjboddie

Message:

Caught up most general plugins (i.e. the ones in gsdlhome/perllib/plugins)
with changes to BasPlug so that they can all now use the new general plugin
options. Those I didn't do were FoxPlug (as it's not actually used anywhere
and I don't know what it does) and WebPlug (as it's kind of a work in
progress and doesn't really work anyway). All plugins will still work
(including all the collection-specific ones that are lying around); some
of them just won't have access to the general options.
I also wrote a short Perl script (pluginfo.pl) that prints out all the
options available to a given plugin.

Location:

Files:

: 1 added
: 12 edited

bin/script/pluginfo.pl (added)
perllib/acronym.pm (modified) (1 diff)
perllib/plugin.pm (modified) (1 diff)
perllib/plugins/ArcPlug.pm (modified) (3 diffs)
perllib/plugins/BasPlug.pm (modified) (13 diffs)
perllib/plugins/EMAILPlug.pm (modified) (8 diffs)
perllib/plugins/GMLPlug.pm (modified) (7 diffs)
perllib/plugins/HBPlug.pm (modified) (8 diffs)
perllib/plugins/HBSPlug.pm (modified) (4 diffs)
perllib/plugins/HTMLPlug.pm (modified) (4 diffs)
perllib/plugins/IndexPlug.pm (modified) (5 diffs)
perllib/plugins/RecPlug.pm (modified) (3 diffs)
perllib/plugins/TEXTPlug.pm (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/acronym.pm

r1242	r1244
27	27
28	28	use strict;
29		use diagnostics;
	29	#use diagnostics;
30	30
31	31	package acronym;

trunk/gsdl/perllib/plugin.pm

r1243	r1244
51	51	map { $_ = "\"$_\""; } @$pluginoptions;
52	52	my $options = join (",", @$pluginoptions);
	53	$options =~ s/\$/\\\$/g;
53	54	eval ("\$plugobj = new \$pluginname($options)");
54	55	die "$@" if $@;

trunk/gsdl/perllib/plugins/ArcPlug.pm

-              r809
+              r1244
+}
+use strict;
 sub new {
     my ($class) = @_;
     my $self = new BasPlug ();
+    my $self = new BasPlug ("ArcPlug", @_);
     return bless $self, $class;
 …
 sub read {
     my $self = shift (@_);
     ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
+    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
     my $count = 0;
     # see if this has a archives information file within it
     $archive_info_filename = &util::filename_cat($base_dir,$file,"archives.inf");
+    my $archive_info_filename = &util::filename_cat($base_dir,$file,"archives.inf");
     if (-e $archive_info_filename) {
 …
     # process each file
     foreach $subfile (@$file_list) {
+    foreach my $subfile (@$file_list) {
         last if ($maxdocs != -1 && $count >= $maxdocs);

trunk/gsdl/perllib/plugins/BasPlug.pm

-              r1242
+              r1244
 use doc;
 sub print_usage {
+sub print_general_usage {
     my ($plugin_name) = @_;
-    print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
-    print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
     print STDERR "\n  usage: plugin $plugin_name [options]\n\n";
-    print STDERR "  currently supported general options are:\n";
     print STDERR "   -input_encoding   The encoding of the source documents. Documents will be\n";
     print STDERR "                     converted from these encodings and stored internally as\n";
 …
+}
+# print_usage should be overridden for any sub-classes having
+# their own plugin specific options
+sub print_usage {
+    print STDERR "\nThis plugin has no plugin specific options\n\n";
+}
 sub new {
     my $class = shift (@_);
 …
              q^extract_acronyms^, \$self->{'extract_acronyms'},
              "allow_extra_options")) {
+    &print_usage($plugin_name);
+    print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n";
+    print STDERR "available to all plugins). Check your collect.cfg configuration file.\n";
+        &print_general_usage($plugin_name);
     die "\n";
+    }
 …
     # set process_exp and block_exp to defaults unless they were
     # explicitly set
+    if ((!$self->is_recursive()) &&
+    if ((!$self->is_recursive()) and
     (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) {
     $self->{'process_exp'} = $self->get_default_process_exp ();
     if ($self->{'process_exp'} eq "") {
         warn ref($self) . " Warning: Non-recursive plugin has no process_exp so will have no effect\n";
+        warn ref($self) . " Warning: Non-recursive plugin has no process_exp\n";
+    }
+    }
 …
     $self->{'block_exp'} = $self->get_default_block_exp ();
+    }
+    # handle input_encoding aliases
+    $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";
+    $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";
+}
 …
 # process() function and let this read() function keep control.
+#
+# recursive plugins (e.g. RecPlug) and specialized plugins like those
+# capable of processing many documents within a single file (e.g.
+# GMLPlug) should normally implement their own version of read()
+#
 # Return number of files processed, undef if can't process
 # Note that $base_dir might be "" and that $file might
 …
     my $filename = &util::filename_cat($base_dir, $file);
     return 0 if $filename =~ /$self->{'block_exp'}/;
+    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
     if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
     return undef;
 …
     # create a new document
     my $doc_obj = new doc ($file, "indexed_doc");
-    my $cursection =
     # read in file ($text will be in utf8)
 …
     # do plugin specific processing of doc_obj
+    $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
+    # add text
+    $doc_obj->add_utf8_text ($cursection, $text);
+    return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));
     # do any automatic metadata extraction
 …
+}
+# returns undef if file is rejected by the plugin
 sub process {
     my $self = shift (@_);
 …
     die "Basplug::process function must be implemented in sub-class\n";
+    return undef; # never gets here
+}
 …
     $$textref = "";
-    my $encoding = "";
-    if ($self->{'input_encoding'} =~ /^(Latin1|iso_8859_1)$/) {
-    $encoding = "iso_8859_1";
-    } elsif ($self->{'input_encoding'} =~ /^(Arabic|windows_1256)$/) {
-    $encoding = "windows_1256";
-    } else {
-    $encoding = $self->{'input_encoding'};
+    }
     open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n";
     if ($encoding eq "ascii") {
+    if ($self->{'input_encoding'} eq "ascii") {
     undef $/;
     $$textref = <FILE>;
 …
     my $reader = new multiread();
     $reader->set_handle ('BasPlug::FILE');
     $reader->set_encoding ($encoding);
+    $reader->set_encoding ($self->{'input_encoding'});
     $reader->read_file ($textref);
     if ($encoding eq "gb") {
+    if ($self->{'input_encoding'} eq "gb") {
         # segment the Chinese words
         $$textref = &cnseg::segment($$textref);

trunk/gsdl/perllib/plugins/EMAILPlug.pm

-              r1206
+              r1244
+}
+use strict;
 # Create a new EMAILPlug object with which to parse a file.
 …
 sub new {
     my ($class) = @_;
+    $self = new BasPlug ();
+    my $self = new BasPlug ("EMAILPlug", @_);
     return bless $self, $class;
+}
+# Is EMAILPlug recursive?  No.
+sub is_recursive {
+    return 0;
+}
+# Read a file and store its contents in a new document object.
+# First, we check to see if it is an email message we're dealing
+# with, then we extract the text and metadata, then we store
+# all this information.
+#
+# Returns: number of files processed or undef if it can't process
+# a file.  This plugin only processes one file at a time.
+sub read {
+sub get_default_process_exp {
     my $self = shift (@_);
+    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
+    #
+    # Check that we're dealig with a valid mail file
+    #
+    # Make sure file exists
+    my $filename = &util::filename_cat($base_dir, $file);
+    return undef unless (-e $filename);
+    return undef unless ($filename =~ /\d+(\.email)?$/);
+    # Read the text and make sure it is an email message
+    open (FILE, $filename) || die "EMAILPlug::read - can't open $filename\n";
+    my @text = <FILE>;
+    my $text = join("", @text);
+    return undef unless (($text =~ /From:/) || ($text =~ /To:/));
+    print STDERR "EMAILPlug: processing $filename\n" if $processor->{'verbosity'};
+    return q^\d+(\.email)?$^;
+}
+# do plugin specific processing of doc_obj
+sub process {
+    my $self = shift (@_);
+    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
+    # Check that we're dealing with a valid mail file
+    return undef unless (($$textref =~ /From:/) || ($$textref =~ /To:/));
+    print STDERR "EMAILPlug: processing $file\n"
+    if $self->{'verbosity'} > 1;
+    my $cursection = $doc_obj->get_top_section();
+    #
 …
     # Separate header from body of message
     my $Headers = $text;
+    my $Headers = $$textref;
     $Headers =~ s/\n\n.*//s;
     $text = substr $text, (length $Headers);
+    $$textref = substr $$textref, (length $Headers);
     # Extract basic metadata from header
 …
+    #
+    # Create a new document object
+    #
+    my $doc_obj = new doc ($file, "indexed_doc");
+    my $cursection = $doc_obj->get_top_section();
+    # Add specilised metadata
+    # Add extracted metadata to document object
     foreach my $name (keys %raw) {
     $value = $raw{$name};
 …
         $value = "No $name field";
+    }
     $doc_obj->add_metadata ($cursection, $name, $value);
+    $doc_obj->add_utf8_metadata ($cursection, $name, $value);
+    }
 …
     $Headers = &text_into_html($Headers);
     $Headers = "No headers" unless ($Headers =~ /\w/);
+    $doc_obj->add_metadata ($cursection, "Headers", $Headers);
+    # Add document text
+    $text = &text_into_html($text);
+    $text = "No message" unless ($text =~ /\w/);
+    $doc_obj->add_text ($cursection, $text);
+    # Add the OID - that is, the big HASH value used as a unique ID
+    $doc_obj->set_OID ();
+    # Process the document
+    $processor->process($doc_obj);
+    # Return the number of documents processed
+    return 1;
+    $doc_obj->add_utf8_metadata ($cursection, "Headers", $Headers);
+    # Add text to document object
+    $$textref = &text_into_html($$textref);
+    $$textref = "No message" unless ($$textref =~ /\w/);
+    $doc_obj->add_utf8_text($cursection, $$textref);
+    return 1;
+}
 …
     my ($text) = @_;
     # Convert problem charaters into HTML symbols
+    # Convert problem characters into HTML symbols
     $text =~ s/&/&amp;/go;
     $text =~ s/</&lt;/go;
 …
 # Perl packages have to return true if they are run.
 ;

trunk/gsdl/perllib/plugins/GMLPlug.pm

-              r1010
+              r1244
+}
+use strict;
 sub new {
     my ($class) = @_;
     $self = new BasPlug ();
+    my $self = new BasPlug ("GMLPlug", @_);
     return bless $self, $class;
+}
+sub is_recursive {
+sub get_default_process_exp {
     my $self = shift (@_);
+    return 0; # this is not a recursive plugin
+}
+sub _unescape_text {
+    my ($text) = @_;
+    # special characters in the gml encoding
+    $text =~ s/&lt;/</g;
+    $text =~ s/&gt;/>/g;
+    $text =~ s/&quot;/\"/g;
+    $text =~ s/&amp;/&/g; # this has to be last...
+    return $text;
+    return q^(?i)\.gml(\.gz)?$^;
+}
 …
     my $self = shift (@_);
     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
+    my $fullname = &util::filename_cat ($base_dir, $file);
+    # see if this is a gml book
+    return undef unless (-f $fullname && $fullname =~ /\.gml(\.gz)?$/io);
+    my ($parent_dir, $gz) = $fullname =~ /^(.*?)[\/\\][^\/\\]+.gml(\.gz)?$/io;
+    if (defined $gz && $gz =~ /\.gz/io) {
+    my $filename = &util::filename_cat($base_dir, $file);
+    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
+    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
+    return undef;
+    }
+    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
+    print STDERR "GMLPlug: processing $file\n";
+    my $parent_dir = $file;
+    $parent_dir =~ s/[^\\\/]*$//;
+    $parent_dir = &util::filename_cat ($base_dir, $parent_dir);
+    # all this gzip stuff should one day be replaced by a gzip/bzip/zip/tar
+    # handling plugin
+    my $gz = 0;
+    if ($file =~ /\.gz$/i) {
     $gz = 1;
-    } else {
-    $gz = 0;
+    }
+    print STDERR "GMLPlug: processing $file\n";
+    # read in the document
+    # read in the document - input is assumed throughout this plugin to already be utf8
     if ($gz) {
     if (!open (INFILE, "zcat $fullname |")) {
         print STDERR "GMLPlug::read - zcat couldn't read $fullname\n";
         return undef;
+    if (!open (INFILE, "zcat $filename |")) {
+        print STDERR "GMLPlug::read - zcat couldn't read $filename\n";
+        return 0;
+    }
     } else {
     if (!open (INFILE, $fullname)) {
         print STDERR "GMLPlug::read - couldn't read $fullname\n";
         return undef;
+    if (!open (INFILE, $filename)) {
+        print STDERR "GMLPlug::read - couldn't read $filename\n";
+        return 0;
+    }
+    }
 …
     my $no_docs = 0;
-#    my $src_filename = ""; #### don't appear to use this anymore - not sure if that's right
     while (1) {
 …
         } else {
             print STDERR "GMLPlug::read - error in file $fullname\n";
+            print STDERR "GMLPlug::read - error in file $filename\n";
             print STDERR "text: \"$gml\"\n";
             last;
 …
         last if $section eq ""; # back to top level again (more than one document in gml file)
         $section = $doc_obj->get_parent_section ($section);
     } #while (1) section level
+    } # while (1) section level
     # add the associated files
     $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
+    my $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
     my ($assoc_file_info, $afile);
     foreach $assoc_file_info (@$assoc_files) {
 …
     $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
+    # assume the document has an OID
+    # do any automatic metadata extraction
+    $self->auto_extract_metadata ($doc_obj);
+    # assume the document has an OID already
     # process the document
 …
     last if ($maxdocs > -1 && $no_docs >= $maxdocs);
     last unless defined $gml && $gml =~ /\w/;
     } #while(1) document level
+    } # while(1) document level
     return $no_docs; # no of docs processed
+}
+sub _unescape_text {
+    my ($text) = @_;
+    # special characters in the gml encoding
+    $text =~ s/&lt;/</g;
+    $text =~ s/&gt;/>/g;
+    $text =~ s/&quot;/\"/g;
+    $text =~ s/&amp;/&/g; # this has to be last...
+    return $text;
+}
 ;

trunk/gsdl/perllib/plugins/HBPlug.pm

-              r1020
+              r1244
 ###########################################################################
+# plugin which process an HTML book directory
+# plugin which processes an HTML book directory
+# This plugin is used by the Humanity Library collections and does not handle
+# input encodings other than ascii or extended ascii
+# this code is kind of ugly and could no doubt be made to run faster, by leaving
+# it in this state I hope to encourage people to make their collections use
+# HBSPlug instead ;-)
+# Use HBSPlug if creating a new collection and marking up files like the
+# Humanity Library collections. HBSPlug accepts all input encodings but
+# expects the marked up files to be cleaner than those used by the
+# Humanity Library collections
 package HBPlug;
-use plugin;
 use ghtml;
 use BasPlug;
 use util;
-use lang;
 use doc;
-use cfgread;
 …
 sub new {
     my ($class) = @_;
     $self = new BasPlug ();
+    my $self = new BasPlug ("HBPlug", @_);
     return bless $self, $class;
+}
+sub is_recursive {
+    my $self = shift (@_);
+    return 0; # this is not a recursive plugin
+}
+sub init {
+    my $self = shift (@_);
+    my ($verbosity) = @_;
+    $self->BasPlug::init();
+    # this plugin only handles ascii encodings
+    if ($self->{'input_encoding'} !~ /^(iso_8859_1|ascii)$/) {
+    die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" .
+        $self->{'input_encoding'} . " is not an acceptable input_encoding value\n";
+    }
+}
+# this is included only to prevent warnings being printed out
+# from BasPlug::init. The process_exp is not used by this plugin
+sub get_default_process_exp {
+    my $self = shift (@_);
+    return "This plugin does not use a process_exp\n";
+}
 sub HB_read_html_file {
 …
     my $foundbody = 0;
     $self->HB_gettext (\$foundbody, $text, FILE);
+    $self->HB_gettext (\$foundbody, $text, "FILE");
     close FILE;
 …
     $foundbody = 1;
     open (FILE, $htmlfile) || return;
     $self->HB_gettext (\$foundbody, $text, FILE);
+    $self->HB_gettext (\$foundbody, $text, "FILE");
     close FILE;
+    }
 …
+}
+# if input_encoding is ascii we can call add_utf8_metadata
+# directly but if it's iso_8859_1 (the default) we need to call
+# add_metadata so that the ascii2utf8 conversion is done first
+# this should speed things up a little if processing an ascii only
+# document with input_encoding set to ascii
+sub HB_add_metadata {
+    my $self = shift (@_);
+    my ($doc_obj, $cursection, $field, $value) = @_;
+    if ($self->{'input_encoding'} eq "ascii") {
+    $doc_obj->add_utf8_metadata ($cursection, $field, $value);
+    } else {
+    $doc_obj->add_metadata ($cursection, $field, $value);
+    }
+}
 # return number of files processed, undef if can't process
 …
     # add metadata for top level of document
     foreach $field (keys(%$metadata)) {
+    foreach my $field (keys(%$metadata)) {
     # $metadata->{$field} may be an array reference
     if (ref ($metadata->{$field}) eq "ARRAY") {
         map {
         $doc_obj->add_metadata ($cursection, $field, $_);
+        $self->HB_add_metadata ($doc_obj, $cursection, $field, $_);
         } @{$metadata->{$field}};
     } else {
         $doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
+        $self->HB_add_metadata ($doc_obj, $cursection, $field, $metadata->{$field});
+    }
+    }
 …
         # add the metadata to this section
         $doc_obj->add_metadata ($cursection, "Title", $title);
+        $self->HB_add_metadata ($doc_obj, $cursection, "Title", $title);
         # clean up the section html
 …
         # add the text for this section
+        $doc_obj->add_text ($cursection, $sectiontext);
+        if ($self->{'input_encoding'} eq "ascii") {
+        $doc_obj->add_utf8_text ($cursection, $sectiontext);
+        } else {
+        $doc_obj->add_text ($cursection, $sectiontext);
+        }
     } else {
         print STDERR "WARNING - leftover text\n" , $self->shorten($html),

trunk/gsdl/perllib/plugins/HBSPlug.pm

-              r1235
+              r1244
 # processing of html links or any other HTMLPlug type stuff is done).
+# expects input files to have a .hb file extension
+# expects input files to have a .hb file extension by default (this can be
+# changed by adding a -process_exp option)
 # a file with the same name as the hb file but a .jpg extension is
 # taken as the cover image
+# taken as the cover image (jpg files are blocked by this plugin)
 # HBSPlug is a simplification (and extension of) the HBPlug used
 …
 sub new {
     my ($class) = @_;
     my $self = new BasPlug (@_);
+    my $self = new BasPlug ("HBSPlug", @_);
     return bless $self, $class;
+}
 sub is_recursive {
+sub get_default_block_exp {
     my $self = shift (@_);
+    return 0; # this is not a recursive plugin
+}
+# return number of files processed, undef if can't process
+# Note that $base_dir might be "" and that $file might
+# include directories
+sub read {
+    return q^\.jpg$^;
+}
+sub get_default_process_exp {
     my $self = shift (@_);
+    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
+    return q^(?i)\.hb$^;
+}
+# do plugin specific processing of doc_obj
+sub process {
+    my $self = shift (@_);
+    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
+    print STDERR "HBSPlug: processing $file\n"
+    if $self->{'verbosity'} > 1;
+    my $cursection = $doc_obj->get_top_section();
     my $filename = &util::filename_cat($base_dir, $file);
 …
     $absdir =~ s/[^\/\\]*$//;
-    return 0 if ($filename =~ /\.jpg$/i);
-    return undef unless ($filename =~ /\.hb$/i && (-e $filename));
-    print STDERR "HBSPlug: processing $filename\n" if $processor->{'verbosity'};
-    # create a new document
-    my $doc_obj = new doc ($file, "indexed_doc");
-    my $cursection = $doc_obj->get_top_section();
     # add the cover image
     my $coverimage = $filename;
     $coverimage =~ s/\.hb/\.jpg/i;
+    $coverimage =~ s/\.[^\.]*$/\.jpg/i;
     $doc_obj->associate_file($coverimage, "cover.jpg", "image/jpeg");
-    # add metadata for top level of document
-    $self->extra_metadata ($doc_obj, $cursection, $metadata);
-    # read in HTML file ($text will be in utf8)
-    my $text = "";
-    $self->read_file ($filename, \$text);
     my $title = "";
     # remove any leading rubbish
     $text =~ s/^.*?(<<TOC)/$1/ios;
+    $$textref =~ s/^.*?(<<TOC)/$1/ios;
     my $curtoclevel = 1;
     my $firstsection = 1;
     my $toccount = 0;
     while ($text =~ /\w/) {
     $text =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/ios;
+    while ($$textref =~ /\w/) {
+    $$textref =~ s/^<<TOC(\d+)>>([^\n]*)\n(.*?)(<<TOC|\Z)/$4/ios;
     my $toclevel = $1;
     my $metadata = $2;
 …
     $firstsection = 0;
+    $text =~ s/^\s+//s;
+    }
+    # add OID
+    $doc_obj->set_OID ();
+    # process the document
+    $processor->process($doc_obj);
+    return 1; # processed the file
+    $$textref =~ s/^\s+//s;
+    }
+    return 1;
+}

trunk/gsdl/perllib/plugins/HTMLPlug.pm

-              r1243
+              r1244
 sub print_usage {
-    print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
     print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
     print STDERR "  options:\n";
 …
     print STDERR "                          Use `H1` to get the text inside the first <H1> and </H1> tags in the text.\n";
     print STDERR "   -w3mir                 Set if w3mir was used to generate input file structure.\n";
-    print STDERR "                          w3mir \n";
     print STDERR "   -assoc_files           Perl regular expression of file extensions to associate with\n";
     print STDERR "                          html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf)$'\n";
+    print STDERR "                          html documents. Defaults to '(?i)\.(jpe?g|gif|png|css|pdf)\$'\n";
     print STDERR "   -rename_assoc_files    Renames files associated with documents (e.g. images). Also\n";
     print STDERR "                          creates much shallower directory structure (useful when creating\n";
 …
              q^rename_assoc_files^, \$self->{'rename_assoc_files'},
              "allow_extra_options")) {
+    print STDERR "\nIncorrect options passed to HTMLPlug, check your collect.cfg configuration file\n";
     &print_usage();
     die "\n";
 …
     $$textref =~ s/(<img[^>]*?src\s*=\s*\"?)([^\">\s]+)(\"?[^>]*>)/
     $self->replace_images ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
+    # add text to document object
+    $doc_obj->add_utf8_text($cursection, "<pre>\n$$textref\n</pre>");
+    return 1;
+}

trunk/gsdl/perllib/plugins/IndexPlug.pm

-              r809
+              r1244
 use plugin;
 use BasPlug;
-use lang;
 use doc;
 use util;
 …
+}
+use strict;
 sub new {
     my ($class) = @_;
     $self = new BasPlug ();
+    my $self = new BasPlug ("IndexPlug", @_);
     return bless $self, $class;
 …
     return 1;
+}
 # return number of files processed, undef if can't process
 …
     # process each document
     my $count = 0;
     foreach $docfile (keys (%$list)) {
+    foreach my $docfile (keys (%$list)) {
     last if ($maxdocs != -1 && $count >= $maxdocs);
     $metadata = {}; # at present we can do this as metadata
 …
     # note that $list->{$docfile} is an array reference
     if ($docfile !~ /key:/i) {
+        my $i = 0;
         for ($i = 0; $i < scalar (@{$list->{$docfile}}); $i ++) {
         if ($list->{$docfile}->[$i] =~ /^<([^>]+)>(.+)$/) {

trunk/gsdl/perllib/plugins/RecPlug.pm

-              r809
+              r1244
+}
+use strict;
 sub new {
     my ($class) = @_;
     my $self = new BasPlug ();
+    my $self = new BasPlug ("RecPlug", @_);
     $self->{'exclude_tail_dirs'} = []; # empty by default
 …
     my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
     foreach $etd ( @{$self->{'exclude_tail_dirs'}} )
+    foreach my $etd ( @{$self->{'exclude_tail_dirs'}} )
+    {
     return 0 if ($file =~ m/$etd/);
 …
     # see if this is a directory
     $dirname = &util::filename_cat ($base_dir, $file);
+    my $dirname = &util::filename_cat ($base_dir, $file);
     if (-d $dirname) {

trunk/gsdl/perllib/plugins/TEXTPlug.pm

-              r732
+              r1244
 ###########################################################################
+# creates simple single-level document from .txt or .text files
+# (case-insensitive match on filenames). Adds Title metadata
+# of first 100 characters found.
+# creates simple single-level document. Adds Title metadata
+# of first line of text (up to 100 characters long).
 package TEXTPlug;
 use BasPlug;
-use sorttools;
 sub BEGIN {
 …
+}
+use strict;
 sub new {
     my ($class) = @_;
     $self = new BasPlug ();
+    my $self = new BasPlug ("TEXTPlug", @_);
     return bless $self, $class;
+}
 sub is_recursive {
+sub get_default_process_exp {
     my $self = shift (@_);
     return 0; # this is not a recursive plugin
+    return q^(?i)\.te?xt$^;
+}
+# return number of files processed, undef if can't process
+# Note that $base_dir might be "" and that $file might
+# include directories
+sub read {
+# do plugin specific processing of doc_obj
+sub process {
     my $self = shift (@_);
+    my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_;
+    my $filename = &util::filename_cat($base_dir, $file);
+    return undef unless ($filename =~ /\.(te?xt(\.gz)?)$/i && (-e $filename));
+    my $gz = 0;
+    if (defined $2) {
+    $gz = $2;
+    $gz = 1 if ($gz =~ /\.gz/i);
+    }
+    print STDERR "TEXTPlug: processing $filename\n" if $processor->{'verbosity'};
+    # create a new document
+    my $doc_obj = new doc ($file, "indexed_doc");
+    if ($gz) {
+    open (FILE, "zcat $filename |") || die "TEXTPlug::read - zcat can't open $filename\n";
+    } else {
+    open (FILE, $filename) || die "TEXTPlug::read - can't open $filename\n";
+    }
+    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
+    print STDERR "TEXTPlug: processing $file\n"
+    if $self->{'verbosity'} > 1;
     my $cursection = $doc_obj->get_top_section();
+    my $text = "";
+    my $line = "";
+    my $foundtitle = 0;
+    # don't need to get title if it has been passed
+    # in from another plugin
+    if (defined $metadata->{'Title'}) {
+    $foundtitle = 1;
+    }
+    while (defined ($line = <FILE>)) {
+    # use first line as title (or first 100 characters if it's long)
+    if (!$foundtitle && length($line) > 5) {
+        my $title = "";
+        if (length($line) > 100) {
+        $title = substr ($line, 0, 100);
+        } else {
+        $title = $line;
+        }
+        $doc_obj->add_metadata ($cursection, "Title", $title);
+        $foundtitle = 1;
+    # get title metadata
+    # (don't need to get title if it has been passed
+    # in from another plugin)
+    if (!defined $metadata->{'Title'}) {
+    my ($title) = $$textref =~ /^([^\n]*)/;
+    if (length($title) > 100) {
+        $title = substr ($title, 0, 100);
+    }
     $text .= $line;
+    $doc_obj->add_utf8_metadata ($cursection, "Title", $title);
+    }
+    $doc_obj->add_text ($cursection, "<pre>\n$text\n</pre>");
+    # insert preformat tags and add text to document object
+    $doc_obj->add_utf8_text($cursection, "<pre>\n$$textref\n</pre>");
+    foreach $field (keys(%$metadata)) {
+    # $metadata->{$field} may be an array reference
+    if (ref ($metadata->{$field}) eq "ARRAY") {
+        map {
+        $doc_obj->add_metadata ($cursection, $field, $_);
+        } @{$metadata->{$field}};
+    } else {
+        $doc_obj->add_metadata ($cursection, $field, $metadata->{$field});
+    }
+    }
+    # add OID
+    $doc_obj->set_OID ();
+    # process the document
+    $processor->process($doc_obj);
+    return 1; # processed the file
+    return 1;
+}

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: