Changeset 4744

trunk/gsdl/perllib/plugins/ArcPlug.pm

-              r3540
+              r4744
+}
+my $options =
+{   'name'     => "ArcPlug",
+    'desc'     => "Plugin which recurses through an archives.inf file (i.e. the file generated in the archives directory when an import is done), processing each file it finds.",
+    'inherits' => "Yes" };
+my $options = { 'name'     => "ArcPlug",
+        'desc'     => "Plugin which recurses through an archives.inf file (i.e. the file generated in the archives directory when an import is done), processing each file it finds.",
+        'inherits' => "Yes" };
 sub new {

trunk/gsdl/perllib/plugins/BasPlug.pm

-              r3834
+              r4744
 my $unicode_list =
 [ { 'name' => "auto",
     'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." } ,
   { 'name' => "ascii",
     'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." },
   { 'name' => "utf8",
     'desc' => "either utf8 or unicode -- automatically detected." },
   { 'name' => "unicode",
     'desc' => "just unicode" } ];
+    [ { 'name' => "auto",
+    'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." } ,
+      { 'name' => "ascii",
+    'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." },
+      { 'name' => "utf8",
+    'desc' => "either utf8 or unicode -- automatically detected." },
+      { 'name' => "unicode",
+    'desc' => "just unicode" } ];
 my $arguments =
     [ { 'name' => "process_exp",
     'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     'type' => "string",
     'deft' => "",
 …
     'reqd' => "no" },
       { 'name' => "input_encoding",
     'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8. The default input_encoding is 'auto'.",
+    'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8.",
     'type' => "enum",
     'list' => $unicode_list,
 …
     'deft' => "auto" } ,
       { 'name' => "default_encoding",
+    'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone.  The default is iso_8859_1.",
+    'type' => "flag",
+    'reqd' => "no" },
+    'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone.",
+    'type' => "enum",
+    'reqd' => "no",
+        'deft' => "utf8" },
       { 'name' => "extract_language",
     'desc' => "Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.",
 …
     'desc' => "The maximum historical date to be used as metadata (in a Common Era date, such as 1950).",
     'type' => "int",
+    'deft' => (localtime)[5]+1900,
     'reqd' => "no"},
       { 'name' => "maximum_century",
     'desc' => "The maximum named century to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century).",
     'type' => "int",
+    'deft' => "-1",
     'reqd' => "no" },
       { 'name' => "no_bibliography",
 …
+}
+# sub print_usage_new
+# {
+# }
+sub print_usage_new    # Prints usage text to STDERR for the last entry of the option_list held by $self, then recurses for the remaining entries.
+{   # Takes only the invocant. All output goes to STDERR. Every call pops (removes) one entry from the option_list array.
+    local $self = shift(@_);
+    local $optionlist = $self->{'option_list'};    # same array ref as stored in $self, so the pop below shortens that list
+    local $pluginoptions = pop(@$optionlist);
+    return if (!defined($pluginoptions));    # nothing left to print; this is what ends the recursion at the bottom
+    local $pluginname = $pluginoptions->{'name'};
+    local $pluginargs = $pluginoptions->{'args'};
+    # Print the usage header line; [options] is appended only when the entry has an args list
+    print STDERR " usage: plugin $pluginname";
+    if (defined($pluginargs)) {
+    print STDERR " [options]";
+    }
+    print STDERR "\n\n";
+    # Display the plugin options, if there are some
+    if (defined($pluginargs)) {
+    # First pass: find the length of the longest option string (name, plus type decoration for non-flags)
+    local $maxlength = 0;
+    foreach $option (@$pluginargs) {
+        local $optionname = $option->{'name'};
+        local $optiontype = $option->{'type'};
+        local $optionstringlength = length($optionname);
+        if ($optiontype ne "flag") {
+        $optionstringlength = $optionstringlength + 3 + length($optiontype);    # 3 extra characters: space, opening and closing angle bracket
+        }
+        # Remember the longest
+        if ($optionstringlength > $maxlength) {
+        $maxlength = $optionstringlength;
+        }
+    }
+    # Calculate the column offset of the option descriptions: 3 for the leading indent and dash, 2 more added after the longest option
+    local $optiondescoffset = 3 + $maxlength + 2;
+    # Second pass: display the plugin options
+    print STDERR " options:\n";
+    foreach $option (@$pluginargs) {
+        # Display option name
+        local $optionname = $option->{'name'};
+        print STDERR "  -$optionname";
+        local $optionstringlength = 3 + length($optionname);    # characters printed so far on this line
+        # Display option type, if the option is not a flag
+        local $optiontype = $option->{'type'};
+        if ($optiontype ne "flag") {
+        print STDERR " <$optiontype>";
+        $optionstringlength = $optionstringlength + (2 + length($optiontype) + 1);
+        }
+        # Display the option description, wrapped at column 80 and aligned at the description offset
+        local $optiondesc = $option->{'desc'};
+        &display_text_in_column($optiondesc, $optiondescoffset, $optionstringlength, 80);
+        # Show the default value for the option, if there is one
+        local $optiondefault = $option->{'deft'};
+        if (defined($optiondefault)) {
+        print STDERR " " x $optiondescoffset;
+        print STDERR "Default: " . $optiondefault . "\n";
+        }
+        # If the option has a list of possible values, display each value name followed by its description
+        local $optionvalueslist = $option->{'list'};
+        if (defined($optionvalueslist)) {
+        print STDERR "\n";
+        foreach $optionvalue (@$optionvalueslist) {
+            local $optionvaluename = $optionvalue->{'name'};
+            print STDERR " " x $optiondescoffset;
+            print STDERR "$optionvaluename:";
+            local $optionvaluedesc = $optionvalue->{'desc'};
+            &display_text_in_column($optionvaluedesc, ($optiondescoffset + 2),
+                        $optiondescoffset + length($optionvaluename), 80);
+        }
+        }
+        # Special case for 'input_encoding': also list every key of the hash referenced by $encodings::encodings
+        if ($optionname =~ m/^input_encoding$/i) {
+        my $e = $encodings::encodings;
+        foreach $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {    # ordered by each entry's name field, not by key
+            local $encodingname = $enc;
+            print STDERR " " x $optiondescoffset;
+            print STDERR "$enc:";
+            local $encodingdesc = $e->{$enc}->{'name'};
+            &display_text_in_column($encodingdesc, ($optiondescoffset + 2),
+                        $optiondescoffset + length($encodingname), 80);
+        }
+        }
+        # Add a blank line to separate options
+        print STDERR "\n";
+    }
+    }
+    # If the plugin inherits from another, do the parent now (presumably the parent entries sit earlier in option_list -- confirm against the constructors)
+    if (defined($optionlist)) {
+    $self->print_usage_new();
+    }
+}
+sub display_text_in_column    # Prints $text to STDERR one word at a time, wrapping onto a new indented line when the column end would be reached.
+{   # Arguments in order: text, column where wrapped lines begin, number of characters already printed on the first line, column end.
+    local ($text, $columnbeg, $firstlineoffset, $columnend) = @_;
+    # Spaces are put *before* words, so treat the column beginning as 1 smaller than it is
+    $columnbeg = $columnbeg - 1;
+    # Add some padding (if needed) for the first line
+    local $linelength = $columnbeg;
+    if ($firstlineoffset < $columnbeg) {
+    print STDERR " " x ($columnbeg - $firstlineoffset);
+    }
+    else {
+    $linelength = $firstlineoffset;    # first line is already at or past the column start, so count from where it stands
+    }
+    # Break the text into words (split on single spaces), and display one at a time
+    local @words = split(/ /, $text);
+    foreach $word (@words) {
+    # Unescape '<' and '>' characters
+    $word =~ s/&lt;/</g;
+    $word =~ s/&gt;/>/g;
+    # If printing this word would exceed the column end, start a new line
+    if (($linelength + length($word)) >= $columnend) {
+        print STDERR "\n";
+        print STDERR " " x $columnbeg;
+        $linelength = $columnbeg;
+    }
+    # Write the word, preceded by its separating space
+    print STDERR " " . $word;
+    $linelength = $linelength + (length($word) + 1);
+    }
+    print STDERR "\n";    # always finish the text with a newline
+}
 sub print_general_usage {
     my ($plugin_name) = @_;
 …
              q^process_exp/.*/^, \$self->{'process_exp'},
              q^block_exp/.*/^, \$self->{'block_exp'},
+             q^extract_language^, \$self->{'extract_language'},
              q^extract_acronyms^, \$self->{'extract_acronyms'},
              q^extract_keyphrases^, \$self->{'kea'}, #with extra options
              q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options
+             q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
+             q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
              qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
              qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},

trunk/gsdl/perllib/plugins/BibTexPlug.pm

-              r3587
+              r4744
 my $arguments =
 [ {     'name' => "process_exp",
     'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     'type' => "string",
     'reqd' => "no" ,
+    'deft' => q^(?i)\.bib$^ }
+];
+my $options =
+{   'name'     => "BibTexPlug",
+    'desc'     => "BibTexPlug reads bibliography files in BibTex format. BibTexPlug creates a document object for every reference in the file. It is a subclass of SplitPlug, so if there are multiple records, all are read.",
+    'inherits' => "Yes",
+    'args'     => $arguments };
+    'deft' => q^(?i)\.bib$^ } ];
+my $options = { 'name'     => "BibTexPlug",
+        'desc'     => "BibTexPlug reads bibliography files in BibTex format. BibTexPlug creates a document object for every reference in the file. It is a subclass of SplitPlug, so if there are multiple records, all are read.",
+        'inherits' => "Yes",
+        'args'     => $arguments };
 # This plugin processes files with the suffix ".bib"

trunk/gsdl/perllib/plugins/BookPlug.pm

-              r3540
+              r4744
 my $arguments =
 [ {     'name' => "process_exp",
     'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     'type' => "string",
     'reqd' => "no",
     'deft' => q^(?i)\.hb$^} ,
 {   'name' => "block_exp",
+    'deft' => &get_default_process_exp() },
+      { 'name' => "block_exp",
     'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
     'type' => "string",
     'reqd' => "no",
+    'deft' => q^\.jpg$^}
+];
+my $options =
+{   'name'     => "BookPlug",
+    'desc'     => "Creates multi-level document from document containing &lt;&lt;TOC&gt;&gt; level tags. Metadata for each section is taken from any other tags on the same line as the &lt;&lt;TOC&gt;&gt;. e.g. &lt;&lt;Title&gt;&gt;xxxx&lt;&lt;/Title&gt;&gt; sets Title metadata. Everything else between TOC tags is treated as simple html (i.e. no processing of html links or any other HTMLPlug type stuff is done). Expects input files to have a .hb file extension by default (this can be changed by adding a -process_exp option a file with the same name as the hb file but a .jpg extension is taken as the cover image (jpg files are blocked by this plugin). BookPlug is a simplification (and extension) of the HBPlug used by the Humanity Library collections. BookPlug is faster as it expects the input files to be cleaner (The input to the HDL collections contains lots of excess html tags around &lt;&lt;TOC&gt;&gt; tags, uses &lt;&lt;I&gt;&gt; tags to specify images, and simply takes all text between &lt;&lt;TOC&gt;&gt; tags and start of text to be Title metadata). If you're marking up documents to be displayed in the same way as the HDL collections, use this plugin instead of HBPlug.",
+    'inherits' => "Yes",
+    'args'     => $arguments };
+    'deft' => &get_default_block_exp() } ];
+my $options = { 'name'     => "BookPlug",
+        'desc'     => "Creates multi-level document from document containing &lt;&lt;TOC&gt;&gt; level tags. Metadata for each section is taken from any other tags on the same line as the &lt;&lt;TOC&gt;&gt;. e.g. &lt;&lt;Title&gt;&gt;xxxx&lt;&lt;/Title&gt;&gt; sets Title metadata. Everything else between TOC tags is treated as simple html (i.e. no processing of html links or any other HTMLPlug type stuff is done). Expects input files to have a .hb file extension by default (this can be changed by adding a -process_exp option a file with the same name as the hb file but a .jpg extension is taken as the cover image (jpg files are blocked by this plugin). BookPlug is a simplification (and extension) of the HBPlug used by the Humanity Library collections. BookPlug is faster as it expects the input files to be cleaner (The input to the HDL collections contains lots of excess html tags around &lt;&lt;TOC&gt;&gt; tags, uses &lt;&lt;I&gt;&gt; tags to specify images, and simply takes all text between &lt;&lt;TOC&gt;&gt; tags and start of text to be Title metadata). If you're marking up documents to be displayed in the same way as the HDL collections, use this plugin instead of HBPlug.",
+        'inherits' => "Yes",
+        'args'     => $arguments };
 sub new {

trunk/gsdl/perllib/plugins/ConvertToPlug.pm

-              r3720
+              r4744
 my $convert_to_list =
+[ { 'name' => "html",
+    'desc' => "" },
+{   'name' => "text",
+    'desc' => "" }
+];
+    [ { 'name' => "html",
+    'desc' => "HTML format" },
+      { 'name' => "text",
+    'desc' => "Plain text format" } ];
 my $arguments =
 [ {     'name' => "convert_to",
     'desc' => "Plugin converts to TEXT or HTML (default html).",
+    [ { 'name' => "convert_to",
+    'desc' => "Plugin converts to TEXT or HTML.",
     'type' => "enum",
     'reqd' => "no",
     'list' => $convert_to_list,
+    'deft' => "html"}
+];
+my $options =
+{  'name'     => "ConvertToPlug",
+   'desc'     => "The plugin is inherited by such plugins as WordPlug and PDFPlug. It facilitates the conversion of these document types to either HTML or TEXT by setting up variable that instruct ConvertToBasPlug how to work. It works by dynamically inheriting HTMLPlug or TEXTPlug based on the plugin argument 'convert_to'.  If the argument is not present, the default is to inherit HTMLPlug.",
+   'inherits' => "Yes",
+   'args'     => $arguments };
+    'deft' => "html" },
+      { 'name' => "use_strings",
+    'desc' => "If set, a simple strings function will be called to extract text if the conversion utility fails.",
+    'type' => "flag",
+    'reqd' => "no" } ];
+my $options = { 'name'     => "ConvertToPlug",
+        'desc'     => "The plugin is inherited by such plugins as WordPlug and PDFPlug. It facilitates the conversion of these document types to either HTML or TEXT by setting up variable that instruct ConvertToBasPlug how to work. It works by dynamically inheriting HTMLPlug or TEXTPlug based on the plugin argument 'convert_to'.  If the argument is not present, the default is to inherit HTMLPlug.",
+        'inherits' => "Yes",
+        'args'     => $arguments };
 …
     if (!parsargv::parse($args,
              q^extract_keyphrases^, \$newargs->{'kea'}, #with extra options
              q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'}, #no extra options
+             q^extract_keyphrases^, \$newargs->{'kea'}, #with extra options (undocumented)
+             q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'}, #no extra options (undocumented)
              q^convert_to/(html|text)/html^, \$newargs->{'generate_format'},
              q^use_strings^, \$newargs->{'use_strings'},
 …
 sub new {
     my $class = shift (@_);
+    if ($class eq "ConvertToPlug") {$class = shift (@_);}
+    # print "Class: " . $class . "\n";
+    # if ($class eq "ConvertToPlug") {$class = shift (@_);}
     my $self;
     # parsargv::parse might modify the list, so we do this by creating a copy

trunk/gsdl/perllib/plugins/ConvertToRogPlug.pm

-              r3737
+              r4744
     @ISA = ('RogPlug');
+}
+my $options = { 'name'     => "ConvertToRogPlug",
+        'desc'     => "A plugin that inherits from RogPlug.",
+        'inherits' => "Yes" };
 sub print_usage {
 …
 sub new {
     my $class = shift (@_);
+    if ($class eq "ConvertToRogPlug") {$class = shift (@_);}
+    # print "Class: " . $class . "\n";
+    # if ($class eq "ConvertToRogPlug") {$class = shift (@_);}
     my $self;
     # parsargv::parse might modify the list, so we do this by creating a copy
 …
     $self->{'convert_to'} = "Rog";
     $self->{'convert_to_ext'} = "rog";
+    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
+    my $option_list = $self->{'option_list'};
+    push( @{$option_list}, $options );
     return bless $self, $class;

trunk/gsdl/perllib/plugins/DBPlug.pm

-              r4429
+              r4744
+}
+my $arguments = [ { 'name' => "process_exp",
+                          'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+                          'type' => "string",
+                          'deft' => q^(?i)\.dbi$^,
+                          'reqd' => "no" } ,
+                        { 'name' => "title_sub",
+                          'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PSPlug to remove \"Page 1\" etc from text used as the title.",
+                          'type' => "string",
+                          'reqd' => "no" }];
+my $arguments =
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
+    'type' => "string",
+    'deft' => &get_default_process_exp(),
+    'reqd' => "no" } ,
+      { 'name' => "title_sub",
+    'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PSPlug to remove \"Page 1\" etc from text used as the title.",
+    'type' => "string",
+    'deft' => "",
+    'reqd' => "no" }];
 my $options = { 'name'     => "DBPlug",
                  'desc'     => "Uses records from a database as documents.",
                      'inherits' => "yes",
                      'args'     => $arguments };
+        'desc'     => "Uses records from a database as documents.",
+        'inherits' => "yes",
+        'args'     => $arguments };
 sub print_usage {

trunk/gsdl/perllib/plugins/EMAILPlug.pm

-              r4224
+              r4744
 my $arguments =
 [ {     'name' => "process_exp",
     'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     'type' => "string",
     'reqd' => "no",
+    'deft' => q@([\\/]\d+|\.(mbx|email|eml))$@
+  },
+  {     'name' => "no_attachments",
+    'deft' => &get_default_process_exp() },
+      { 'name' => "no_attachments",
     'desc' => "Do not save message attachments.",
     'type' => "flag",
+    'reqd' => "no"
+  },
+  {     'name' => "block_exp",
+    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
+    'reqd' => "no" },
+      { 'name' => "split_exp",
+    'desc' => "A perl regular expression used to split files containing many messages into individual documents.",
     'type' => "string",
+    'reqd' => "no",
+    'deft' => q^^}
+];
+my $options =
+{   'name'     => "EMAILPlug",
+    'desc'     => "Email plug reads email files.  These are named with a simple number (i.e. as they appear in maildir folders) or with the extension .mbx (for mbox mail file format).\nDocument text: The document text consists of all the text after the first blank line in the document.\nMetadata (not Dublin Core!):\n\t\$Headers      All the header content\n\t\$Subject      Subject: header\n\t\$To           To: header\n\t\$From         From: header\n\t\$FromName     Name of sender (where available)\n\t\$FromAddr     E-mail address of sender\n\t\$DateText     Date: header\n\t\$Date         Date: header in GSDL format (eg: 19990924)",
+    'inherits' => "Yes",
+    'args'     => $arguments };
+    'deft' => "" } ];
+my $options = { 'name'     => "EMAILPlug",
+        'desc'     => "Email plug reads email files.  These are named with a simple number (i.e. as they appear in maildir folders) or with the extension .mbx (for mbox mail file format).\nDocument text: The document text consists of all the text after the first blank line in the document.\nMetadata (not Dublin Core!):\n\t\$Headers      All the header content\n\t\$Subject      Subject: header\n\t\$To           To: header\n\t\$From         From: header\n\t\$FromName     Name of sender (where available)\n\t\$FromAddr     E-mail address of sender\n\t\$DateText     Date: header\n\t\$Date         Date: header in GSDL format (eg: 19990924)",
+        'inherits' => "Yes",
+        'args'     => $arguments };
 # Create a new EMAILPlug object with which to parse a file.

trunk/gsdl/perllib/plugins/ExcelPlug.pm

-              r2990
+              r4744
+}
+my $options = { 'name'     => "ExcelPlug",
+        'desc'     => "A plugin for importing Microsoft Excel files.",
+        'inherits' => "Yes" };
 sub new {
     my $class = shift (@_);
 …
 #   $self->{'input_encoding'} = "utf8";
 #    }
+    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
+    my $option_list = $self->{'option_list'};
+    push( @{$option_list}, $options );
     return bless $self, $class;

trunk/gsdl/perllib/plugins/FOXPlug.pm

r3540	r4744
38	38	use unicode;
39	39	use cnseg;
40		use gb;
	40	# use gb;
41	41
42	42

trunk/gsdl/perllib/plugins/HBPlug.pm

r3542	r4744
73	73
74	74	$self->BasPlug::init($verbosity, $outhandle);
	75	$self->{'input_encoding'} = "iso_8859_1";
75	76
76	77	# this plugin only handles ascii encodings

trunk/gsdl/perllib/plugins/HTMLPlug.pm

-              r3708
+              r4744
+}
+my $arguments = [ { 'name' => "process_exp",
+            'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+            'type' => "string",
+            'deft' =>  q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php|\.cgi|.+\?.+=.*)$^ },
+          { 'name' => "block_exp",
+            'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
+            'type' => 'string',
+            'deft' =>  q^(?i)\.(gif|jpe?g|png|css)$^ },
+          { 'name' => "nolinks",
+            'desc' =>  "Don't make any attempt to trap links (setting this flag may improve speed of building/importing but any relative links within documents will be broken).",
+            'type' => "flag" },
+          { 'name' => "keep_head",
+            'desc' => "Don't remove headers from html files.",
+            'type' => "flag" },
+          { 'name' => "no_metadata",
+            'desc' => "Don't attempt to extract any metadata from files.",
+            'type' => "flag" },
+          { 'name' => "metadata_fields",
+            'desc' => "Comma separated list of metadata fields to attempt to extract. Defaults to 'Title'. Use 'tag&lt;tagname&gt;' to have the contents of the first &lt;tagname &gt; pair put in a metadata element called 'tagname'. Capitalise this as you want the metadata capitalised in Greenstone, since the tag extraction is case insensitive.",
+            'type' => "metadatum",
+            'deft' => "" },
+          { 'name' => "hunt_creator_metadata",
+            'desc' => "Find as much metadata as possible on authorship and place it in the 'Creator' field. Requires the -metadata_fields flag.",
+            'type' => "flag" },
+          { 'name' => "file_is_url",
+            'desc' => "Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure.",
+            'type' => "flag" },
+          { 'name' => "assoc_files",
+            'desc' => "Perl regular expression of file extensions to associate with html documents. Defaults to '(?i)\.(jpe?g|gif|png|css)\$'",
+            'type' => "string",
+            'deft' => q^(?i)\.(jpe?g|gif|png|css)\$^ },
+          { 'name' => "rename_assoc_files",
+            'desc' => "Renames files associated with documents (e.g. images). Also creates much shallower directory structure (useful when creating collections to go on cd-rom).",
+            'type' => "flag" } ,
+          { 'name' => "title_sub",
+            'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PDFPlug to remove \"Page 1\", etc from text used as the title.",
+            'type' => "string" } ,
+          { 'name' => "description_tags",
+            'desc' => "Split document into sub-sections where &lt;Section&gt; tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the &lt;Section&gt; tags. Also, '-keep_head' will have no effect when this option is set.",
+            'type' => "flag" } ];
+my $arguments =
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin.  For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
+    'type' => "string",
+    'deft' =>  &get_default_process_exp() },
+      { 'name' => "block_exp",
+    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
+    'type' => 'string',
+    'deft' =>  &get_default_block_exp() },
+      { 'name' => "nolinks",
+    'desc' =>  "Don't make any attempt to trap links (setting this flag may improve speed of building/importing but any relative links within documents will be broken).",
+    'type' => "flag" },
+      { 'name' => "keep_head",
+    'desc' => "Don't remove headers from html files.",
+    'type' => "flag" },
+      { 'name' => "no_metadata",
+    'desc' => "Don't attempt to extract any metadata from files.",
+    'type' => "flag" },
+      { 'name' => "metadata_fields",
+    'desc' => "Comma separated list of metadata fields to attempt to extract. Use 'tag&lt;tagname&gt;' to have the contents of the first &lt;tagname&gt; pair put in a metadata element called 'tagname'. Capitalise this as you want the metadata capitalised in Greenstone, since the tag extraction is case insensitive.",
+    'type' => "metadatum",
+    'deft' => "Title" },
+      { 'name' => "hunt_creator_metadata",
+    'desc' => "Find as much metadata as possible on authorship and place it in the 'Creator' field. Requires the -metadata_fields flag.",
+    'type' => "flag" },
+      { 'name' => "file_is_url",
+    'desc' => "Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure.",
+    'type' => "flag" },
+      { 'name' => "assoc_files",
+    'desc' => "Perl regular expression of file extensions to associate with html documents.",
+    'type' => "string",
+    'deft' => q^(?i)\.(jpe?g|gif|png|css)$^ },
+      { 'name' => "rename_assoc_files",
+    'desc' => "Renames files associated with documents (e.g. images). Also creates much shallower directory structure (useful when creating collections to go on cd-rom).",
+    'type' => "flag" },
+      { 'name' => "title_sub",
+    'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PDFPlug to remove \"Page 1\", etc from text used as the title.",
+    'type' => "string",
+    'deft' => "" },
+      { 'name' => "description_tags",
+    'desc' => "Split document into sub-sections where &lt;Section&gt; tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the &lt;Section&gt; tags. Also, '-keep_head' will have no effect when this option is set.",
+    'type' => "flag" } ];
 my $options = { 'name'     => "HTMLPlug",
 …
         'args'     => $arguments };
+sub print_usage {
+    print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
+    print STDERR "  options:\n";
+    print STDERR "   -nolinks               Don't make any attempt to trap links (setting this\n";
+    print STDERR "                          flag may improve speed of building/importing but\n";
+    print STDERR "                          any relative links within documents will be broken).\n";
+    print STDERR "   -keep_head             Don't remove headers from html files.\n";
+    print STDERR "   -no_metadata           Don't attempt to extract any metadata from files.\n";
+    print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to
+                          extract. Defaults to 'Title'.
+                          Use 'tag<tagname>' to have the contents of the first
+                          <tagname> pair put in a metadata element called
+                          'tagname'. Capitalise this as you want the metadata
+                          capitalised in Greenstone, since the tag extraction
+                          is case insensitive.\n";
+    print STDERR "   -hunt_creator_metadata Find as much metadata as possible on authorship and
+                          place it in the 'Creator' field. Requires the
+                          -metadata_fields flag.\n";
+    print STDERR "   -file_is_url           Set if input filenames make up url of original source
+                          documents e.g. if a web mirroring tool was used to
+                          create the import directory structure\n";
+    print STDERR "   -assoc_files           Perl regular expression of file extensions to
+                          associate with html documents.
+                          Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";
+    print STDERR "   -rename_assoc_files    Renames files associated with documents (e.g. images).
+                          Also creates much shallower directory structure
+                          (useful when creating collections to go on cd-rom).\n";
+    print STDERR "   -title_sub             Substitution expression to modify string stored as
+                          Title. Used by, for example, PDFPlug to remove
+                          \"Page 1\", etc from text used as the title.\n";
+    print STDERR "   -description_tags      Split document into sub-sections where <Section> tags
+                          occur. Note that by setting this option you
+                          implicitly set -no_metadata, as all metadata should
+                          be included within the <Section> tags (this is only
+                          true for documents that actually contain <Section> tags
+                          however). Also, '-keep_head' will have no effect when
+                          this option is set, regardless of whether a document
+                          contains Section tags.\n";
+}
+#  sub print_usage {
+#      print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
+#      print STDERR "  options:\n";
+#      print STDERR "   -nolinks               Don't make any attempt to trap links (setting this\n";
+#      print STDERR "                          flag may improve speed of building/importing but\n";
+#      print STDERR "                          any relative links within documents will be broken).\n";
+#      print STDERR "   -keep_head             Don't remove headers from html files.\n";
+#      print STDERR "   -no_metadata           Don't attempt to extract any metadata from files.\n";
+#      print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to
+#                            extract. Defaults to 'Title'.
+#                            Use 'tag<tagname>' to have the contents of the first
+#                            <tagname> pair put in a metadata element called
+#                            'tagname'. Capitalise this as you want the metadata
+#                            capitalised in Greenstone, since the tag extraction
+#                            is case insensitive.\n";
+#      print STDERR "   -hunt_creator_metadata Find as much metadata as possible on authorship and
+#                            place it in the 'Creator' field. Requires the
+#                            -metadata_fields flag.\n";
+#      print STDERR "   -file_is_url           Set if input filenames make up url of original source
+#                            documents e.g. if a web mirroring tool was used to
+#                            create the import directory structure\n";
+#      print STDERR "   -assoc_files           Perl regular expression of file extensions to
+#                            associate with html documents.
+#                            Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";
+#      print STDERR "   -rename_assoc_files    Renames files associated with documents (e.g. images).
+#                            Also creates much shallower directory structure
+#                            (useful when creating collections to go on cd-rom).\n";
+#      print STDERR "   -title_sub             Substitution expression to modify string stored as
+#                            Title. Used by, for example, PDFPlug to remove
+#                            \"Page 1\", etc from text used as the title.\n";
+#      print STDERR "   -description_tags      Split document into sub-sections where <Section> tags
+#                            occur. Note that by setting this option you
+#                            implicitly set -no_metadata, as all metadata should
+#                            be included within the <Section> tags (this is only
+#                            true for documents that actually contain <Section> tags
+#                            however). Also, '-keep_head' will have no effect when
+#                            this option is set, regardless of whether a document
+#                            contains Section tags.\n";
+#  }
 sub new {

trunk/gsdl/perllib/plugins/ImagePlug.pm

-              r4724
+              r4744
+}
+my $arguments = [ { 'name' => "process_exp",
+            'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+            'type' => "string",
+            'deft' => q^(?i)(\.jpe?g|\.gif|\.png|\.bmp|\.xbm|\.tif?f)$^,
+            'reqd' => "no" },
+          { 'name' => "noscaleup",
+            'desc' => "Don't scale up small images when making thumbnails.",
+            'type' => "flag",
+            'reqd' => "no" },
+          { 'name' => "thumbnailsize",
+            'desc' => "Make thumbnails of size nxn.",
+            'type' => "int",
+            'reqd' => "no" },
+          { 'name' => "thumbnailtype",
+            'desc' => "Make thumbnails in format 's'.",
+            'type' => "string",
+            'reqd' => "no" },
+          { 'name' => "screenviewsize",
+            'desc' => "If set, makes an image of size n for screen display and sets Screen, ScreenSize, ScreenWidth and ScreenHeight metadata.  By default it is not set.",
+            'type' => "int",
+            'reqd' => "no" },
+          { 'name' => "screenviewtype",
+            'desc' => "If -screenviewsize is set, this sets the screen display image type.  Defaults to jpg.",
+            'type' => "string",
+            'deft' => "jpg",
+            'reqd' => "no" },
+          { 'name' => "convertto",
+            'desc' => "Convert main image to.",
+            'type' => "string",
+            'reqd' => "no" },
+          { 'name' => "minimumsize",
+            'desc' => "Ignore images smaller than n bytes.",
+            'type' => "int",
+            'reqd' => "no" } ];
+my $arguments =
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i)\\.html?\$' matches all documents ending in .htm or .html (case-insensitive).",
+    'type' => "string",
+    'deft' => &get_default_process_exp(),
+    'reqd' => "no" },
+      { 'name' => "noscaleup",
+    'desc' => "Don't scale up small images when making thumbnails.",
+    'type' => "flag",
+    'reqd' => "no" },
+      { 'name' => "thumbnailsize",
+    'desc' => "Make thumbnails of size nxn.",
+    'type' => "int",
+    'deft' => "100",
+    'reqd' => "no" },
+      { 'name' => "thumbnailtype",
+    'desc' => "Make thumbnails in format 's'.",
+    'type' => "string",
+    'deft' => "gif",
+    'reqd' => "no" },
+      { 'name' => "screenviewsize",
+    'desc' => "If set, makes an image of size n for screen display and sets Screen, ScreenSize, ScreenWidth and ScreenHeight metadata.  By default it is not set.",
+    'type' => "int",
+    'deft' => "0",
+    'reqd' => "no" },
+      { 'name' => "screenviewtype",
+    'desc' => "If -screenviewsize is set, this sets the screen display image type.",
+    'type' => "string",
+    'deft' => "jpg",
+    'reqd' => "no" },
+      { 'name' => "converttotype",
+    'desc' => "Convert main image to format 's'.",
+    'type' => "string",
+    'deft' => "",
+    'reqd' => "no" },
+      { 'name' => "minimumsize",
+    'desc' => "Ignore images smaller than n bytes.",
+    'type' => "int",
+    'deft' => "100",
+    'reqd' => "no" } ];
 my $options = { 'name'     => "ImagePlug",

trunk/gsdl/perllib/plugins/MACROPlug.pm

-              r3856
+              r4744
+}
+my $arguments = [ { 'name' => "process_exp",
+            'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+            'type' => "string",
+            'deft' => q^(?i)\.dm$^,
+            'reqd' => "no" }];
+my $arguments =
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i)\\.html?\$' matches all documents ending in .htm or .html (case-insensitive).",
+    'type' => "string",
+    'deft' => &get_default_process_exp(),
+    'reqd' => "no" } ];
 my $options = { 'name'     => "MACROPlug",
 …
     push( @{$option_list}, $options );
     $self->{'lang_abbr'} = load_language_table();
+    # $self->{'lang_abbr'} = load_language_table();
     return bless $self, $class;

trunk/gsdl/perllib/plugins/MARCPlug.pm

-              r3508
+              r4744
     unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
+}
+my $arguments =
+    [ { 'name' => "metadata_mapping",
+    'desc' => "Name of file that includes mapping details from MARC values to Greenstone metadata names. Defaults to 'marctodc.txt' found in the site's etc directory.",
+    'type' => "string",
+    'deft' => "marctodc.txt",
+    'reqd' => "no" } ];
+my $options = { 'name'     => "MARCPlug",
+        'desc'     => "",
+        'inherits' => "Yes",
+        'args'     => $arguments };
 use MARC::Record;
 …
     $self->{'mm_file'} = $metadata_mapping; # relative to etc dir
+    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
+    my $option_list = $self->{'option_list'};
+    push( @{$option_list}, $options );
     return bless $self, $class;

trunk/gsdl/perllib/plugins/PDFPlug.pm

-              r4103
+              r4744
+}
+my $arguments = [ { 'name' => "process_exp",
+            'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+            'type' => "string",
+            'deft' => q^(?i)\.pdf$^,
+            'reqd' => "no" },
+          { 'name' => "block_exp",
+            'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
+            'type' => 'string',
+            'deft' =>  q^^ },
+          { 'name' => "noimages",
+            'desc' =>  "Don't attempt to extract images from PDF.",
+            'type' => "flag" },
+          { 'name' => "complex",
+            'desc' => "Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).",
+            'type' => "flag" },
+          { 'name' => "nohidden",
+            'desc' => "Prevent pdftohtml from attempting to extract hidden text. This is only useful if the -complex option is also set.",
+            'type' => "flag" },
+          { 'name' => "zoom",
+            'desc' =>  "The factor by which to zoomthe PDF for output (this is only useful if -complex is set).",
+            'type' => "int" }
+          ];
+my $arguments =
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i)\\.html?\$' matches all documents ending in .htm or .html (case-insensitive).",
+    'type' => "string",
+    'deft' => &get_default_process_exp(),
+    'reqd' => "no" },
+      { 'name' => "block_exp",
+    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
+    'type' => "string",
+    'deft' =>  q^^ },
+      { 'name' => "noimages",
+    'desc' => "Don't attempt to extract images from PDF.",
+    'type' => "flag" },
+      { 'name' => "complex",
+    'desc' => "Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you need Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).",
+    'type' => "flag" },
+      { 'name' => "nohidden",
+    'desc' => "Prevent pdftohtml from attempting to extract hidden text. This is only useful if the -complex option is also set.",
+    'type' => "flag" },
+      { 'name' => "zoom",
+    'desc' => "The factor by which to zoom the PDF for output (this is only useful if -complex is set).",
+    'deft' => "2",
+    'type' => "int" },
+      { 'name' => "use_sections",
+    'desc' => "Create a separate section for each page of the PDF file.",
+    'type' => "flag" } ];
 my $options = { 'name'     => "PDFPlug",

trunk/gsdl/perllib/plugins/PPTPlug.pm

-              r2981
+              r4744
+}
+my $options = { 'name'     => "PPTPlug",
+        'desc'     => "A plugin for importing Microsoft PowerPoint files.",
+        'inherits' => "Yes" };
 sub new {
     my $class = shift (@_);
 …
     $self->{'input_encoding'} = "utf8";
+    }
+    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
+    my $option_list = $self->{'option_list'};
+    push( @{$option_list}, $options );
     return bless $self, $class;

trunk/gsdl/perllib/plugins/PSPlug.pm

-              r3540
+              r4744
+}
+my $arguments = [ { 'name' => "process_exp",
+            'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+            'type' => "string",
+            'deft' => q^(?i)\.ps$^,
+            'reqd' => "no" },
+          { 'name' => "block_exp",
+            'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
+            'type' => 'string',
+            'deft' =>  q^(?i)\.(eps)$^ }
+          ];
+my $arguments =
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i)\\.html?\$' matches all documents ending in .htm or .html (case-insensitive).",
+    'type' => "string",
+    'deft' => &get_default_process_exp(),
+    'reqd' => "no" },
+      { 'name' => "block_exp",
+    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
+    'type' => 'string',
+    'deft' => &get_default_block_exp() },
+      { 'name' => "extract_date",
+    'desc' => "Extract date from PS header.",
+    'type' => "flag" },
+      { 'name' => "extract_pages",
+    'desc' => "Extract pages from PS header.",
+    'type' => "flag" },
+      { 'name' => "extract_title",
+    'desc' => "Extract title from PS header.",
+    'type' => "flag" } ];
 my $options = { 'name'     => "PSPlug",

trunk/gsdl/perllib/plugins/RTFPlug.pm

-              r3540
+              r4744
+}
 my $arguments = [ { 'name' => "process_exp",
             'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
             'type' => "string",
             'deft' => q^(?i)\.rtf$^,
             'reqd' => "no" }
           ];
+my $arguments =
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i)\\.html?\$' matches all documents ending in .htm or .html (case-insensitive).",
+    'type' => "string",
+    'deft' => &get_default_process_exp(),
+    'reqd' => "no" } ];
 my $options = { 'name'     => "RTFPlug",

trunk/gsdl/perllib/plugins/RecPlug.pm

-              r3540
+              r4744
 use XML::Parser;
+my $arguments = [ { 'name' => "block_exp",
+            'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
+            'type' => "string",
+            'deft' => "CVS",
+            'reqd' => "no" },
+          { 'name' => "use_metadata_files",
+            'desc' => "Read metadata from metadata XML files.",
+            'type' => "flag",
+            'reqd' => "no" } ];
+my $arguments =
+    [ { 'name' => "block_exp",
+    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
+    'type' => "string",
+    'deft' => &get_default_block_exp(),
+    'reqd' => "no" },
+      { 'name' => "use_metadata_files",
+    'desc' => "Read metadata from metadata XML files.",
+    'type' => "flag",
+    'reqd' => "no" } ];
 my $options = { 'name'     => "RecPlug",
+        'desc'     => "RecPlug is a plugin which recurses through directories processing
+# each file it finds. For detailed comments edit &lt;GSDLHOME&gt;/perllib/plugins/RecPlug.pm .",
+            'inherits' => "yes",
+            'args'     => $arguments };
+        'desc'     => "RecPlug is a plugin which recurses through directories processing each file it finds.",
+        'inherits' => "yes",
+        'args'     => $arguments };
 sub print_usage {

trunk/gsdl/perllib/plugins/ReferPlug.pm

-              r3540
+              r4744
+}
+my $arguments = [ { 'name' => "process_exp",
+            'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+            'type' => "string",
+            'deft' => q^(?i)\.bib$^,
+            'reqd' => "no" } ];
+my $arguments =
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i)\\.html?\$' matches all documents ending in .htm or .html (case-insensitive).",
+    'type' => "string",
+    'deft' => &get_default_process_exp(),
+    'reqd' => "no" } ];
 my $options = { 'name'     => "ReferPlug",

trunk/gsdl/perllib/plugins/RogPlug.pm

-              r3737
+              r4744
+}
+my $options = { 'name'     => "RogPlug",
+        'desc'     => "Creates simple single-level documents from .rog or .mdb files.",
+        'inherits' => "Yes" };
 sub new {
     my ($class) = @_;
     $self = new BasPlug ();
+    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
+    my $option_list = $self->{'option_list'};
+    push( @{$option_list}, $options );
     return bless $self, $class;

trunk/gsdl/perllib/plugins/SRCPlug.pm

-              r3919
+              r4744
+}
+my $arguments = [ { 'name' => "process_exp",
+            'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+            'type' => "string",
+            'deft' => q^(Makefile.*|README.*|(?i)\.(c|cc|cpp|C|h|hpp|pl|pm|sh))$^,
+            'reqd' => "no" } ,
+          { 'name' => "block_exp",
+            'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
+            'type' => 'string',
+            'deft' => q^(?i)\.(o|obj|a|so|dll)$^,
+            'reqd' => "no" } ,
+          { 'name' => "remove_prefix",
+            'desc' => "Remove this leading pattern from the filename (eg -remove_prefix /tmp/XX/src/). The default is to remove the whole path from the filename.",
+            'type' => 'string',
+            'reqd' => "no" } ];
+my $arguments =
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i)\\.html?\$' matches all documents ending in .htm or .html (case-insensitive).",
+    'type' => "string",
+    'deft' => &get_default_process_exp(),
+    'reqd' => "no" } ,
+      { 'name' => "block_exp",
+    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
+    'type' => 'string',
+    'deft' => &get_default_block_exp(),
+    'reqd' => "no" } ,
+      { 'name' => "remove_prefix",
+    'desc' => "Remove this leading pattern from the filename (eg -remove_prefix /tmp/XX/src/). The default is to remove the whole path from the filename.",
+    'type' => 'string',
+    'deft' => "",
+    'reqd' => "no" } ];
 my $options = { 'name'     => "SRCPlug",
         'desc'     => "Filename is currently used for Title ( optionally minus some prefix ). Current languages:\ntext: READMEs/Makefiles\nC/C++   (currently extracts #include statements and C++ class decls)\nPerl    (currently only done as text)\nShell   (currently only done as text)",
             'inherits' => "yes",
             'args'     => $arguments };
+        'inherits' => "yes",
+        'args'     => $arguments };
 sub print_usage {

trunk/gsdl/perllib/plugins/SplitPlug.pm

-              r3540
+              r4744
+}
+my $arguments =
+    [ { 'name' => "split_exp",
+    'desc' => "A perl regular expression to split input files into segments.",
+    'type' => "string",
+    'deft' => &get_default_split_exp(),
+    'reqd' => "no" }
+      ];
 my $options = { 'name'     => "SplitPlug",
+                 'desc'     => "SplitPlug is a plugin for splitting input files into segments that will then be individually processed. This plugin should not be called directly.  Instead, if you need to process input files that contain several documents, you should write a plugin with a process function that will handle one of those documents and have it inherit from SplitPlug.  See ReferPlug for an example.",
+                     'inherits' => "yes" };
+        'desc'     => "SplitPlug is a plugin for splitting input files into segments that will then be individually processed. This plugin should not be called directly.  Instead, if you need to process input files that contain several documents, you should write a plugin with a process function that will handle one of those documents and have it inherit from SplitPlug.  See ReferPlug for an example.",
+        'inherits' => "yes",
+            'args'     => $arguments };

trunk/gsdl/perllib/plugins/TEXTPlug.pm

-              r3932
+              r4744
+}
+my $arguments = [ { 'name' => "process_exp",
+                          'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+                          'type' => "string",
+                          'deft' => q^(?i)\.te?xt$^,
+                          'reqd' => "no" } ,
+                        { 'name' => "title_sub",
+                          'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PSPlug to remove \"Page 1\" etc from text used as the title.",
+                          'type' => "string",
+                          'reqd' => "no" }];
+my $arguments =
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i)\\.html?\$' matches all documents ending in .htm or .html (case-insensitive).",
+    'type' => "string",
+    'deft' => &get_default_process_exp(),
+    'reqd' => "no" } ,
+      { 'name' => "title_sub",
+    'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PSPlug to remove \"Page 1\" etc from text used as the title.",
+    'type' => "string",
+    'deft' => "",
+    'reqd' => "no" } ];
 my $options = { 'name'     => "TEXTPlug",
                  'desc'     => "Creates simple single-level document. Adds Title metadata of first line of text (up to 100 characters long).",
                      'inherits' => "yes",
                      'args'     => $arguments };
+        'desc'     => "Creates simple single-level document. Adds Title metadata of first line of text (up to 100 characters long).",
+        'inherits' => "yes",
+        'args'     => $arguments };
 sub print_usage {
 …
     my $self = new BasPlug ($class, @_);
      # 14-05-02 To allow for proper inheritance of arguments - John Thompson
      my $option_list = $self->{'option_list'};
      push( @{$option_list}, $options );
+    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
+    my $option_list = $self->{'option_list'};
+    push( @{$option_list}, $options );
     if (!parsargv::parse(\@_,

trunk/gsdl/perllib/plugins/UnknownPlug.pm

-              r2883
+              r4744
     @ISA = ('BasPlug');
+}
+my $arguments =
+    [ { 'name' => "assoc_field",
+    'desc' => "Name of the metadata field that will hold the associated file's name.",
+    'type' => "string",
+    'deft' => "",
+    'reqd' => "no" } ,
+      { 'name' => "file_type",
+    'desc' => "Mime type of the file (e.g. image/gif)",
+    'type' => "string",
+    'deft' => "",
+    'reqd' => "no" } ];
+my $options = { 'name'     => "UnknownPlug",
+        'desc'     => "This is a simple Plugin for importing files in formats that Greenstone doesn't know anything about.  A fictional document will be created for every such file, and the file itself will be passed to Greenstone as the \"associated file\" of the document.",
+        'inherits' => "yes",
+        'args'     => $arguments };
 sub print_usage {
 …
     my $self = new BasPlug ($class, @_);
+    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
+    my $option_list = $self->{'option_list'};
+    push( @{$option_list}, $options );
     if (!parsargv::parse(\@_,
              q^assoc_field/.*/^, \$self->{'assoc_field'},

trunk/gsdl/perllib/plugins/W3ImgPlug.pm

-              r2996
+              r4744
     @ISA = qw( HTMLPlug );
+}
+my $aggressiveness_list =
+    [ { 'name' => "1",
+    'desc' => "Filename, path, ALT text only." },
+      { 'name' => "2",
+    'desc' => "All of 1, plus caption where available." },
+      { 'name' => "3",
+    'desc' => "All of 2, plus near paragraphs where available." },
+      { 'name' => "4",
+    'desc' => "All of 3, plus previous headers (<h1>, <h2>...) where available." },
+      { 'name' => "5",
+    'desc' => "All of 4, plus textual references where available." },
+      { 'name' => "6",
+    'desc' => "All of 4, plus page metatags (title, keywords, etc)." },
+      { 'name' => "7",
+    'desc' => "All of 6, 5 and 4 combined." },
+      { 'name' => "8",
+    'desc' => "All of 7, plus repeat caption, filename, etc (raise ranking of more relevant results)." },
+      { 'name' => "9",
+    'desc' => "All of 1, plus full text of source page." } ];
+my $arguments =
+    [ { 'name' => "aggressiveness",
+    'desc' => "Range of related text extraction techniques to use.",
+    'type' => "int",
+    'list' => $aggressiveness_list,
+    'deft' => "3",
+    'reqd' => "no" },
+      { 'name' => "index_pages",
+    'desc' => "Index the pages along with the images. Otherwise reference the pages at the source URL.",
+    'type' => "flag",
+    'reqd' => "no" },
+      { 'name' => "no_cache_images",
+    'desc' => "Don't cache images (point to URL of original)",
+    'type' => "flag",
+    'reqd' => "no" },
+      { 'name' => "min_size",
+    'desc' => "Bytes. Skip images smaller than this.",
+    'type' => "int",
+    'deft' => "2000",
+    'reqd' => "no" },
+      { 'name' => "min_width",
+    'desc' => "Pixels. Skip images narrower than this.",
+    'type' => "int",
+    'deft' => "50",
+    'reqd' => "no" },
+      { 'name' => "min_height",
+    'desc' => "Pixels. Skip images shorter than this.",
+    'type' => "int",
+    'deft' => "50",
+    'reqd' => "no" },
+      { 'name' => "thumb_size",
+    'desc' => "Max thumbnail size. Both width and height.",
+    'type' => "int",
+    'deft' => "100",
+    'reqd' => "no" },
+      { 'name' => "convert_params",
+    'desc' => "Additional parameters for ImageMagicK convert on thumbnail creation. For example, '-raise' will give a three dimensional effect to thumbnail images.",
+    'type' => "string",
+    'deft' => "",
+    'reqd' => "no" },
+      { 'name' => "min_near_text",
+    'desc' => "Minimum characters of near text or caption to extract.",
+    'type' => "int",
+    'deft' => "10",
+    'reqd' => "no" },
+      { 'name' => "max_near_text",
+    'desc' => "Maximum characters near images to extract.",
+    'type' => "int",
+    'deft' => "400",
+    'reqd' => "no" },
+      { 'name' => "smallpage_threshold",
+    'desc' => "Images on pages smaller than this (bytes) will have the page (title, keywords, etc) meta-data added.",
+    'type' => "int",
+    'deft' => "2048",
+    'reqd' => "no" },
+      { 'name' => "textrefs_threshold",
+    'desc' => "Threshold for textual references. Lower values mean the algorithm is less strict.",
+    'type' => "int",
+    'deft' => "2",
+    'reqd' => "no" },
+      { 'name' => "caption_length",
+    'desc' => "Maximum length of captions (in characters).",
+    'type' => "int",
+    'deft' => "80",
+    'reqd' => "no" },
+      { 'name' => "neartext_length",
+    'desc' => "Target length of near text (in characters).",
+    'type' => "int",
+    'deft' => "300",
+    'reqd' => "no" },
+      { 'name' => "document_text",
+    'desc' => "Add image text as document:text (otherwise IndexedText metadata field).",
+    'type' => "flag",
+    'reqd' => "no" }
+      ];
+my $options = { 'name'     => "W3ImgPlug",
+        'desc'     => "",
+        'inherits' => "yes",
+        'args'     => $arguments };
 sub print_usage {
 …
     my $self = new HTMLPlug ($class, @_);
+    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
+    my $option_list = $self->{'option_list'};
+    push( @{$option_list}, $options );
     if (!parsargv::parse(\@_,
              q^aggressiveness/\d/3^, \$self->{'aggressiveness'},
 …
     # etc/W3ImgPlug.cfg (XML)
     # tag sets for captions and neartext
     if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 10 ) {
+    if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
     $self->{'delims'} = [];
     $self->{'cdelims'} = [];
 …
     # get stop words for textual reference extraction
     # TODO: warnings scroll off. Would be best to output them again at end of import
     if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 10 ) {
+    if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
     $self->{'stopwords'} = ();
     $filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");

trunk/gsdl/perllib/plugins/WordPlug.pm

-              r3540
+              r4744
+}
+my $arguments = [ { 'name' => "process_exp",
+                          'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+                          'type' => "string",
+                          'deft' => q^(?i)\.doc$^,
+                          'reqd' => "no" } ];
+my $arguments =
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
+    'type' => "string",
+    'deft' => &get_default_process_exp(),
+    'reqd' => "no" } ];
 my $options = { 'name'     => "WordPlug",
                  'desc'     => "",
                      'inherits' => "yes",
                      'args'     => $arguments };
+        'desc'     => "A plugin for importing Microsoft Word documents.",
+        'inherits' => "yes",
+        'args'     => $arguments };
 sub new {
 …
     my $self = new ConvertToPlug ($class, @_);
      # 14-05-02 To allow for proper inheritance of arguments - John Thompson
      my $option_list = $self->{'option_list'};
      push( @{$option_list}, $options );
+    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
+    my $option_list = $self->{'option_list'};
+    push( @{$option_list}, $options );
     # wvWare will always produce html files encoded as utf-8

trunk/gsdl/perllib/plugins/XMLPlug.pm

-              r3540
+              r4744
 use XML::Parser;
+my $arguments = [ { 'name' => "process_exp",
+                          'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
+                          'type' => "string",
+                          'deft' => q^(?i)\.xml$^,
+                          'reqd' => "no" } ];
+my $arguments =
+    [ { 'name' => "process_exp",
+    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
+    'type' => "string",
+    'deft' => &get_default_process_exp(),
+    'reqd' => "no" } ];
 my $options = { 'name'     => "XMLPlug",
                  'desc'     => "",
                      'inherits' => "yes",
                      'args'     => $arguments };
+        'desc'     => "Base class for XML plugins.",
+        'inherits' => "yes",
+        'args'     => $arguments };

trunk/gsdl/perllib/plugins/ZIPPlug.pm

-              r3540
+              r4744
 my $options = { 'name'     => "ZIPPlug",
                  'desc'     => "Plugin which handles compressed and/or archived input formats currently handled formats and file extensions are:\ngzip (.gz, .z, .tgz, .taz)\nbzip (.bz)\nbzip2 (.bz2)\nzip (.zip .jar)\ntar (.tar)\n\nThis plugin relies on the following utilities being present (if trying to process the corresponding formats):\ngunzip (for gzip)\nbunzip (for bzip)\nbunzip2 \nunzip (for zip)\ntar (for tar)",
                      'inherits' => "yes" };
+        'desc'     => "Plugin which handles compressed and/or archived input formats currently handled formats and file extensions are:\ngzip (.gz, .z, .tgz, .taz)\nbzip (.bz)\nbzip2 (.bz2)\nzip (.zip .jar)\ntar (.tar)\n\nThis plugin relies on the following utilities being present (if trying to process the corresponding formats):\ngunzip (for gzip)\nbunzip (for bzip)\nbunzip2 \nunzip (for zip)\ntar (for tar)",
+        'inherits' => "yes" };
 sub new {
 …
     my $self = new BasPlug ("ZIPPlug", @_);
      # 14-05-02 To allow for proper inheritance of arguments - John Thompson
      my $option_list = $self->{'option_list'};
      push( @{$option_list}, $options );
+    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
+    my $option_list = $self->{'option_list'};
+    push( @{$option_list}, $options );
     return bless $self, $class;

Context Navigation

Legend:

trunk/gsdl/perllib/plugins/ArcPlug.pm

trunk/gsdl/perllib/plugins/BasPlug.pm

trunk/gsdl/perllib/plugins/BibTexPlug.pm

trunk/gsdl/perllib/plugins/BookPlug.pm

trunk/gsdl/perllib/plugins/ConvertToPlug.pm

trunk/gsdl/perllib/plugins/ConvertToRogPlug.pm

trunk/gsdl/perllib/plugins/DBPlug.pm

trunk/gsdl/perllib/plugins/EMAILPlug.pm

trunk/gsdl/perllib/plugins/ExcelPlug.pm

trunk/gsdl/perllib/plugins/FOXPlug.pm

trunk/gsdl/perllib/plugins/HBPlug.pm

trunk/gsdl/perllib/plugins/HTMLPlug.pm

trunk/gsdl/perllib/plugins/ImagePlug.pm

trunk/gsdl/perllib/plugins/MACROPlug.pm

trunk/gsdl/perllib/plugins/MARCPlug.pm

trunk/gsdl/perllib/plugins/PDFPlug.pm

trunk/gsdl/perllib/plugins/PPTPlug.pm

trunk/gsdl/perllib/plugins/PSPlug.pm

trunk/gsdl/perllib/plugins/RTFPlug.pm

trunk/gsdl/perllib/plugins/RecPlug.pm

trunk/gsdl/perllib/plugins/ReferPlug.pm

trunk/gsdl/perllib/plugins/RogPlug.pm

trunk/gsdl/perllib/plugins/SRCPlug.pm

trunk/gsdl/perllib/plugins/SplitPlug.pm

trunk/gsdl/perllib/plugins/TEXTPlug.pm

trunk/gsdl/perllib/plugins/UnknownPlug.pm

trunk/gsdl/perllib/plugins/W3ImgPlug.pm

trunk/gsdl/perllib/plugins/WordPlug.pm

trunk/gsdl/perllib/plugins/XMLPlug.pm

trunk/gsdl/perllib/plugins/ZIPPlug.pm

Download in other formats: