Changeset 4744


Ignore:
Timestamp:
2003-06-20T14:22:34+12:00 (21 years ago)
Author:
mdewsnip
Message:

Tidied up the $arguments and $options structures (representing the options of the plugin) in preparation for removing the print_usage() routines.

Location:
trunk/gsdl/perllib/plugins
Files:
30 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/ArcPlug.pm

    r3540 r4744  
    4141}
    4242
    43 my $options =
    44 {   'name'     => "ArcPlug",
    45     'desc'     => "Plugin which recurses through an archives.inf file (i.e. the file generated in the archives directory when an import is done), processing each file it finds.",
    46     'inherits' => "Yes" };
     43my $options = { 'name'     => "ArcPlug",
     44        'desc'     => "Plugin which recurses through an archives.inf file (i.e. the file generated in the archives directory when an import is done), processing each file it finds.",
     45        'inherits' => "Yes" };
    4746
    4847sub new {
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r3834 r4744  
    4545
    4646my $unicode_list =
    47 [ { 'name' => "auto",
    48     'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." } ,
    49   { 'name' => "ascii",
    50     'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." },
    51   { 'name' => "utf8",
    52     'desc' => "either utf8 or unicode -- automatically detected." },
    53   { 'name' => "unicode",
    54     'desc' => "just unicode" } ];
     47    [ { 'name' => "auto",
     48    'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." } ,
     49      { 'name' => "ascii",
     50    'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." },
     51      { 'name' => "utf8",
     52    'desc' => "either utf8 or unicode -- automatically detected." },
     53      { 'name' => "unicode",
     54    'desc' => "just unicode" } ];
    5555
    5656my $arguments =
    5757    [ { 'name' => "process_exp",
    58     'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
     58    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
    5959    'type' => "string",
    6060    'deft' => "",
     
    6666    'reqd' => "no" },
    6767      { 'name' => "input_encoding",
    68     'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8. The default input_encoding is 'auto'.",
     68    'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8.",
    6969    'type' => "enum",
    7070    'list' => $unicode_list,
     
    7272    'deft' => "auto" } ,
    7373      { 'name' => "default_encoding",
    74     'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone.  The default is iso_8859_1.",
    75     'type' => "flag",
    76     'reqd' => "no" },
     74    'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone.",
     75    'type' => "enum",
     76    'reqd' => "no",
     77        'deft' => "utf8" },
    7778      { 'name' => "extract_language",
    7879    'desc' => "Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.",
     
    107108    'desc' => "The maximum historical date to be used as metadata (in a Common Era date, such as 1950).",
    108109    'type' => "int",
     110    'deft' => (localtime)[5]+1900,
    109111    'reqd' => "no"},
    110112      { 'name' => "maximum_century",
    111113    'desc' => "The maximum named century to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century).",
    112114    'type' => "int",
     115    'deft' => "-1",
    113116    'reqd' => "no" },
    114117      { 'name' => "no_bibliography",
     
    196199}
    197200
     201
     202# sub print_usage_new
     203# {
     204# }
     205
     206
     207sub print_usage_new
     208{
     209    local $self = shift(@_);
     210    local $optionlist = $self->{'option_list'};
     211    local $pluginoptions = pop(@$optionlist);
     212    return if (!defined($pluginoptions));
     213
     214    local $pluginname = $pluginoptions->{'name'};
     215    local $pluginargs = $pluginoptions->{'args'};
     216
     217    # Produce the usage information using the data structure above
     218    print STDERR " usage: plugin $pluginname";
     219    if (defined($pluginargs)) {
     220    print STDERR " [options]";
     221    }
     222    print STDERR "\n\n";
     223
     224    # Display the plugin options, if there are some
     225    if (defined($pluginargs)) {
     226    # Find the length of the longest option string
     227    local $maxlength = 0;
     228    foreach $option (@$pluginargs) {
     229        local $optionname = $option->{'name'};
     230        local $optiontype = $option->{'type'};
     231
     232        local $optionstringlength = length($optionname);
     233        if ($optiontype ne "flag") {
     234        $optionstringlength = $optionstringlength + 3 + length($optiontype);
     235        }
     236
     237        # Remember the longest
     238        if ($optionstringlength > $maxlength) {
     239        $maxlength = $optionstringlength;
     240        }
     241    }
     242
     243    # Calculate the column offset of the option descriptions
     244    local $optiondescoffset = 3 + $maxlength + 2;
     245
     246    # Display the plugin options
     247    print STDERR " options:\n";
     248    foreach $option (@$pluginargs) {
     249        # Display option name
     250        local $optionname = $option->{'name'};
     251        print STDERR "  -$optionname";
     252        local $optionstringlength = 3 + length($optionname);
     253 
     254        # Display option type, if the option is not a flag
     255        local $optiontype = $option->{'type'};
     256        if ($optiontype ne "flag") {
     257        print STDERR " <$optiontype>";
     258        $optionstringlength = $optionstringlength + (2 + length($optiontype) + 1);
     259        }
     260
     261        # Display the option description
     262        local $optiondesc = $option->{'desc'};
     263        &display_text_in_column($optiondesc, $optiondescoffset, $optionstringlength, 80);
     264
     265        # Show the default value for the option, if there is one
     266        local $optiondefault = $option->{'deft'};
     267        if (defined($optiondefault)) {
     268        print STDERR " " x $optiondescoffset;
     269        print STDERR "Default: " . $optiondefault . "\n";
     270        }
     271
     272        # If the option has a list of possible values, display these
     273        local $optionvalueslist = $option->{'list'};
     274        if (defined($optionvalueslist)) {
     275        print STDERR "\n";
     276        foreach $optionvalue (@$optionvalueslist) {
     277            local $optionvaluename = $optionvalue->{'name'};
     278            print STDERR " " x $optiondescoffset;
     279            print STDERR "$optionvaluename:";
     280
     281            local $optionvaluedesc = $optionvalue->{'desc'};
     282            &display_text_in_column($optionvaluedesc, ($optiondescoffset + 2),
     283                        $optiondescoffset + length($optionvaluename), 80);
     284        }
     285        }
     286
     287        # Special case for 'input_encoding'
     288        if ($optionname =~ m/^input_encoding$/i) {
     289        my $e = $encodings::encodings;
     290        foreach $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
     291            local $encodingname = $enc;
     292            print STDERR " " x $optiondescoffset;
     293            print STDERR "$enc:";
     294
     295            local $encodingdesc = $e->{$enc}->{'name'};
     296            &display_text_in_column($encodingdesc, ($optiondescoffset + 2),
     297                        $optiondescoffset + length($encodingname), 80);
     298        }
     299        }
     300
     301        # Add a blank line to separate options
     302        print STDERR "\n";
     303    }
     304    }
     305
     306    # If the plugin inherits from another, do the parent now
     307    if (defined($optionlist)) {
     308    $self->print_usage_new();
     309    }
     310}
     311
     312
     313sub display_text_in_column
     314{
     315    local ($text, $columnbeg, $firstlineoffset, $columnend) = @_;
     316
     317    # Spaces are put *before* words, so treat the column beginning as 1 smaller than it is
     318    $columnbeg = $columnbeg - 1;
     319
     320    # Add some padding (if needed) for the first line
     321    local $linelength = $columnbeg;
     322    if ($firstlineoffset < $columnbeg) {
     323    print STDERR " " x ($columnbeg - $firstlineoffset);
     324    }
     325    else {
     326    $linelength = $firstlineoffset;
     327    }
     328
     329    # Break the text into words, and display one at a time
     330    local @words = split(/ /, $text);
     331
     332    foreach $word (@words) {
     333    # Unescape '<' and '>' characters
     334    $word =~ s/&lt;/</g;
     335    $word =~ s/&gt;/>/g;
     336
     337    # If printing this word would except the column end, start a new line
     338    if (($linelength + length($word)) >= $columnend) {
     339        print STDERR "\n";
     340        print STDERR " " x $columnbeg;
     341        $linelength = $columnbeg;
     342    }
     343
     344    # Write the word
     345    print STDERR " " . $word;
     346    $linelength = $linelength + (length($word) + 1);
     347    }
     348
     349    print STDERR "\n";
     350}
     351
     352
    198353sub print_general_usage {
    199354    my ($plugin_name) = @_;
     
    316471             q^process_exp/.*/^, \$self->{'process_exp'},
    317472             q^block_exp/.*/^, \$self->{'block_exp'},
     473             q^extract_language^, \$self->{'extract_language'},
    318474             q^extract_acronyms^, \$self->{'extract_acronyms'},
    319              q^extract_keyphrases^, \$self->{'kea'}, #with extra options
    320              q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options
     475             q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED)
     476             q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED)
    321477             qq^input_encoding/$enc/auto^, \$self->{'input_encoding'},
    322478             qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'},
  • trunk/gsdl/perllib/plugins/BibTexPlug.pm

    r3587 r4744  
    5252
    5353my $arguments =
    54 [ {     'name' => "process_exp",
    55     'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
     54    [ { 'name' => "process_exp",
     55    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
    5656    'type' => "string",
    5757    'reqd' => "no" ,
    58     'deft' => q^(?i)\.bib$^ }
    59 ];
    60 
    61 my $options =
    62 {   'name'     => "BibTexPlug",
    63     'desc'     => "BibTexPlug reads bibliography files in BibTex format. BibTexPlug creates a document object for every reference in the file. It is a subclass of SplitPlug, so if there are multiple records, all are read.",
    64     'inherits' => "Yes",
    65     'args'     => $arguments };
     58    'deft' => q^(?i)\.bib$^ } ];
     59
     60my $options = { 'name'     => "BibTexPlug",
     61        'desc'     => "BibTexPlug reads bibliography files in BibTex format. BibTexPlug creates a document object for every reference in the file. It is a subclass of SplitPlug, so if there are multiple records, all are read.",
     62        'inherits' => "Yes",
     63        'args'     => $arguments };
    6664
    6765# This plugin processes files with the suffix ".bib"
  • trunk/gsdl/perllib/plugins/BookPlug.pm

    r3540 r4744  
    6161
    6262my $arguments =
    63 [ {     'name' => "process_exp",
    64     'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
     63    [ { 'name' => "process_exp",
     64    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
    6565    'type' => "string",
    6666    'reqd' => "no",
    67     'deft' => q^(?i)\.hb$^} ,
    68 {   'name' => "block_exp",
     67    'deft' => &get_default_process_exp() },
     68      { 'name' => "block_exp",
    6969    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
    7070    'type' => "string",
    7171    'reqd' => "no",
    72     'deft' => q^\.jpg$^}
    73 ];
    74 
    75 my $options =
    76 {   'name'     => "BookPlug",
    77     'desc'     => "Creates multi-level document from document containing &lt;&lt;TOC&gt;&gt; level tags. Metadata for each section is taken from any other tags on the same line as the &lt;&lt;TOC&gt;&gt;. e.g. &lt;&lt;Title&gt;&gt;xxxx&lt;&lt;/Title&gt;&gt; sets Title metadata. Everything else between TOC tags is treated as simple html (i.e. no processing of html links or any other HTMLPlug type stuff is done). Expects input files to have a .hb file extension by default (this can be changed by adding a -process_exp option a file with the same name as the hb file but a .jpg extension is taken as the cover image (jpg files are blocked by this plugin). BookPlug is a simplification (and extension) of the HBPlug used by the Humanity Library collections. BookPlug is faster as it expects the input files to be cleaner (The input to the HDL collections contains lots of excess html tags around &lt;&lt;TOC&gt;&gt; tags, uses &lt;&lt;I&gt;&gt; tags to specify images, and simply takes all text between &lt;&lt;TOC&gt;&gt; tags and start of text to be Title metadata). If you're marking up documents to be displayed in the same way as the HDL collections, use this plugin instead of HBPlug.",
    78     'inherits' => "Yes",
    79     'args'     => $arguments };
     72    'deft' => &get_default_block_exp() } ];
     73
     74my $options = { 'name'     => "BookPlug",
     75        'desc'     => "Creates multi-level document from document containing &lt;&lt;TOC&gt;&gt; level tags. Metadata for each section is taken from any other tags on the same line as the &lt;&lt;TOC&gt;&gt;. e.g. &lt;&lt;Title&gt;&gt;xxxx&lt;&lt;/Title&gt;&gt; sets Title metadata. Everything else between TOC tags is treated as simple html (i.e. no processing of html links or any other HTMLPlug type stuff is done). Expects input files to have a .hb file extension by default (this can be changed by adding a -process_exp option a file with the same name as the hb file but a .jpg extension is taken as the cover image (jpg files are blocked by this plugin). BookPlug is a simplification (and extension) of the HBPlug used by the Humanity Library collections. BookPlug is faster as it expects the input files to be cleaner (The input to the HDL collections contains lots of excess html tags around &lt;&lt;TOC&gt;&gt; tags, uses &lt;&lt;I&gt;&gt; tags to specify images, and simply takes all text between &lt;&lt;TOC&gt;&gt; tags and start of text to be Title metadata). If you're marking up documents to be displayed in the same way as the HDL collections, use this plugin instead of HBPlug.",
     76        'inherits' => "Yes",
     77        'args'     => $arguments };
    8078
    8179sub new {
  • trunk/gsdl/perllib/plugins/ConvertToPlug.pm

    r3720 r4744  
    5050
    5151my $convert_to_list =
    52 [ { 'name' => "html",
    53     'desc' => "" },
    54 {   'name' => "text",
    55     'desc' => "" }
    56 ];
     52    [ { 'name' => "html",
     53    'desc' => "HTML format" },
     54      { 'name' => "text",
     55    'desc' => "Plain text format" } ];
    5756
    5857my $arguments =
    59 [ {     'name' => "convert_to",
    60     'desc' => "Plugin converts to TEXT or HTML (default html).",
     58    [ { 'name' => "convert_to",
     59    'desc' => "Plugin converts to TEXT or HTML.",
    6160    'type' => "enum",
    6261    'reqd' => "no",
    6362    'list' => $convert_to_list,
    64     'deft' => "html"}
    65 ];
    66 
    67 my $options =
    68 {  'name'     => "ConvertToPlug",
    69    'desc'     => "The plugin is inherited by such plugins as WordPlug and PDFPlug. It facilitates the conversion of these document types to either HTML or TEXT by setting up variable that instruct ConvertToBasPlug how to work. It works by dynamically inheriting HTMLPlug or TEXTPlug based on the plugin argument 'convert_to'.  If the argument is not present, the default is to inherit HTMLPlug.",
    70    'inherits' => "Yes",
    71    'args'     => $arguments };
     63    'deft' => "html" },
     64      { 'name' => "use_strings",
     65    'desc' => "If set, a simple strings function will be called to extract text if the conversion utility fails.",
     66    'type' => "flag",
     67    'reqd' => "no" } ];
     68
     69my $options = { 'name'     => "ConvertToPlug",
     70        'desc'     => "The plugin is inherited by such plugins as WordPlug and PDFPlug. It facilitates the conversion of these document types to either HTML or TEXT by setting up variable that instruct ConvertToBasPlug how to work. It works by dynamically inheriting HTMLPlug or TEXTPlug based on the plugin argument 'convert_to'.  If the argument is not present, the default is to inherit HTMLPlug.",
     71        'inherits' => "Yes",
     72        'args'     => $arguments };
    7273
    7374
     
    100101
    101102    if (!parsargv::parse($args, 
    102              q^extract_keyphrases^, \$newargs->{'kea'}, #with extra options
    103              q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'}, #no extra options
     103             q^extract_keyphrases^, \$newargs->{'kea'}, #with extra options (undocumented)
     104             q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'}, #no extra options (undocumented)
    104105             q^convert_to/(html|text)/html^, \$newargs->{'generate_format'},
    105106             q^use_strings^, \$newargs->{'use_strings'},
     
    117118sub new {
    118119    my $class = shift (@_);
    119     if ($class eq "ConvertToPlug") {$class = shift (@_);}
     120    # print "Class: " . $class . "\n";
     121    # if ($class eq "ConvertToPlug") {$class = shift (@_);}
    120122    my $self;
    121123    # parsargv::parse might modify the list, so we do this by creating a copy
  • trunk/gsdl/perllib/plugins/ConvertToRogPlug.pm

    r3737 r4744  
    3535    @ISA = ('RogPlug');
    3636}
     37
     38my $options = { 'name'     => "ConvertToRogPlug",
     39        'desc'     => "A plugin that inherits from RogPlug.",
     40        'inherits' => "Yes" };
    3741
    3842sub print_usage {
     
    7175sub new {
    7276    my $class = shift (@_);
    73     if ($class eq "ConvertToRogPlug") {$class = shift (@_);}
     77    # print "Class: " . $class . "\n";
     78    # if ($class eq "ConvertToRogPlug") {$class = shift (@_);}
    7479    my $self;
    7580    # parsargv::parse might modify the list, so we do this by creating a copy
     
    8186    $self->{'convert_to'} = "Rog";
    8287    $self->{'convert_to_ext'} = "rog";
     88
     89    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
     90    my $option_list = $self->{'option_list'};
     91    push( @{$option_list}, $options );
    8392
    8493    return bless $self, $class;
  • trunk/gsdl/perllib/plugins/DBPlug.pm

    r4429 r4744  
    4444}
    4545
    46 my $arguments = [ { 'name' => "process_exp",
    47                           'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
    48                           'type' => "string",
    49                           'deft' => q^(?i)\.dbi$^,
    50                           'reqd' => "no" } ,
    51                         { 'name' => "title_sub",
    52                           'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PSPlug to remove \"Page 1\" etc from text used as the title.",
    53                           'type' => "string",
    54                           'reqd' => "no" }];
     46my $arguments =
     47    [ { 'name' => "process_exp",
     48    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     49    'type' => "string",
     50    'deft' => &get_default_process_exp(),
     51    'reqd' => "no" } ,
     52      { 'name' => "title_sub",
     53    'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PSPlug to remove \"Page 1\" etc from text used as the title.",
     54    'type' => "string",
     55    'deft' => "",
     56    'reqd' => "no" }];
    5557
    5658my $options = { 'name'     => "DBPlug",
    57                  'desc'     => "Uses records from a database as documents.",
    58                      'inherits' => "yes",
    59                      'args'     => $arguments };
     59        'desc'     => "Uses records from a database as documents.",
     60        'inherits' => "yes",
     61        'args'     => $arguments };
    6062
    6163sub print_usage {
  • trunk/gsdl/perllib/plugins/EMAILPlug.pm

    r4224 r4744  
    9090
    9191my $arguments =
    92 [ {     'name' => "process_exp",
    93     'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
     92    [ { 'name' => "process_exp",
     93    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
    9494    'type' => "string",
    9595    'reqd' => "no",
    96     'deft' => q@([\\/]\d+|\.(mbx|email|eml))$@
    97   },
    98   {     'name' => "no_attachments",
     96    'deft' => &get_default_process_exp() },
     97      { 'name' => "no_attachments",
    9998    'desc' => "Do not save message attachments.",
    10099    'type' => "flag",
    101     'reqd' => "no"
    102   },
    103   {     'name' => "block_exp",
    104     'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
     100    'reqd' => "no" },
     101      { 'name' => "split_exp",
     102    'desc' => "A perl regular expression used to split files containing many messages into individual documents.",
    105103    'type' => "string",
    106     'reqd' => "no",
    107     'deft' => q^^}
    108 ];
    109 
    110 my $options =
    111 {   'name'     => "EMAILPlug",
    112     'desc'     => "Email plug reads email files.  These are named with a simple number (i.e. as they appear in maildir folders) or with the extension .mbx (for mbox mail file format).\nDocument text: The document text consists of all the text after the first blank line in the document.\nMetadata (not Dublin Core!):\n\t\$Headers      All the header content\n\t\$Subject      Subject: header\n\t\$To           To: header\n\t\$From         From: header\n\t\$FromName     Name of sender (where available)\n\t\$FromAddr     E-mail address of sender\n\t\$DateText     Date: header\n\t\$Date         Date: header in GSDL format (eg: 19990924)",
    113     'inherits' => "Yes",
    114     'args'     => $arguments };
     104    'deft' => "" } ];
     105
     106my $options = { 'name'     => "EMAILPlug",
     107        'desc'     => "Email plug reads email files.  These are named with a simple number (i.e. as they appear in maildir folders) or with the extension .mbx (for mbox mail file format).\nDocument text: The document text consists of all the text after the first blank line in the document.\nMetadata (not Dublin Core!):\n\t\$Headers      All the header content\n\t\$Subject      Subject: header\n\t\$To           To: header\n\t\$From         From: header\n\t\$FromName     Name of sender (where available)\n\t\$FromAddr     E-mail address of sender\n\t\$DateText     Date: header\n\t\$Date         Date: header in GSDL format (eg: 19990924)",
     108        'inherits' => "Yes",
     109        'args'     => $arguments };
    115110
    116111# Create a new EMAILPlug object with which to parse a file.
  • trunk/gsdl/perllib/plugins/ExcelPlug.pm

    r2990 r4744  
    3434}
    3535
     36my $options = { 'name'     => "ExcelPlug",
     37        'desc'     => "A plugin for importing Microsoft Excel files.",
     38        'inherits' => "Yes" };
     39
    3640sub new {
    3741    my $class = shift (@_);
     
    4549#   $self->{'input_encoding'} = "utf8";
    4650#    }
     51
     52    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
     53    my $option_list = $self->{'option_list'};
     54    push( @{$option_list}, $options );
    4755
    4856    return bless $self, $class;
  • trunk/gsdl/perllib/plugins/FOXPlug.pm

    r3540 r4744  
    3838use unicode;
    3939use cnseg;
    40 use gb;
     40# use gb;
    4141
    4242
  • trunk/gsdl/perllib/plugins/HBPlug.pm

    r3542 r4744  
    7373
    7474    $self->BasPlug::init($verbosity, $outhandle);
     75    $self->{'input_encoding'} = "iso_8859_1";
    7576
    7677    # this plugin only handles ascii encodings
  • trunk/gsdl/perllib/plugins/HTMLPlug.pm

    r3708 r4744  
    4848}
    4949
    50 my $arguments = [ { 'name' => "process_exp",
    51             'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
    52             'type' => "string",
    53             'deft' =>  q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php|\.cgi|.+\?.+=.*)$^ },
    54           { 'name' => "block_exp",
    55             'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
    56             'type' => 'string',
    57             'deft' =>  q^(?i)\.(gif|jpe?g|png|css)$^ },
    58           { 'name' => "nolinks",
    59             'desc' =>  "Don't make any attempt to trap links (setting this flag may improve speed of building/importing but any relative links within documents will be broken).",
    60             'type' => "flag" },
    61           { 'name' => "keep_head",
    62             'desc' => "Don't remove headers from html files.",
    63             'type' => "flag" },
    64           { 'name' => "no_metadata",
    65             'desc' => "Don't attempt to extract any metadata from files.",
    66             'type' => "flag" },
    67           { 'name' => "metadata_fields",
    68             'desc' => "Comma separated list of metadata fields to attempt to extract. Defaults to 'Title'. Use 'tag&lt;tagname&gt;' to have the contents of the first &lt;tagname &gt; pair put in a metadata element called 'tagname'. Capitalise this as you want the metadata capitalised in Greenstone, since the tag extraction is case insensitive.",
    69             'type' => "metadatum",
    70             'deft' => "" },
    71           { 'name' => "hunt_creator_metadata",
    72             'desc' => "Find as much metadata as possible on authorship and place it in the 'Creator' field. Requires the -metadata_fields flag.",
    73             'type' => "flag" },
    74           { 'name' => "file_is_url",
    75             'desc' => "Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure.",
    76             'type' => "flag" },
    77           { 'name' => "assoc_files",
    78             'desc' => "Perl regular expression of file extensions to associate with html documents. Defaults to '(?i)\.(jpe?g|gif|png|css)\$'",
    79             'type' => "string",
    80             'deft' => q^(?i)\.(jpe?g|gif|png|css)\$^ },
    81           { 'name' => "rename_assoc_files",
    82             'desc' => "Renames files associated with documents (e.g. images). Also creates much shallower directory structure (useful when creating collections to go on cd-rom).",
    83             'type' => "flag" } ,
    84           { 'name' => "title_sub",
    85             'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PDFPlug to remove \"Page 1\", etc from text used as the title.",
    86             'type' => "string" } ,
    87           { 'name' => "description_tags",
    88             'desc' => "Split document into sub-sections where &lt;Section&gt; tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the &lt;Section&gt; tags. Also, '-keep_head' will have no effect when this option is set.",
    89             'type' => "flag" } ];
     50my $arguments =
     51    [ { 'name' => "process_exp",
     52    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin.  For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     53    'type' => "string",
     54    'deft' =>  &get_default_process_exp() },
     55      { 'name' => "block_exp",
     56    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
     57    'type' => 'string',
     58    'deft' =>  &get_default_block_exp() },
     59      { 'name' => "nolinks",
     60    'desc' =>  "Don't make any attempt to trap links (setting this flag may improve speed of building/importing but any relative links within documents will be broken).",
     61    'type' => "flag" },
     62      { 'name' => "keep_head",
     63    'desc' => "Don't remove headers from html files.",
     64    'type' => "flag" },
     65      { 'name' => "no_metadata",
     66    'desc' => "Don't attempt to extract any metadata from files.",
     67    'type' => "flag" },
     68      { 'name' => "metadata_fields",
     69    'desc' => "Comma separated list of metadata fields to attempt to extract. Use 'tag&lt;tagname&gt;' to have the contents of the first &lt;tagname&gt; pair put in a metadata element called 'tagname'. Capitalise this as you want the metadata capitalised in Greenstone, since the tag extraction is case insensitive.",
     70    'type' => "metadatum",
     71    'deft' => "Title" },
     72      { 'name' => "hunt_creator_metadata",
     73    'desc' => "Find as much metadata as possible on authorship and place it in the 'Creator' field. Requires the -metadata_fields flag.",
     74    'type' => "flag" },
     75      { 'name' => "file_is_url",
     76    'desc' => "Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure.",
     77    'type' => "flag" },
     78      { 'name' => "assoc_files",
     79    'desc' => "Perl regular expression of file extensions to associate with html documents.",
     80    'type' => "string",
     81    'deft' => q^(?i)\.(jpe?g|gif|png|css)$^ },
     82      { 'name' => "rename_assoc_files",
     83    'desc' => "Renames files associated with documents (e.g. images). Also creates much shallower directory structure (useful when creating collections to go on cd-rom).",
     84    'type' => "flag" },
     85      { 'name' => "title_sub",
     86    'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PDFPlug to remove \"Page 1\", etc from text used as the title.",
     87    'type' => "string",
     88    'deft' => "" },
     89      { 'name' => "description_tags",
     90    'desc' => "Split document into sub-sections where &lt;Section&gt; tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the &lt;Section&gt; tags. Also, '-keep_head' will have no effect when this option is set.",
     91    'type' => "flag" } ];
    9092
    9193my $options = { 'name'     => "HTMLPlug",
     
    9496        'args'     => $arguments };
    9597
    96 sub print_usage {
    97     print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
    98     print STDERR "  options:\n";
    99     print STDERR "   -nolinks               Don't make any attempt to trap links (setting this\n";
    100     print STDERR "                          flag may improve speed of building/importing but\n";
    101     print STDERR "                          any relative links within documents will be broken).\n";
    102     print STDERR "   -keep_head             Don't remove headers from html files.\n";
    103     print STDERR "   -no_metadata           Don't attempt to extract any metadata from files.\n";
    104     print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to
    105                           extract. Defaults to 'Title'.
    106                           Use 'tag<tagname>' to have the contents of the first
    107                           <tagname> pair put in a metadata element called
    108                           'tagname'. Capitalise this as you want the metadata
    109                           capitalised in Greenstone, since the tag extraction
    110                           is case insensitive.\n";
    111     print STDERR "   -hunt_creator_metadata Find as much metadata as possible on authorship and
    112                           place it in the 'Creator' field. Requires the
    113                           -metadata_fields flag.\n";
    114     print STDERR "   -file_is_url           Set if input filenames make up url of original source
    115                           documents e.g. if a web mirroring tool was used to
    116                           create the import directory structure\n";
    117     print STDERR "   -assoc_files           Perl regular expression of file extensions to
    118                           associate with html documents.
    119                           Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";
    120     print STDERR "   -rename_assoc_files    Renames files associated with documents (e.g. images).
    121                           Also creates much shallower directory structure
    122                           (useful when creating collections to go on cd-rom).\n";
    123     print STDERR "   -title_sub             Substitution expression to modify string stored as
    124                           Title. Used by, for example, PDFPlug to remove
    125                           \"Page 1\", etc from text used as the title.\n";
    126     print STDERR "   -description_tags      Split document into sub-sections where <Section> tags
    127                           occur. Note that by setting this option you
    128                           implicitly set -no_metadata, as all metadata should
    129                           be included within the <Section> tags (this is only
    130                           true for documents that actually contain <Section> tags
    131                           however). Also, '-keep_head' will have no effect when
    132                           this option is set, regardless of whether a document
    133                           contains Section tags.\n";
    134 }
     98
     99#  sub print_usage {
     100#      print STDERR "\n  usage: plugin HTMLPlug [options]\n\n";
     101#      print STDERR "  options:\n";
     102#      print STDERR "   -nolinks               Don't make any attempt to trap links (setting this\n";
     103#      print STDERR "                          flag may improve speed of building/importing but\n";
     104#      print STDERR "                          any relative links within documents will be broken).\n";
     105#      print STDERR "   -keep_head             Don't remove headers from html files.\n";
     106#      print STDERR "   -no_metadata           Don't attempt to extract any metadata from files.\n";
     107#      print STDERR "   -metadata_fields       Comma separated list of metadata fields to attempt to
     108#                            extract. Defaults to 'Title'.
     109#                            Use 'tag<tagname>' to have the contents of the first
     110#                            <tagname> pair put in a metadata element called
     111#                            'tagname'. Capitalise this as you want the metadata
     112#                            capitalised in Greenstone, since the tag extraction
     113#                            is case insensitive.\n";
     114#      print STDERR "   -hunt_creator_metadata Find as much metadata as possible on authorship and
     115#                            place it in the 'Creator' field. Requires the
     116#                            -metadata_fields flag.\n";
     117#      print STDERR "   -file_is_url           Set if input filenames make up url of original source
     118#                            documents e.g. if a web mirroring tool was used to
     119#                            create the import directory structure\n";
     120#      print STDERR "   -assoc_files           Perl regular expression of file extensions to
     121#                            associate with html documents.
     122#                            Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n";
     123#      print STDERR "   -rename_assoc_files    Renames files associated with documents (e.g. images).
     124#                            Also creates much shallower directory structure
     125#                            (useful when creating collections to go on cd-rom).\n";
     126#      print STDERR "   -title_sub             Substitution expression to modify string stored as
     127#                            Title. Used by, for example, PDFPlug to remove
     128#                            \"Page 1\", etc from text used as the title.\n";
     129#      print STDERR "   -description_tags      Split document into sub-sections where <Section> tags
     130#                            occur. Note that by setting this option you
     131#                            implicitly set -no_metadata, as all metadata should
     132#                            be included within the <Section> tags (this is only
     133#                            true for documents that actually contain <Section> tags
     134#                            however). Also, '-keep_head' will have no effect when
     135#                            this option is set, regardless of whether a document
     136#                            contains Section tags.\n";
     137#  }
    135138
    136139sub new {
  • trunk/gsdl/perllib/plugins/ImagePlug.pm

    r4724 r4744  
    3232}
    3333
    34 
    35 
    36 my $arguments = [ { 'name' => "process_exp",
    37             'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
    38             'type' => "string",
    39             'deft' => q^(?i)(\.jpe?g|\.gif|\.png|\.bmp|\.xbm|\.tif?f)$^,
    40             'reqd' => "no" },
    41           { 'name' => "noscaleup",
    42             'desc' => "Don't scale up small images when making thumbnails.",
    43             'type' => "flag",
    44             'reqd' => "no" },
    45           { 'name' => "thumbnailsize",
    46             'desc' => "Make thumbnails of size nxn.",
    47             'type' => "int",
    48             'reqd' => "no" },
    49           { 'name' => "thumbnailtype",
    50             'desc' => "Make thumbnails in format 's'.",
    51             'type' => "string",
    52             'reqd' => "no" },
    53           { 'name' => "screenviewsize",
    54             'desc' => "If set, makes an image of size n for screen display and sets Screen, ScreenSize, ScreenWidth and ScreenHeight metadata.  By default it is not set.",
    55             'type' => "int",
    56             'reqd' => "no" },
    57           { 'name' => "screenviewtype",
    58             'desc' => "If -screenviewsize is set, this sets the screen display image type.  Defaults to jpg.",
    59             'type' => "string",
    60             'deft' => "jpg",
    61             'reqd' => "no" },
    62           { 'name' => "convertto",
    63             'desc' => "Convert main image to.",
    64             'type' => "string",
    65             'reqd' => "no" },
    66           { 'name' => "minimumsize",
    67             'desc' => "Ignore images smaller than n bytes.",
    68             'type' => "int",
    69             'reqd' => "no" } ];
     34my $arguments =
     35    [ { 'name' => "process_exp",
     36    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     37    'type' => "string",
     38    'deft' => &get_default_process_exp(),
     39    'reqd' => "no" },
     40      { 'name' => "noscaleup",
     41    'desc' => "Don't scale up small images when making thumbnails.",
     42    'type' => "flag",
     43    'reqd' => "no" },
     44      { 'name' => "thumbnailsize",
     45    'desc' => "Make thumbnails of size nxn.",
     46    'type' => "int",
     47    'deft' => "100",
     48    'reqd' => "no" },
     49      { 'name' => "thumbnailtype",
     50    'desc' => "Make thumbnails in format 's'.",
     51    'type' => "string",
     52    'deft' => "gif",
     53    'reqd' => "no" },
     54      { 'name' => "screenviewsize",
     55    'desc' => "If set, makes an image of size n for screen display and sets Screen, ScreenSize, ScreenWidth and ScreenHeight metadata.  By default it is not set.",
     56    'type' => "int",
     57    'deft' => "0",
     58    'reqd' => "no" },
     59      { 'name' => "screenviewtype",
     60    'desc' => "If -screenviewsize is set, this sets the screen display image type.",
     61    'type' => "string",
     62    'deft' => "jpg",
     63    'reqd' => "no" },
     64      { 'name' => "converttotype",
     65    'desc' => "Convert main image to.",
     66    'type' => "string",
     67    'deft' => "",
     68    'reqd' => "no" },
     69      { 'name' => "minimumsize",
     70    'desc' => "Ignore images smaller than n bytes.",
     71    'type' => "int",
     72    'deft' => "100",
     73    'reqd' => "no" } ];
    7074
    7175my $options = { 'name'     => "ImagePlug",
  • trunk/gsdl/perllib/plugins/MACROPlug.pm

    r3856 r4744  
    3434}
    3535
    36 my $arguments = [ { 'name' => "process_exp",
    37             'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
    38             'type' => "string",
    39             'deft' => q^(?i)\.dm$^,
    40             'reqd' => "no" }];
     36my $arguments =
     37    [ { 'name' => "process_exp",
     38    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     39    'type' => "string",
     40    'deft' => &get_default_process_exp(),
     41    'reqd' => "no" } ];
    4142
    4243my $options = { 'name'     => "MACROPlug",
     
    100101    push( @{$option_list}, $options );
    101102
    102     $self->{'lang_abbr'} = load_language_table();
     103    # $self->{'lang_abbr'} = load_language_table();
    103104
    104105    return bless $self, $class;
  • trunk/gsdl/perllib/plugins/MARCPlug.pm

    r3508 r4744  
    3737    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
    3838}
     39
     40my $arguments =
     41    [ { 'name' => "metadata_mapping",
     42    'desc' => "Name of file that includes mapping details from MARC values to Greenstone metadata names. Defaults to 'marctodc.txt' found in the site's etc directory.",
     43    'type' => "string",
     44    'deft' => "marctodc.txt",
     45    'reqd' => "no" } ];
     46
     47my $options = { 'name'     => "MARCPlug",
     48        'desc'     => "",
     49        'inherits' => "Yes",
     50        'args'     => $arguments };
    3951
    4052use MARC::Record; 
     
    6779
    6880    $self->{'mm_file'} = $metadata_mapping; # relative to etc dir
     81
     82    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
     83    my $option_list = $self->{'option_list'};
     84    push( @{$option_list}, $options );
    6985
    7086    return bless $self, $class;
  • trunk/gsdl/perllib/plugins/PDFPlug.pm

    r4103 r4744  
    3232}
    3333
    34 my $arguments = [ { 'name' => "process_exp",
    35             'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
    36             'type' => "string",
    37             'deft' => q^(?i)\.pdf$^,
    38             'reqd' => "no" },
    39           { 'name' => "block_exp",
    40             'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
    41             'type' => 'string',
    42             'deft' =>  q^^ },
    43           { 'name' => "noimages",
    44             'desc' =>  "Don't attempt to extract images from PDF.",
    45             'type' => "flag" },
    46           { 'name' => "complex",
    47             'desc' => "Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).",
    48             'type' => "flag" },
    49           { 'name' => "nohidden",
    50             'desc' => "Prevent pdftohtml from attempting to extract hidden text. This is only useful if the -complex option is also set.",
    51             'type' => "flag" },
    52           { 'name' => "zoom",
    53             'desc' =>  "The factor by which to zoomthe PDF for output (this is only useful if -complex is set).",
    54             'type' => "int" }
    55           ];
     34my $arguments =
     35    [ { 'name' => "process_exp",
     36    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     37    'type' => "string",
     38    'deft' => &get_default_process_exp(),
     39    'reqd' => "no" },
     40      { 'name' => "block_exp",
     41    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
     42    'type' => "string",
     43    'deft' =>  q^^ },
     44      { 'name' => "noimages",
     45    'desc' => "Don't attempt to extract images from PDF.",
     46    'type' => "flag" },
     47      { 'name' => "complex",
     48    'desc' => "Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).",
     49    'type' => "flag" },
     50      { 'name' => "nohidden",
     51    'desc' => "Prevent pdftohtml from attempting to extract hidden text. This is only useful if the -complex option is also set.",
     52    'type' => "flag" },
     53      { 'name' => "zoom",
     54    'desc' => "The factor by which to zoom the PDF for output (this is only useful if -complex is set).",
     55    'deft' => "2",
     56    'type' => "int" },
     57      { 'name' => "use_sections",
     58    'desc' => "Create a separate section for each page of the PDF file.",
     59    'type' => "flag" } ];
    5660
    5761my $options = { 'name'     => "PDFPlug",
  • trunk/gsdl/perllib/plugins/PPTPlug.pm

    r2981 r4744  
    3434}
    3535
     36my $options = { 'name'     => "PPTPlug",
     37        'desc'     => "A plugin for importing Microsoft PowerPoint files.",
     38        'inherits' => "Yes" };
     39
    3640sub new {
    3741    my $class = shift (@_);
     
    4347    $self->{'input_encoding'} = "utf8";
    4448    }
     49
     50    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
     51    my $option_list = $self->{'option_list'};
     52    push( @{$option_list}, $options );
    4553
    4654    return bless $self, $class;
  • trunk/gsdl/perllib/plugins/PSPlug.pm

    r3540 r4744  
    3535}
    3636
    37 my $arguments = [ { 'name' => "process_exp",
    38             'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
    39             'type' => "string",
    40             'deft' => q^(?i)\.ps$^,
    41             'reqd' => "no" },
    42           { 'name' => "block_exp",
    43             'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
    44             'type' => 'string',
    45             'deft' =>  q^(?i)\.(eps)$^ }
    46           ];
     37my $arguments =
     38    [ { 'name' => "process_exp",
     39    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     40    'type' => "string",
     41    'deft' => &get_default_process_exp(),
     42    'reqd' => "no" },
     43      { 'name' => "block_exp",
     44    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
     45    'type' => 'string',
     46    'deft' => &get_default_block_exp() },
     47      { 'name' => "extract_date",
     48    'desc' => "Extract date from PS header.",
     49    'type' => "flag" },
     50      { 'name' => "extract_pages",
     51    'desc' => "Extract pages from PS header.",
     52    'type' => "flag" },
     53      { 'name' => "extract_title",
     54    'desc' => "Extract title from PS header.",
     55    'type' => "flag" } ];
    4756
    4857my $options = { 'name'     => "PSPlug",
  • trunk/gsdl/perllib/plugins/RTFPlug.pm

    r3540 r4744  
    3535}
    3636
    37 my $arguments = [ { 'name' => "process_exp",
    38             'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
    39             'type' => "string",
    40             'deft' => q^(?i)\.rtf$^,
    41             'reqd' => "no" }
    42           ];
     37my $arguments =
     38    [ { 'name' => "process_exp",
     39    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     40    'type' => "string",
     41    'deft' => &get_default_process_exp(),
     42    'reqd' => "no" } ];
    4343
    4444my $options = { 'name'     => "RTFPlug",
  • trunk/gsdl/perllib/plugins/RecPlug.pm

    r3540 r4744  
    106106use XML::Parser;
    107107
    108 my $arguments = [ { 'name' => "block_exp",
    109             'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
    110             'type' => "string",
    111             'deft' => "CVS",
    112             'reqd' => "no" },
    113           { 'name' => "use_metadata_files",
    114             'desc' => "Read metadata from metadata XML files.",
    115             'type' => "flag",
    116             'reqd' => "no" } ];
     108my $arguments =
     109    [ { 'name' => "block_exp",
     110    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
     111    'type' => "string",
     112    'deft' => &get_default_block_exp(),
     113    'reqd' => "no" },
     114      { 'name' => "use_metadata_files",
     115    'desc' => "Read metadata from metadata XML files.",
     116    'type' => "flag",
     117    'reqd' => "no" } ];
    117118
    118119my $options = { 'name'     => "RecPlug",
    119         'desc'     => "RecPlug is a plugin which recurses through directories processing
    120 # each file it finds. For detailed comments edit &lt;GSDLHOME&gt;/perllib/plugins/RecPlug.pm .",
    121             'inherits' => "yes",
    122             'args'     => $arguments };
     120        'desc'     => "RecPlug is a plugin which recurses through directories processing each file it finds.",
     121        'inherits' => "yes",
     122        'args'     => $arguments };
    123123
    124124sub print_usage {
  • trunk/gsdl/perllib/plugins/ReferPlug.pm

    r3540 r4744  
    7272}
    7373
    74 my $arguments = [ { 'name' => "process_exp",
    75             'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
    76             'type' => "string",
    77             'deft' => q^(?i)\.bib$^,
    78             'reqd' => "no" } ];
     74my $arguments =
     75    [ { 'name' => "process_exp",
     76    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     77    'type' => "string",
     78    'deft' => &get_default_process_exp(),
     79    'reqd' => "no" } ];
    7980
    8081my $options = { 'name'     => "ReferPlug",
  • trunk/gsdl/perllib/plugins/RogPlug.pm

    r3737 r4744  
    3636}
    3737
     38my $options = { 'name'     => "RogPlug",
     39        'desc'     => "Creates simple single-level documents from .rog or .mdb files.",
     40        'inherits' => "Yes" };
     41
    3842sub new {
    3943    my ($class) = @_;
    4044    $self = new BasPlug ();
     45
     46    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
     47    my $option_list = $self->{'option_list'};
     48    push( @{$option_list}, $options );
    4149
    4250    return bless $self, $class;
  • trunk/gsdl/perllib/plugins/SRCPlug.pm

    r3919 r4744  
    4646}
    4747
    48 my $arguments = [ { 'name' => "process_exp",
    49             'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
    50             'type' => "string",
    51             'deft' => q^(Makefile.*|README.*|(?i)\.(c|cc|cpp|C|h|hpp|pl|pm|sh))$^,
    52             'reqd' => "no" } ,
    53           { 'name' => "block_exp",
    54             'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
    55             'type' => 'string',
    56             'deft' => q^(?i)\.(o|obj|a|so|dll)$^,
    57             'reqd' => "no" } ,
    58           { 'name' => "remove_prefix",
    59             'desc' => "Remove this leading pattern from the filename (eg -remove_prefix /tmp/XX/src/). The default is to remove the whole path from the filename.",
    60             'type' => 'string',
    61             'reqd' => "no" } ];
     48my $arguments =
     49    [ { 'name' => "process_exp",
     50    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     51    'type' => "string",
     52    'deft' => &get_default_process_exp(),
     53    'reqd' => "no" } ,
     54      { 'name' => "block_exp",
     55    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
     56    'type' => 'string',
     57    'deft' => &get_default_block_exp(),
     58    'reqd' => "no" } ,
     59      { 'name' => "remove_prefix",
     60    'desc' => "Remove this leading pattern from the filename (eg -remove_prefix /tmp/XX/src/). The default is to remove the whole path from the filename.",
     61    'type' => 'string',
     62    'deft' => "",
     63    'reqd' => "no" } ];
    6264
    6365my $options = { 'name'     => "SRCPlug",
    6466        'desc'     => "Filename is currently used for Title ( optionally minus some prefix ). Current languages:\ntext: READMEs/Makefiles\nC/C++   (currently extracts #include statements and C++ class decls)\nPerl    (currently only done as text)\nShell   (currently only done as text)",
    65             'inherits' => "yes",
    66             'args'     => $arguments };
     67        'inherits' => "yes",
     68        'args'     => $arguments };
    6769
    6870sub print_usage {
  • trunk/gsdl/perllib/plugins/SplitPlug.pm

    r3540 r4744  
    4949}
    5050
     51my $arguments =
     52    [ { 'name' => "split_exp",
     53    'desc' => "A perl regular expression to split input files into segments.",
     54    'type' => "string",
     55    'deft' => &get_default_split_exp(),
     56    'reqd' => "no" }
     57      ];
     58
    5159my $options = { 'name'     => "SplitPlug",
    52                  'desc'     => "SplitPlug is a plugin for splitting input files into segments that will then be individually processed. This plugin should not be called directly.  Instead, if you need to process input files that contain several documents, you should write a plugin with a process function that will handle one of those documents and have it inherit from SplitPlug.  See ReferPlug for an example.",
    53                      'inherits' => "yes" };
     60        'desc'     => "SplitPlug is a plugin for splitting input files into segments that will then be individually processed. This plugin should not be called directly.  Instead, if you need to process input files that contain several documents, you should write a plugin with a process function that will handle one of those documents and have it inherit from SplitPlug.  See ReferPlug for an example.",
     61        'inherits' => "yes",
     62            'args'     => $arguments };
    5463
    5564
  • trunk/gsdl/perllib/plugins/TEXTPlug.pm

    r3932 r4744  
    3939}
    4040
    41 my $arguments = [ { 'name' => "process_exp",
    42                           'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
    43                           'type' => "string",
    44                           'deft' => q^(?i)\.te?xt$^,
    45                           'reqd' => "no" } ,
    46                         { 'name' => "title_sub",
    47                           'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PSPlug to remove \"Page 1\" etc from text used as the title.",
    48                           'type' => "string",
    49                           'reqd' => "no" }];
     41my $arguments =
     42    [ { 'name' => "process_exp",
     43    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     44    'type' => "string",
     45    'deft' => &get_default_process_exp(),
     46    'reqd' => "no" } ,
     47      { 'name' => "title_sub",
     48    'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PSPlug to remove \"Page 1\" etc from text used as the title.",
     49    'type' => "string",
     50    'deft' => "",
     51    'reqd' => "no" } ];
    5052
    5153my $options = { 'name'     => "TEXTPlug",
    52                  'desc'     => "Creates simple single-level document. Adds Title metadata of first line of text (up to 100 characters long).",
    53                      'inherits' => "yes",
    54                      'args'     => $arguments };
     54        'desc'     => "Creates simple single-level document. Adds Title metadata of first line of text (up to 100 characters long).",
     55        'inherits' => "yes",
     56        'args'     => $arguments };
    5557
    5658sub print_usage {
     
    6870    my $self = new BasPlug ($class, @_);
    6971
    70     # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    71     my $option_list = $self->{'option_list'};
    72     push( @{$option_list}, $options );
     72    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
     73    my $option_list = $self->{'option_list'};
     74    push( @{$option_list}, $options );
    7375
    7476    if (!parsargv::parse(\@_,
  • trunk/gsdl/perllib/plugins/UnknownPlug.pm

    r2883 r4744  
    6363    @ISA = ('BasPlug');
    6464}
     65
     66my $arguments =
     67    [ { 'name' => "assoc_field",
     68    'desc' => "Name of the metadata field that will hold the associated file's name.",
     69    'type' => "string",
     70    'deft' => "",
     71    'reqd' => "no" } ,
     72      { 'name' => "file_type",
     73    'desc' => "Mime type of the file (e.g. image/gif)",
     74    'type' => "string",
     75    'deft' => "",
     76    'reqd' => "no" } ];
     77
     78my $options = { 'name'     => "UnknownPlug",
     79        'desc'     => "This is a simple Plugin for importing files in formats that Greenstone doesn't know anything about.  A fictional document will be created for every such file, and the file itself will be passed to Greenstone as the \"associated file\" of the document.",
     80        'inherits' => "yes",
     81        'args'     => $arguments };
    6582
    6683sub print_usage {
     
    7895    my $self = new BasPlug ($class, @_);
    7996
     97    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
     98    my $option_list = $self->{'option_list'};
     99    push( @{$option_list}, $options );
     100
    80101    if (!parsargv::parse(\@_,
    81102             q^assoc_field/.*/^, \$self->{'assoc_field'},
  • trunk/gsdl/perllib/plugins/W3ImgPlug.pm

    r2996 r4744  
    122122    @ISA = qw( HTMLPlug );
    123123}
     124
     125my $aggressiveness_list =
     126    [ { 'name' => "1",
     127    'desc' => "Filename, path, ALT text only." },
     128      { 'name' => "2",
     129    'desc' => "All of 1, plus caption where available." },
     130      { 'name' => "3",
     131    'desc' => "All of 2, plus near paragraphs where available." },
     132      { 'name' => "4",
     133    'desc' => "All of 3, plus previous headers (<h1>, <h2>...) where available." },
     134      { 'name' => "5",
     135    'desc' => "All of 4, plus textual references where available." },
     136      { 'name' => "6",
     137    'desc' => "All of 4, plus page metatags (title, keywords, etc)." },
     138      { 'name' => "7",
     139    'desc' => "All of 6, 5 and 4 combined." },
     140      { 'name' => "8",
     141    'desc' => "All of 7, plus repeat caption, filename, etc (raise ranking of more relevant results)." },
     142      { 'name' => "9",
     143    'desc' => "All of 1, plus full text of source page." } ];
     144
     145my $arguments =
     146    [ { 'name' => "aggressiveness",
     147    'desc' => "Range of related text extraction techniques to use.",
     148    'type' => "int",
     149    'list' => $aggressiveness_list,
     150    'deft' => "3",
     151    'reqd' => "no" },
     152      { 'name' => "index_pages",
     153    'desc' => "Index the pages along with the images. Otherwise reference the pages at the source URL.",
     154    'type' => "flag",
     155    'reqd' => "no" },
     156      { 'name' => "no_cache_images",
     157    'desc' => "Don't cache images (point to URL of original)",
     158    'type' => "flag",
     159    'reqd' => "no" },
     160      { 'name' => "min_size",
     161    'desc' => "Bytes. Skip images smaller than this.",
     162    'type' => "int",
     163    'deft' => "2000",
     164    'reqd' => "no" },
     165      { 'name' => "min_width",
     166    'desc' => "Pixels. Skip images narrower than this.",
     167    'type' => "int",
     168    'deft' => "50",
     169    'reqd' => "no" },
     170      { 'name' => "min_height",
     171    'desc' => "Pixels. Skip images shorter than this.",
     172    'type' => "int",
     173    'deft' => "50",
     174    'reqd' => "no" },
     175      { 'name' => "thumb_size",
     176    'desc' => "Max thumbnail size. Both width and height.",
     177    'type' => "int",
     178    'deft' => "100",
     179    'reqd' => "no" },
     180      { 'name' => "convert_params",
     181    'desc' => "Additional parameters for ImageMagicK convert on thumbnail creation. For example, '-raise' will give a three dimensional effect to thumbnail images.",
     182    'type' => "string",
     183    'deft' => "",
     184    'reqd' => "no" },
     185      { 'name' => "min_near_text",
     186    'desc' => "Minimum characters of near text or caption to extract.",
     187    'type' => "int",
     188    'deft' => "10",
     189    'reqd' => "no" },
     190      { 'name' => "max_near_text",
     191    'desc' => "Maximum characters near images to extract.",
     192    'type' => "int",
     193    'deft' => "400",
     194    'reqd' => "no" },
     195      { 'name' => "smallpage_threshold",
     196    'desc' => "Images on pages smaller than this (bytes) will have the page (title, keywords, etc) meta-data added.",
     197    'type' => "int",
     198    'deft' => "2048",
     199    'reqd' => "no" },
     200      { 'name' => "textrefs_threshold",
     201    'desc' => "Threshold for textual references. Lower values mean the algorithm is less strict.",
     202    'type' => "int",
     203    'deft' => "2",
     204    'reqd' => "no" },
     205      { 'name' => "caption_length",
     206    'desc' => "Maximum length of captions (in characters).",
     207    'type' => "int",
     208    'deft' => "80",
     209    'reqd' => "no" },
     210      { 'name' => "neartext_length",
     211    'desc' => "Target length of near text (in characters).",
     212    'type' => "int",
     213    'deft' => "300",
     214    'reqd' => "no" },
     215      { 'name' => "document_text",
     216    'desc' => "Add image text as document:text (otherwise IndexedText metadata field).",
     217    'type' => "flag",
     218    'reqd' => "no" }
     219      ];
     220
     221my $options = { 'name'     => "W3ImgPlug",
     222        'desc'     => "",
     223        'inherits' => "yes",
     224        'args'     => $arguments };
     225
    124226
    125227sub print_usage {
     
    175277    my $self = new HTMLPlug ($class, @_);
    176278
     279    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
     280    my $option_list = $self->{'option_list'};
     281    push( @{$option_list}, $options );
     282
    177283    if (!parsargv::parse(\@_,
    178284             q^aggressiveness/\d/3^, \$self->{'aggressiveness'},
     
    291397    # etc/W3ImgPlug.cfg (XML)
    292398    # tag sets for captions and neartext
    293     if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 10 ) {
     399    if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) {
    294400    $self->{'delims'} = [];
    295401    $self->{'cdelims'} = [];
     
    327433    # get stop words for textual reference extraction
    328434    # TODO: warnings scroll off. Would be best to output them again at end of import
    329     if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 10 ) {
     435    if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) {
    330436    $self->{'stopwords'} = ();
    331437    $filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw");
  • trunk/gsdl/perllib/plugins/WordPlug.pm

    r3540 r4744  
    3434}
    3535
    36 my $arguments = [ { 'name' => "process_exp",
    37                           'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
    38                           'type' => "string",
    39                           'deft' => q^(?i)\.doc$^,
    40                           'reqd' => "no" } ];
     36my $arguments =
     37    [ { 'name' => "process_exp",
     38    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     39    'type' => "string",
     40    'deft' => &get_default_process_exp(),
     41    'reqd' => "no" } ];
    4142
    4243my $options = { 'name'     => "WordPlug",
    43                  'desc'     => "",
    44                      'inherits' => "yes",
    45                      'args'     => $arguments };
     44        'desc'     => "A plugin for importing Microsoft Word documents.",
     45        'inherits' => "yes",
     46        'args'     => $arguments };
    4647
    4748sub new {
     
    5051    my $self = new ConvertToPlug ($class, @_);
    5152
    52     # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    53     my $option_list = $self->{'option_list'};
    54     push( @{$option_list}, $options );
     53    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
     54    my $option_list = $self->{'option_list'};
     55    push( @{$option_list}, $options );
    5556
    5657    # wvWare will always produce html files encoded as utf-8
  • trunk/gsdl/perllib/plugins/XMLPlug.pm

    r3540 r4744  
    3535
    3636use XML::Parser;
    37 my $arguments = [ { 'name' => "process_exp",
    38                           'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
    39                           'type' => "string",
    40                           'deft' => q^(?i)\.xml$^,
    41                           'reqd' => "no" } ];
     37
     38my $arguments =
     39    [ { 'name' => "process_exp",
     40    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).",
     41    'type' => "string",
     42    'deft' => &get_default_process_exp(),
     43    'reqd' => "no" } ];
    4244
    4345my $options = { 'name'     => "XMLPlug",
    44                  'desc'     => "",
    45                      'inherits' => "yes",
    46                      'args'     => $arguments };
     46        'desc'     => "Base class for XML plugins.",
     47        'inherits' => "yes",
     48        'args'     => $arguments };
    4749
    4850
  • trunk/gsdl/perllib/plugins/ZIPPlug.pm

    r3540 r4744  
    5858
    5959my $options = { 'name'     => "ZIPPlug",
    60                  'desc'     => "Plugin which handles compressed and/or archived input formats currently handled formats and file extensions are:\ngzip (.gz, .z, .tgz, .taz)\nbzip (.bz)\nbzip2 (.bz2)\nzip (.zip .jar)\ntar (.tar)\n\nThis plugin relies on the following utilities being present (if trying to process the corresponding formats):\ngunzip (for gzip)\nbunzip (for bzip)\nbunzip2 \nunzip (for zip)\ntar (for tar)",
    61                      'inherits' => "yes" };
     60        'desc'     => "Plugin which handles compressed and/or archived input formats currently handled formats and file extensions are:\ngzip (.gz, .z, .tgz, .taz)\nbzip (.bz)\nbzip2 (.bz2)\nzip (.zip .jar)\ntar (.tar)\n\nThis plugin relies on the following utilities being present (if trying to process the corresponding formats):\ngunzip (for gzip)\nbunzip (for bzip)\nbunzip2 \nunzip (for zip)\ntar (for tar)",
     61        'inherits' => "yes" };
    6262
    6363sub new {
     
    6565    my $self = new BasPlug ("ZIPPlug", @_);
    6666
    67     # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    68     my $option_list = $self->{'option_list'};
    69     push( @{$option_list}, $options );
     67    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
     68    my $option_list = $self->{'option_list'};
     69    push( @{$option_list}, $options );
    7070
    7171    return bless $self, $class;
Note: See TracChangeset for help on using the changeset viewer.