Ignore:
Timestamp:
2002-11-18T17:43:56+13:00 (21 years ago)
Author:
kjdon
Message:

Added John T's changes into CVS: added the information needed to retrieve plugin usage info as XML.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r3515 r3540  
    4242use ghtml;
    4343
     44my $unicode_list =
     45[ { 'name' => "auto",
     46    'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." } ,
     47  { 'name' => "ascii",
     48    'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." },
     49  { 'name' => "utf8",
     50    'desc' => "either utf8 or unicode -- automatically detected." },
     51  { 'name' => "unicode",
     52    'desc' => "just unicode" } ];
     53
     54my $arguments =
     55    [ { 'name' => "process_exp",
     56    'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",
     57    'type' => "string",
     58    'deft' => "",
     59    'reqd' => "no" },
     60      { 'name' => "block_exp",
     61    'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
     62    'type' => 'string',
     63    'deft' => "",
     64    'reqd' => "no" },
     65      { 'name' => "input_encoding",
     66    'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8. The default input_encoding is 'auto'.",
     67    'type' => "enum",
     68    'list' => $unicode_list,
     69    'reqd' => "no" ,
     70    'deft' => "auto" } ,
     71      { 'name' => "default_encoding",
     72    'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone.  The default is iso_8859_1.",
     73    'type' => "flag",
     74    'reqd' => "no" },
     75      { 'name' => "extract_language",
     76    'desc' => "Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.",
     77    'type' => "flag",
     78    'reqd' => "no" },
     79      { 'name' => "default_language",
     80    'desc' => "If Greenstone fails to work out what language a document is the 'Language' metadata element will be set to this value. The default is 'en' (ISO 639 language symbols are used: en = English). Note that if -input_encoding is not set to 'auto' and -extract_language is not set, all documents will have their 'Language' metadata set to this value.",
     81    'type' => "language",
     82    'deft' => "en",
     83    'reqd' => "no" },
     84      { 'name' => "extract_acronyms",
     85    'desc' => "Extract acronyms from within text and set as metadata.",
     86    'type' => "flag",
     87    'reqd' => "no" },
     88      { 'name' => "markup_acronyms",
     89    'desc' => "Add acronym metadata into document text.",
     90    'type' => "flag",
     91    'reqd' => "no" },
     92      { 'name' => "first",
     93    'desc' => "Comma separated list of first sizes to extract from the text into a metadata field. The field is called 'FirstNNN'.",
     94    'type' => "string",
     95    'reqd' => "no" },
     96      { 'name' => "extract_email",
     97    'desc' => "Extract email addresses as metadata.",
     98    'type' => "flag",
     99    'reqd' => "no" },
     100      { 'name' => "extract_historical_years",
     101    'desc' => "Extract time-period information from historical documents.  This is stored as metadata with the document. There is a search interface for this metadata, which you can include in your collection by adding the statement, \"format QueryInterface DateSearch\" to your collection configuration file.",
     102    'type' => "flag",
     103    'reqd' => "no" },
     104      { 'name' => "maximum_year",
     105    'desc' => "The maximum historical date to be used as metadata (in a Common Era date, such as 1950).",
     106    'type' => "int",
     107    'reqd' => "no"},
     108      { 'name' => "maximum_century",
     109    'desc' => "The maximum named century to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century).",
     110    'type' => "int",
     111    'reqd' => "no" },
     112      { 'name' => "no_bibliography",
     113    'desc' => "Do not try and block bibliographic dates when extracting historical dates.",
     114    'type' => "flag",
     115    'reqd' => "no"},
     116      { 'name' => "cover_image",
     117    'desc' => "Will look for a prefix.jpg file (where prefix is the same prefix as the file being processed) and associate it as a cover image.",
     118    'type' => "flag",
     119    'reqd' => "no" } ];
     120
     121my $options = { 'name'     => "BasPlug",
     122        'desc'     => "Base class for all the import plugins.",
     123        'inherits' => "No",
     124        'args'     => $arguments,
     125        'process_exp' => "",
     126        'block_exp' => "" };
     127
     128sub print_xml_usage {
     129    my $self = shift (@_);
     130    print STDERR "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n";
     131    $self->print_xml();
     132}
     133
     134sub print_xml {
     135    my $self = shift (@_);
     136    my $option_list = $self->{'option_list'};
     137    my $option = pop( @{$option_list} );
     138    if(defined $option)
     139    {
     140    print STDERR "<PlugInfo>\n";
     141    print STDERR "  <Name>$option->{'name'}</Name>\n";
     142    print STDERR "  <Desc>$option->{'desc'}</Desc>\n";
     143    print STDERR "  <Inherits>$option->{'inherits'}</Inherits>\n";
     144    print STDERR "  <Arguments>\n";
     145    if(defined $option->{'args'})
     146    {
     147        my $args = $option->{'args'};
     148        my $x;
     149        foreach $x ( @{$args} )
     150        {
     151        print STDERR "    <Option>\n";
     152        print STDERR "      <Name>$x->{'name'}</Name>\n";
     153        print STDERR "      <Desc>$x->{'desc'}</Desc>\n";
     154        print STDERR "      <Type>$x->{'type'}</Type>\n";
     155        print STDERR "      <Required>$x->{'reqd'}</Required>\n";
     156        if(defined $x->{'list'})
     157        {
     158            print STDERR "      <List>\n";
     159            my $list = $x->{'list'};
     160            my $y;
     161            foreach $y ( @{$list} )
     162            {
     163            print STDERR "        <Value>\n";
     164            print STDERR "          <Name>$y->{'name'}</Name>\n";
     165            print STDERR "          <Desc>$y->{'desc'}</Desc>\n";
     166            print STDERR "        </Value>\n";
     167            }
     168            # Special case of 'input_encoding'
     169            if( $x->{'name'} =~ m/^input_encoding$/i ) {
     170            my $e = $encodings::encodings;
     171            foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) {
     172                print STDERR "        <Value>\n";
     173                print STDERR "          <Name>$enc</Name>\n";
     174                print STDERR "          <Desc>$e->{$enc}->{'name'}</Desc>\n";
     175                print STDERR "        </Value>\n";
     176            }
     177            }
     178            print STDERR "      </List>\n";
     179        }
     180        if(defined $x->{'deft'})
     181        {
     182            print STDERR "      <Default>$x->{'deft'}</Default>\n";
     183        }
     184        print STDERR "    </Option>\n";
     185        }
     186    }
     187    if(defined $option_list) {
     188        $self->print_xml();
     189    }
     190   
     191    print STDERR "  </Arguments>\n";
     192    print STDERR "</PlugInfo>\n";
     193    }
     194}
     195
    44196sub print_general_usage {
    45197    my ($plugin_name) = @_;
     
    154306    $self->{'num_blocked'} = 0;
    155307    $self->{'num_archives'} = 0;
     308   
     309    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
     310    $self->{'option_list'} = [ $options ];
    156311   
    157312    # general options available to all plugins
Note: See TracChangeset for help on using the changeset viewer.