Changeset 4744
- Timestamp:
- 2003-06-20T14:22:34+12:00 (21 years ago)
- Location:
- trunk/gsdl/perllib/plugins
- Files:
-
- 30 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/ArcPlug.pm
r3540 r4744 41 41 } 42 42 43 my $options = 44 { 'name' => "ArcPlug", 45 'desc' => "Plugin which recurses through an archives.inf file (i.e. the file generated in the archives directory when an import is done), processing each file it finds.", 46 'inherits' => "Yes" }; 43 my $options = { 'name' => "ArcPlug", 44 'desc' => "Plugin which recurses through an archives.inf file (i.e. the file generated in the archives directory when an import is done), processing each file it finds.", 45 'inherits' => "Yes" }; 47 46 48 47 sub new { -
trunk/gsdl/perllib/plugins/BasPlug.pm
r3834 r4744 45 45 46 46 my $unicode_list = 47 [ { 'name' => "auto",48 49 { 'name' => "ascii",50 51 { 'name' => "utf8",52 53 { 'name' => "unicode",54 47 [ { 'name' => "auto", 48 'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." } , 49 { 'name' => "ascii", 50 'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." }, 51 { 'name' => "utf8", 52 'desc' => "either utf8 or unicode -- automatically detected." }, 53 { 'name' => "unicode", 54 'desc' => "just unicode" } ]; 55 55 56 56 my $arguments = 57 57 [ { 'name' => "process_exp", 58 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e.all documents ending in .htm or .html (case-insensitive).",58 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 59 59 'type' => "string", 60 60 'deft' => "", … … 66 66 'reqd' => "no" }, 67 67 { 'name' => "input_encoding", 68 'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8. The default input_encoding is 'auto'.",68 'desc' => "The encoding of the source documents. 
Documents will be converted from these encodings and stored internally as utf8.", 69 69 'type' => "enum", 70 70 'list' => $unicode_list, … … 72 72 'deft' => "auto" } , 73 73 { 'name' => "default_encoding", 74 'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone. The default is iso_8859_1.", 75 'type' => "flag", 76 'reqd' => "no" }, 74 'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone.", 75 'type' => "enum", 76 'reqd' => "no", 77 'deft' => "utf8" }, 77 78 { 'name' => "extract_language", 78 79 'desc' => "Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.", … … 107 108 'desc' => "The maximum historical date to be used as metadata (in a Common Era date, such as 1950).", 108 109 'type' => "int", 110 'deft' => (localtime)[5]+1900, 109 111 'reqd' => "no"}, 110 112 { 'name' => "maximum_century", 111 113 'desc' => "The maximum named century to be extracted as historical metadata (e.g. 
14 will extract all references up to the 14th century).", 112 114 'type' => "int", 115 'deft' => "-1", 113 116 'reqd' => "no" }, 114 117 { 'name' => "no_bibliography", … … 196 199 } 197 200 201 202 # sub print_usage_new 203 # { 204 # } 205 206 207 sub print_usage_new 208 { 209 local $self = shift(@_); 210 local $optionlist = $self->{'option_list'}; 211 local $pluginoptions = pop(@$optionlist); 212 return if (!defined($pluginoptions)); 213 214 local $pluginname = $pluginoptions->{'name'}; 215 local $pluginargs = $pluginoptions->{'args'}; 216 217 # Produce the usage information using the data structure above 218 print STDERR " usage: plugin $pluginname"; 219 if (defined($pluginargs)) { 220 print STDERR " [options]"; 221 } 222 print STDERR "\n\n"; 223 224 # Display the plugin options, if there are some 225 if (defined($pluginargs)) { 226 # Find the length of the longest option string 227 local $maxlength = 0; 228 foreach $option (@$pluginargs) { 229 local $optionname = $option->{'name'}; 230 local $optiontype = $option->{'type'}; 231 232 local $optionstringlength = length($optionname); 233 if ($optiontype ne "flag") { 234 $optionstringlength = $optionstringlength + 3 + length($optiontype); 235 } 236 237 # Remember the longest 238 if ($optionstringlength > $maxlength) { 239 $maxlength = $optionstringlength; 240 } 241 } 242 243 # Calculate the column offset of the option descriptions 244 local $optiondescoffset = 3 + $maxlength + 2; 245 246 # Display the plugin options 247 print STDERR " options:\n"; 248 foreach $option (@$pluginargs) { 249 # Display option name 250 local $optionname = $option->{'name'}; 251 print STDERR " -$optionname"; 252 local $optionstringlength = 3 + length($optionname); 253 254 # Display option type, if the option is not a flag 255 local $optiontype = $option->{'type'}; 256 if ($optiontype ne "flag") { 257 print STDERR " <$optiontype>"; 258 $optionstringlength = $optionstringlength + (2 + length($optiontype) + 1); 259 } 260 261 # Display the 
option description 262 local $optiondesc = $option->{'desc'}; 263 &display_text_in_column($optiondesc, $optiondescoffset, $optionstringlength, 80); 264 265 # Show the default value for the option, if there is one 266 local $optiondefault = $option->{'deft'}; 267 if (defined($optiondefault)) { 268 print STDERR " " x $optiondescoffset; 269 print STDERR "Default: " . $optiondefault . "\n"; 270 } 271 272 # If the option has a list of possible values, display these 273 local $optionvalueslist = $option->{'list'}; 274 if (defined($optionvalueslist)) { 275 print STDERR "\n"; 276 foreach $optionvalue (@$optionvalueslist) { 277 local $optionvaluename = $optionvalue->{'name'}; 278 print STDERR " " x $optiondescoffset; 279 print STDERR "$optionvaluename:"; 280 281 local $optionvaluedesc = $optionvalue->{'desc'}; 282 &display_text_in_column($optionvaluedesc, ($optiondescoffset + 2), 283 $optiondescoffset + length($optionvaluename), 80); 284 } 285 } 286 287 # Special case for 'input_encoding' 288 if ($optionname =~ m/^input_encoding$/i) { 289 my $e = $encodings::encodings; 290 foreach $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) { 291 local $encodingname = $enc; 292 print STDERR " " x $optiondescoffset; 293 print STDERR "$enc:"; 294 295 local $encodingdesc = $e->{$enc}->{'name'}; 296 &display_text_in_column($encodingdesc, ($optiondescoffset + 2), 297 $optiondescoffset + length($encodingname), 80); 298 } 299 } 300 301 # Add a blank line to separate options 302 print STDERR "\n"; 303 } 304 } 305 306 # If the plugin inherits from another, do the parent now 307 if (defined($optionlist)) { 308 $self->print_usage_new(); 309 } 310 } 311 312 313 sub display_text_in_column 314 { 315 local ($text, $columnbeg, $firstlineoffset, $columnend) = @_; 316 317 # Spaces are put *before* words, so treat the column beginning as 1 smaller than it is 318 $columnbeg = $columnbeg - 1; 319 320 # Add some padding (if needed) for the first line 321 local $linelength = $columnbeg; 322 
if ($firstlineoffset < $columnbeg) { 323 print STDERR " " x ($columnbeg - $firstlineoffset); 324 } 325 else { 326 $linelength = $firstlineoffset; 327 } 328 329 # Break the text into words, and display one at a time 330 local @words = split(/ /, $text); 331 332 foreach $word (@words) { 333 # Unescape '<' and '>' characters 334 $word =~ s/&lt;/</g; 335 $word =~ s/&gt;/>/g; 336 337 # If printing this word would except the column end, start a new line 338 if (($linelength + length($word)) >= $columnend) { 339 print STDERR "\n"; 340 print STDERR " " x $columnbeg; 341 $linelength = $columnbeg; 342 } 343 344 # Write the word 345 print STDERR " " . $word; 346 $linelength = $linelength + (length($word) + 1); 347 } 348 349 print STDERR "\n"; 350 } 351 352 198 353 sub print_general_usage { 199 354 my ($plugin_name) = @_; … … 316 471 q^process_exp/.*/^, \$self->{'process_exp'}, 317 472 q^block_exp/.*/^, \$self->{'block_exp'}, 473 q^extract_language^, \$self->{'extract_language'}, 318 474 q^extract_acronyms^, \$self->{'extract_acronyms'}, 319 q^extract_keyphrases^, \$self->{'kea'}, #with extra options 320 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options 475 q^extract_keyphrases^, \$self->{'kea'}, #with extra options (UNDOCUMENTED) 476 q^extract_keyphrase_options/.*/^, \$self->{'kea_options'}, #no extra options (UNDOCUMENTED) 321 477 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'}, 322 478 qq^default_encoding/$denc/utf8^, \$self->{'default_encoding'}, -
trunk/gsdl/perllib/plugins/BibTexPlug.pm
r3587 r4744 52 52 53 53 my $arguments = 54 [ {'name' => "process_exp",55 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e.all documents ending in .htm or .html (case-insensitive).",54 [ { 'name' => "process_exp", 55 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 56 56 'type' => "string", 57 57 'reqd' => "no" , 58 'deft' => q^(?i)\.bib$^ } 59 ]; 60 61 my $options = 62 { 'name' => "BibTexPlug", 63 'desc' => "BibTexPlug reads bibliography files in BibTex format. BibTexPlug creates a document object for every reference in the file. It is a subclass of SplitPlug, so if there are multiple records, all are read.", 64 'inherits' => "Yes", 65 'args' => $arguments }; 58 'deft' => q^(?i)\.bib$^ } ]; 59 60 my $options = { 'name' => "BibTexPlug", 61 'desc' => "BibTexPlug reads bibliography files in BibTex format. BibTexPlug creates a document object for every reference in the file. It is a subclass of SplitPlug, so if there are multiple records, all are read.", 62 'inherits' => "Yes", 63 'args' => $arguments }; 66 64 67 65 # This plugin processes files with the suffix ".bib" -
trunk/gsdl/perllib/plugins/BookPlug.pm
r3540 r4744 61 61 62 62 my $arguments = 63 [ {'name' => "process_exp",64 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e.all documents ending in .htm or .html (case-insensitive).",63 [ { 'name' => "process_exp", 64 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 65 65 'type' => "string", 66 66 'reqd' => "no", 67 'deft' => q^(?i)\.hb$^},68 {'name' => "block_exp",67 'deft' => &get_default_process_exp() }, 68 { 'name' => "block_exp", 69 69 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 70 70 'type' => "string", 71 71 'reqd' => "no", 72 'deft' => q^\.jpg$^} 73 ]; 74 75 my $options = 76 { 'name' => "BookPlug", 77 'desc' => "Creates multi-level document from document containing <<TOC>> level tags. Metadata for each section is taken from any other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>> sets Title metadata. Everything else between TOC tags is treated as simple html (i.e. no processing of html links or any other HTMLPlug type stuff is done). Expects input files to have a .hb file extension by default (this can be changed by adding a -process_exp option a file with the same name as the hb file but a .jpg extension is taken as the cover image (jpg files are blocked by this plugin). BookPlug is a simplification (and extension) of the HBPlug used by the Humanity Library collections. 
BookPlug is faster as it expects the input files to be cleaner (The input to the HDL collections contains lots of excess html tags around <<TOC>> tags, uses <<I>> tags to specify images, and simply takes all text between <<TOC>> tags and start of text to be Title metadata). If you're marking up documents to be displayed in the same way as the HDL collections, use this plugin instead of HBPlug.", 78 'inherits' => "Yes", 79 'args' => $arguments }; 72 'deft' => &get_default_block_exp() } ]; 73 74 my $options = { 'name' => "BookPlug", 75 'desc' => "Creates multi-level document from document containing <<TOC>> level tags. Metadata for each section is taken from any other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>> sets Title metadata. Everything else between TOC tags is treated as simple html (i.e. no processing of html links or any other HTMLPlug type stuff is done). Expects input files to have a .hb file extension by default (this can be changed by adding a -process_exp option a file with the same name as the hb file but a .jpg extension is taken as the cover image (jpg files are blocked by this plugin). BookPlug is a simplification (and extension) of the HBPlug used by the Humanity Library collections. BookPlug is faster as it expects the input files to be cleaner (The input to the HDL collections contains lots of excess html tags around <<TOC>> tags, uses <<I>> tags to specify images, and simply takes all text between <<TOC>> tags and start of text to be Title metadata). If you're marking up documents to be displayed in the same way as the HDL collections, use this plugin instead of HBPlug.", 76 'inherits' => "Yes", 77 'args' => $arguments }; 80 78 81 79 sub new { -
trunk/gsdl/perllib/plugins/ConvertToPlug.pm
r3720 r4744 50 50 51 51 my $convert_to_list = 52 [ { 'name' => "html", 53 'desc' => "" }, 54 { 'name' => "text", 55 'desc' => "" } 56 ]; 52 [ { 'name' => "html", 53 'desc' => "HTML format" }, 54 { 'name' => "text", 55 'desc' => "Plain text format" } ]; 57 56 58 57 my $arguments = 59 [ {'name' => "convert_to",60 'desc' => "Plugin converts to TEXT or HTML (default html).",58 [ { 'name' => "convert_to", 59 'desc' => "Plugin converts to TEXT or HTML.", 61 60 'type' => "enum", 62 61 'reqd' => "no", 63 62 'list' => $convert_to_list, 64 'deft' => "html"} 65 ]; 66 67 my $options = 68 { 'name' => "ConvertToPlug", 69 'desc' => "The plugin is inherited by such plugins as WordPlug and PDFPlug. It facilitates the conversion of these document types to either HTML or TEXT by setting up variable that instruct ConvertToBasPlug how to work. It works by dynamically inheriting HTMLPlug or TEXTPlug based on the plugin argument 'convert_to'. If the argument is not present, the default is to inherit HTMLPlug.", 70 'inherits' => "Yes", 71 'args' => $arguments }; 63 'deft' => "html" }, 64 { 'name' => "use_strings", 65 'desc' => "If set, a simple strings function will be called to extract text if the conversion utility fails.", 66 'type' => "flag", 67 'reqd' => "no" } ]; 68 69 my $options = { 'name' => "ConvertToPlug", 70 'desc' => "The plugin is inherited by such plugins as WordPlug and PDFPlug. It facilitates the conversion of these document types to either HTML or TEXT by setting up variable that instruct ConvertToBasPlug how to work. It works by dynamically inheriting HTMLPlug or TEXTPlug based on the plugin argument 'convert_to'. 
If the argument is not present, the default is to inherit HTMLPlug.", 71 'inherits' => "Yes", 72 'args' => $arguments }; 72 73 73 74 … … 100 101 101 102 if (!parsargv::parse($args, 102 q^extract_keyphrases^, \$newargs->{'kea'}, #with extra options 103 q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'}, #no extra options 103 q^extract_keyphrases^, \$newargs->{'kea'}, #with extra options (undocumented) 104 q^extract_keyphrase_options/.*/^, \$newargs->{'kea_options'}, #no extra options (undocumented) 104 105 q^convert_to/(html|text)/html^, \$newargs->{'generate_format'}, 105 106 q^use_strings^, \$newargs->{'use_strings'}, … … 117 118 sub new { 118 119 my $class = shift (@_); 119 if ($class eq "ConvertToPlug") {$class = shift (@_);} 120 # print "Class: " . $class . "\n"; 121 # if ($class eq "ConvertToPlug") {$class = shift (@_);} 120 122 my $self; 121 123 # parsargv::parse might modify the list, so we do this by creating a copy -
trunk/gsdl/perllib/plugins/ConvertToRogPlug.pm
r3737 r4744 35 35 @ISA = ('RogPlug'); 36 36 } 37 38 my $options = { 'name' => "ConvertToRogPlug", 39 'desc' => "A plugin that inherits from RogPlug.", 40 'inherits' => "Yes" }; 37 41 38 42 sub print_usage { … … 71 75 sub new { 72 76 my $class = shift (@_); 73 if ($class eq "ConvertToRogPlug") {$class = shift (@_);} 77 # print "Class: " . $class . "\n"; 78 # if ($class eq "ConvertToRogPlug") {$class = shift (@_);} 74 79 my $self; 75 80 # parsargv::parse might modify the list, so we do this by creating a copy … … 81 86 $self->{'convert_to'} = "Rog"; 82 87 $self->{'convert_to_ext'} = "rog"; 88 89 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 90 my $option_list = $self->{'option_list'}; 91 push( @{$option_list}, $options ); 83 92 84 93 return bless $self, $class; -
trunk/gsdl/perllib/plugins/DBPlug.pm
r4429 r4744 44 44 } 45 45 46 my $arguments = [ { 'name' => "process_exp", 47 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 48 'type' => "string", 49 'deft' => q^(?i)\.dbi$^, 50 'reqd' => "no" } , 51 { 'name' => "title_sub", 52 'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PSPlug to remove \"Page 1\" etc from text used as the title.", 53 'type' => "string", 54 'reqd' => "no" }]; 46 my $arguments = 47 [ { 'name' => "process_exp", 48 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 49 'type' => "string", 50 'deft' => &get_default_process_exp(), 51 'reqd' => "no" } , 52 { 'name' => "title_sub", 53 'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PSPlug to remove \"Page 1\" etc from text used as the title.", 54 'type' => "string", 55 'deft' => "", 56 'reqd' => "no" }]; 55 57 56 58 my $options = { 'name' => "DBPlug", 57 58 59 59 'desc' => "Uses records from a database as documents.", 60 'inherits' => "yes", 61 'args' => $arguments }; 60 62 61 63 sub print_usage { -
trunk/gsdl/perllib/plugins/EMAILPlug.pm
r4224 r4744 90 90 91 91 my $arguments = 92 [ {'name' => "process_exp",93 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e.all documents ending in .htm or .html (case-insensitive).",92 [ { 'name' => "process_exp", 93 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 94 94 'type' => "string", 95 95 'reqd' => "no", 96 'deft' => q@([\\/]\d+|\.(mbx|email|eml))$@ 97 }, 98 { 'name' => "no_attachments", 96 'deft' => &get_default_process_exp() }, 97 { 'name' => "no_attachments", 99 98 'desc' => "Do not save message attachments.", 100 99 'type' => "flag", 101 'reqd' => "no" 102 }, 103 { 'name' => "block_exp", 104 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 100 'reqd' => "no" }, 101 { 'name' => "split_exp", 102 'desc' => "A perl regular expression used to split files containing many messages into individual documents.", 105 103 'type' => "string", 106 'reqd' => "no", 107 'deft' => q^^} 108 ]; 109 110 my $options = 111 { 'name' => "EMAILPlug", 112 'desc' => "Email plug reads email files. These are named with a simple number (i.e. 
as they appear in maildir folders) or with the extension .mbx (for mbox mail file format).\nDocument text: The document text consists of all the text after the first blank line in the document.\nMetadata (not Dublin Core!):\n\t\$Headers All the header content\n\t\$Subject Subject: header\n\t\$To To: header\n\t\$From From: header\n\t\$FromName Name of sender (where available)\n\t\$FromAddr E-mail address of sender\n\t\$DateText Date: header\n\t\$Date Date: header in GSDL format (eg: 19990924)", 113 'inherits' => "Yes", 114 'args' => $arguments }; 104 'deft' => "" } ]; 105 106 my $options = { 'name' => "EMAILPlug", 107 'desc' => "Email plug reads email files. These are named with a simple number (i.e. as they appear in maildir folders) or with the extension .mbx (for mbox mail file format).\nDocument text: The document text consists of all the text after the first blank line in the document.\nMetadata (not Dublin Core!):\n\t\$Headers All the header content\n\t\$Subject Subject: header\n\t\$To To: header\n\t\$From From: header\n\t\$FromName Name of sender (where available)\n\t\$FromAddr E-mail address of sender\n\t\$DateText Date: header\n\t\$Date Date: header in GSDL format (eg: 19990924)", 108 'inherits' => "Yes", 109 'args' => $arguments }; 115 110 116 111 # Create a new EMAILPlug object with which to parse a file. -
trunk/gsdl/perllib/plugins/ExcelPlug.pm
r2990 r4744 34 34 } 35 35 36 my $options = { 'name' => "ExcelPlug", 37 'desc' => "A plugin for importing Microsoft Excel files.", 38 'inherits' => "Yes" }; 39 36 40 sub new { 37 41 my $class = shift (@_); … … 45 49 # $self->{'input_encoding'} = "utf8"; 46 50 # } 51 52 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 53 my $option_list = $self->{'option_list'}; 54 push( @{$option_list}, $options ); 47 55 48 56 return bless $self, $class; -
trunk/gsdl/perllib/plugins/FOXPlug.pm
r3540 r4744 38 38 use unicode; 39 39 use cnseg; 40 use gb;40 # use gb; 41 41 42 42 -
trunk/gsdl/perllib/plugins/HBPlug.pm
r3542 r4744 73 73 74 74 $self->BasPlug::init($verbosity, $outhandle); 75 $self->{'input_encoding'} = "iso_8859_1"; 75 76 76 77 # this plugin only handles ascii encodings -
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r3708 r4744 48 48 } 49 49 50 my $arguments = [ { 'name' => "process_exp", 51 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 52 'type' => "string", 53 'deft' => q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php|\.cgi|.+\?.+=.*)$^ }, 54 { 'name' => "block_exp", 55 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 56 'type' => 'string', 57 'deft' => q^(?i)\.(gif|jpe?g|png|css)$^ }, 58 { 'name' => "nolinks", 59 'desc' => "Don't make any attempt to trap links (setting this flag may improve speed of building/importing but any relative links within documents will be broken).", 60 'type' => "flag" }, 61 { 'name' => "keep_head", 62 'desc' => "Don't remove headers from html files.", 63 'type' => "flag" }, 64 { 'name' => "no_metadata", 65 'desc' => "Don't attempt to extract any metadata from files.", 66 'type' => "flag" }, 67 { 'name' => "metadata_fields", 68 'desc' => "Comma separated list of metadata fields to attempt to extract. Defaults to 'Title'. Use 'tag<tagname>' to have the contents of the first <tagname > pair put in a metadata element called 'tagname'. Capitalise this as you want the metadata capitalised in Greenstone, since the tag extraction is case insensitive.", 69 'type' => "metadatum", 70 'deft' => "" }, 71 { 'name' => "hunt_creator_metadata", 72 'desc' => "Find as much metadata as possible on authorship and place it in the 'Creator' field. 
Requires the -metadata_fields flag.", 73 'type' => "flag" }, 74 { 'name' => "file_is_url", 75 'desc' => "Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure.", 76 'type' => "flag" }, 77 { 'name' => "assoc_files", 78 'desc' => "Perl regular expression of file extensions to associate with html documents. Defaults to '(?i)\.(jpe?g|gif|png|css)\$'", 79 'type' => "string", 80 'deft' => q^(?i)\.(jpe?g|gif|png|css)\$^ }, 81 { 'name' => "rename_assoc_files", 82 'desc' => "Renames files associated with documents (e.g. images). Also creates much shallower directory structure (useful when creating collections to go on cd-rom).", 83 'type' => "flag" } , 84 { 'name' => "title_sub", 85 'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PDFPlug to remove \"Page 1\", etc from text used as the title.", 86 'type' => "string" } , 87 { 'name' => "description_tags", 88 'desc' => "Split document into sub-sections where <Section> tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the <Section> tags. Also, '-keep_head' will have no effect when this option is set.", 89 'type' => "flag" } ]; 50 my $arguments = 51 [ { 'name' => "process_exp", 52 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 53 'type' => "string", 54 'deft' => &get_default_process_exp() }, 55 { 'name' => "block_exp", 56 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. 
by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 57 'type' => 'string', 58 'deft' => &get_default_block_exp() }, 59 { 'name' => "nolinks", 60 'desc' => "Don't make any attempt to trap links (setting this flag may improve speed of building/importing but any relative links within documents will be broken).", 61 'type' => "flag" }, 62 { 'name' => "keep_head", 63 'desc' => "Don't remove headers from html files.", 64 'type' => "flag" }, 65 { 'name' => "no_metadata", 66 'desc' => "Don't attempt to extract any metadata from files.", 67 'type' => "flag" }, 68 { 'name' => "metadata_fields", 69 'desc' => "Comma separated list of metadata fields to attempt to extract. Use 'tag<tagname>' to have the contents of the first <tagname> pair put in a metadata element called 'tagname'. Capitalise this as you want the metadata capitalised in Greenstone, since the tag extraction is case insensitive.", 70 'type' => "metadatum", 71 'deft' => "Title" }, 72 { 'name' => "hunt_creator_metadata", 73 'desc' => "Find as much metadata as possible on authorship and place it in the 'Creator' field. Requires the -metadata_fields flag.", 74 'type' => "flag" }, 75 { 'name' => "file_is_url", 76 'desc' => "Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure.", 77 'type' => "flag" }, 78 { 'name' => "assoc_files", 79 'desc' => "Perl regular expression of file extensions to associate with html documents.", 80 'type' => "string", 81 'deft' => q^(?i)\.(jpe?g|gif|png|css)$^ }, 82 { 'name' => "rename_assoc_files", 83 'desc' => "Renames files associated with documents (e.g. images). Also creates much shallower directory structure (useful when creating collections to go on cd-rom).", 84 'type' => "flag" }, 85 { 'name' => "title_sub", 86 'desc' => "Substitution expression to modify string stored as Title. 
Used by, for example, PDFPlug to remove \"Page 1\", etc from text used as the title.", 87 'type' => "string", 88 'deft' => "" }, 89 { 'name' => "description_tags", 90 'desc' => "Split document into sub-sections where <Section> tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the <Section> tags. Also, '-keep_head' will have no effect when this option is set.", 91 'type' => "flag" } ]; 90 92 91 93 my $options = { 'name' => "HTMLPlug", … … 94 96 'args' => $arguments }; 95 97 96 sub print_usage { 97 print STDERR "\n usage: plugin HTMLPlug [options]\n\n"; 98 print STDERR " options:\n"; 99 print STDERR " -nolinks Don't make any attempt to trap links (setting this\n"; 100 print STDERR " flag may improve speed of building/importing but\n"; 101 print STDERR " any relative links within documents will be broken).\n"; 102 print STDERR " -keep_head Don't remove headers from html files.\n"; 103 print STDERR " -no_metadata Don't attempt to extract any metadata from files.\n"; 104 print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to 105 extract. Defaults to 'Title'. 106 Use 'tag<tagname>' to have the contents of the first 107 <tagname> pair put in a metadata element called 108 'tagname'. Capitalise this as you want the metadata 109 capitalised in Greenstone, since the tag extraction 110 is case insensitive.\n"; 111 print STDERR " -hunt_creator_metadata Find as much metadata as possible on authorship and 112 place it in the 'Creator' field. Requires the 113 -metadata_fields flag.\n"; 114 print STDERR " -file_is_url Set if input filenames make up url of original source 115 documents e.g. if a web mirroring tool was used to 116 create the import directory structure\n"; 117 print STDERR " -assoc_files Perl regular expression of file extensions to 118 associate with html documents. 
119 Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n"; 120 print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images). 121 Also creates much shallower directory structure 122 (useful when creating collections to go on cd-rom).\n"; 123 print STDERR " -title_sub Substitution expression to modify string stored as 124 Title. Used by, for example, PDFPlug to remove 125 \"Page 1\", etc from text used as the title.\n"; 126 print STDERR " -description_tags Split document into sub-sections where <Section> tags 127 occur. Note that by setting this option you 128 implicitly set -no_metadata, as all metadata should 129 be included within the <Section> tags (this is only 130 true for documents that actually contain <Section> tags 131 however). Also, '-keep_head' will have no effect when 132 this option is set, regardless of whether a document 133 contains Section tags.\n"; 134 } 98 99 # sub print_usage { 100 # print STDERR "\n usage: plugin HTMLPlug [options]\n\n"; 101 # print STDERR " options:\n"; 102 # print STDERR " -nolinks Don't make any attempt to trap links (setting this\n"; 103 # print STDERR " flag may improve speed of building/importing but\n"; 104 # print STDERR " any relative links within documents will be broken).\n"; 105 # print STDERR " -keep_head Don't remove headers from html files.\n"; 106 # print STDERR " -no_metadata Don't attempt to extract any metadata from files.\n"; 107 # print STDERR " -metadata_fields Comma separated list of metadata fields to attempt to 108 # extract. Defaults to 'Title'. 109 # Use 'tag<tagname>' to have the contents of the first 110 # <tagname> pair put in a metadata element called 111 # 'tagname'. Capitalise this as you want the metadata 112 # capitalised in Greenstone, since the tag extraction 113 # is case insensitive.\n"; 114 # print STDERR " -hunt_creator_metadata Find as much metadata as possible on authorship and 115 # place it in the 'Creator' field. 
Requires the 116 # -metadata_fields flag.\n"; 117 # print STDERR " -file_is_url Set if input filenames make up url of original source 118 # documents e.g. if a web mirroring tool was used to 119 # create the import directory structure\n"; 120 # print STDERR " -assoc_files Perl regular expression of file extensions to 121 # associate with html documents. 122 # Defaults to '(?i)\.(jpe?g|gif|png|css)\$'\n"; 123 # print STDERR " -rename_assoc_files Renames files associated with documents (e.g. images). 124 # Also creates much shallower directory structure 125 # (useful when creating collections to go on cd-rom).\n"; 126 # print STDERR " -title_sub Substitution expression to modify string stored as 127 # Title. Used by, for example, PDFPlug to remove 128 # \"Page 1\", etc from text used as the title.\n"; 129 # print STDERR " -description_tags Split document into sub-sections where <Section> tags 130 # occur. Note that by setting this option you 131 # implicitly set -no_metadata, as all metadata should 132 # be included within the <Section> tags (this is only 133 # true for documents that actually contain <Section> tags 134 # however). Also, '-keep_head' will have no effect when 135 # this option is set, regardless of whether a document 136 # contains Section tags.\n"; 137 # } 135 138 136 139 sub new { -
trunk/gsdl/perllib/plugins/ImagePlug.pm
r4724 r4744 32 32 } 33 33 34 35 36 my $arguments = [ { 'name' => "process_exp", 37 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 38 'type' => "string", 39 'deft' => q^(?i)(\.jpe?g|\.gif|\.png|\.bmp|\.xbm|\.tif?f)$^, 40 'reqd' => "no" }, 41 { 'name' => "noscaleup", 42 'desc' => "Don't scale up small images when making thumbnails.", 43 'type' => "flag", 44 'reqd' => "no" }, 45 { 'name' => "thumbnailsize", 46 'desc' => "Make thumbnails of size nxn.", 47 'type' => "int", 48 'reqd' => "no" }, 49 { 'name' => "thumbnailtype", 50 'desc' => "Make thumbnails in format 's'.", 51 'type' => "string", 52 'reqd' => "no" }, 53 { 'name' => "screenviewsize", 54 'desc' => "If set, makes an image of size n for screen display and sets Screen, ScreenSize, ScreenWidth and ScreenHeight metadata. By default it is not set.", 55 'type' => "int", 56 'reqd' => "no" }, 57 { 'name' => "screenviewtype", 58 'desc' => "If -screenviewsize is set, this sets the screen display image type. Defaults to jpg.", 59 'type' => "string", 60 'deft' => "jpg", 61 'reqd' => "no" }, 62 { 'name' => "convertto", 63 'desc' => "Convert main image to.", 64 'type' => "string", 65 'reqd' => "no" }, 66 { 'name' => "minimumsize", 67 'desc' => "Ignore images smaller than n bytes.", 68 'type' => "int", 69 'reqd' => "no" } ]; 34 my $arguments = 35 [ { 'name' => "process_exp", 36 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. 
For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 37 'type' => "string", 38 'deft' => &get_default_process_exp(), 39 'reqd' => "no" }, 40 { 'name' => "noscaleup", 41 'desc' => "Don't scale up small images when making thumbnails.", 42 'type' => "flag", 43 'reqd' => "no" }, 44 { 'name' => "thumbnailsize", 45 'desc' => "Make thumbnails of size nxn.", 46 'type' => "int", 47 'deft' => "100", 48 'reqd' => "no" }, 49 { 'name' => "thumbnailtype", 50 'desc' => "Make thumbnails in format 's'.", 51 'type' => "string", 52 'deft' => "gif", 53 'reqd' => "no" }, 54 { 'name' => "screenviewsize", 55 'desc' => "If set, makes an image of size n for screen display and sets Screen, ScreenSize, ScreenWidth and ScreenHeight metadata. By default it is not set.", 56 'type' => "int", 57 'deft' => "0", 58 'reqd' => "no" }, 59 { 'name' => "screenviewtype", 60 'desc' => "If -screenviewsize is set, this sets the screen display image type.", 61 'type' => "string", 62 'deft' => "jpg", 63 'reqd' => "no" }, 64 { 'name' => "converttotype", 65 'desc' => "Convert main image to.", 66 'type' => "string", 67 'deft' => "", 68 'reqd' => "no" }, 69 { 'name' => "minimumsize", 70 'desc' => "Ignore images smaller than n bytes.", 71 'type' => "int", 72 'deft' => "100", 73 'reqd' => "no" } ]; 70 74 71 75 my $options = { 'name' => "ImagePlug", -
trunk/gsdl/perllib/plugins/MACROPlug.pm
r3856 r4744 34 34 } 35 35 36 my $arguments = [ { 'name' => "process_exp", 37 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 38 'type' => "string", 39 'deft' => q^(?i)\.dm$^, 40 'reqd' => "no" }]; 36 my $arguments = 37 [ { 'name' => "process_exp", 38 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 39 'type' => "string", 40 'deft' => &get_default_process_exp(), 41 'reqd' => "no" } ]; 41 42 42 43 my $options = { 'name' => "MACROPlug", … … 100 101 push( @{$option_list}, $options ); 101 102 102 $self->{'lang_abbr'} = load_language_table();103 # $self->{'lang_abbr'} = load_language_table(); 103 104 104 105 return bless $self, $class; -
trunk/gsdl/perllib/plugins/MARCPlug.pm
r3508 r4744 37 37 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan"); 38 38 } 39 40 my $arguments = 41 [ { 'name' => "metadata_mapping", 42 'desc' => "Name of file that includes mapping details from MARC values to Greenstone metadata names. Defaults to 'marctodc.txt' found in the site's etc directory.", 43 'type' => "string", 44 'deft' => "marctodc.txt", 45 'reqd' => "no" } ]; 46 47 my $options = { 'name' => "MARCPlug", 48 'desc' => "", 49 'inherits' => "Yes", 50 'args' => $arguments }; 39 51 40 52 use MARC::Record; … … 67 79 68 80 $self->{'mm_file'} = $metadata_mapping; # relative to etc dir 81 82 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 83 my $option_list = $self->{'option_list'}; 84 push( @{$option_list}, $options ); 69 85 70 86 return bless $self, $class; -
trunk/gsdl/perllib/plugins/PDFPlug.pm
r4103 r4744 32 32 } 33 33 34 my $arguments = [ { 'name' => "process_exp", 35 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 36 'type' => "string", 37 'deft' => q^(?i)\.pdf$^, 38 'reqd' => "no" }, 39 { 'name' => "block_exp", 40 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 41 'type' => 'string', 42 'deft' => q^^ }, 43 { 'name' => "noimages", 44 'desc' => "Don't attempt to extract images from PDF.", 45 'type' => "flag" }, 46 { 'name' => "complex", 47 'desc' => "Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).", 48 'type' => "flag" }, 49 { 'name' => "nohidden", 50 'desc' => "Prevent pdftohtml from attempting to extract hidden text. This is only useful if the -complex option is also set.", 51 'type' => "flag" }, 52 { 'name' => "zoom", 53 'desc' => "The factor by which to zoomthe PDF for output (this is only useful if -complex is set).", 54 'type' => "int" } 55 ]; 34 my $arguments = 35 [ { 'name' => "process_exp", 36 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. 
For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 37 'type' => "string", 38 'deft' => &get_default_process_exp(), 39 'reqd' => "no" }, 40 { 'name' => "block_exp", 41 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 42 'type' => "string", 43 'deft' => q^^ }, 44 { 'name' => "noimages", 45 'desc' => "Don't attempt to extract images from PDF.", 46 'type' => "flag" }, 47 { 'name' => "complex", 48 'desc' => "Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).", 49 'type' => "flag" }, 50 { 'name' => "nohidden", 51 'desc' => "Prevent pdftohtml from attempting to extract hidden text. This is only useful if the -complex option is also set.", 52 'type' => "flag" }, 53 { 'name' => "zoom", 54 'desc' => "The factor by which to zoom the PDF for output (this is only useful if -complex is set).", 55 'deft' => "2", 56 'type' => "int" }, 57 { 'name' => "use_sections", 58 'desc' => "Create a separate section for each page of the PDF file.", 59 'type' => "flag" } ]; 56 60 57 61 my $options = { 'name' => "PDFPlug", -
trunk/gsdl/perllib/plugins/PPTPlug.pm
r2981 r4744 34 34 } 35 35 36 my $options = { 'name' => "PPTPlug", 37 'desc' => "A plugin for importing Microsoft PowerPoint files.", 38 'inherits' => "Yes" }; 39 36 40 sub new { 37 41 my $class = shift (@_); … … 43 47 $self->{'input_encoding'} = "utf8"; 44 48 } 49 50 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 51 my $option_list = $self->{'option_list'}; 52 push( @{$option_list}, $options ); 45 53 46 54 return bless $self, $class; -
trunk/gsdl/perllib/plugins/PSPlug.pm
r3540 r4744 35 35 } 36 36 37 my $arguments = [ { 'name' => "process_exp", 38 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 39 'type' => "string", 40 'deft' => q^(?i)\.ps$^, 41 'reqd' => "no" }, 42 { 'name' => "block_exp", 43 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 44 'type' => 'string', 45 'deft' => q^(?i)\.(eps)$^ } 46 ]; 37 my $arguments = 38 [ { 'name' => "process_exp", 39 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 40 'type' => "string", 41 'deft' => &get_default_process_exp(), 42 'reqd' => "no" }, 43 { 'name' => "block_exp", 44 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. 
by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 45 'type' => 'string', 46 'deft' => &get_default_block_exp() }, 47 { 'name' => "extract_date", 48 'desc' => "Extract date from PS header.", 49 'type' => "flag" }, 50 { 'name' => "extract_pages", 51 'desc' => "Extract pages from PS header.", 52 'type' => "flag" }, 53 { 'name' => "extract_title", 54 'desc' => "Extract title from PS header.", 55 'type' => "flag" } ]; 47 56 48 57 my $options = { 'name' => "PSPlug", -
trunk/gsdl/perllib/plugins/RTFPlug.pm
r3540 r4744 35 35 } 36 36 37 my $arguments = [ { 'name' => "process_exp",38 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).",39 'type' => "string",40 'deft' => q^(?i)\.rtf$^,41 'reqd' => "no" }42 37 my $arguments = 38 [ { 'name' => "process_exp", 39 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 40 'type' => "string", 41 'deft' => &get_default_process_exp(), 42 'reqd' => "no" } ]; 43 43 44 44 my $options = { 'name' => "RTFPlug", -
trunk/gsdl/perllib/plugins/RecPlug.pm
r3540 r4744 106 106 use XML::Parser; 107 107 108 my $arguments = [ { 'name' => "block_exp", 109 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 110 'type' => "string", 111 'deft' => "CVS", 112 'reqd' => "no" }, 113 { 'name' => "use_metadata_files", 114 'desc' => "Read metadata from metadata XML files.", 115 'type' => "flag", 116 'reqd' => "no" } ]; 108 my $arguments = 109 [ { 'name' => "block_exp", 110 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 111 'type' => "string", 112 'deft' => &get_default_block_exp(), 113 'reqd' => "no" }, 114 { 'name' => "use_metadata_files", 115 'desc' => "Read metadata from metadata XML files.", 116 'type' => "flag", 117 'reqd' => "no" } ]; 117 118 118 119 my $options = { 'name' => "RecPlug", 119 'desc' => "RecPlug is a plugin which recurses through directories processing 120 # each file it finds. For detailed comments edit <GSDLHOME>/perllib/plugins/RecPlug.pm .", 121 'inherits' => "yes", 122 'args' => $arguments }; 120 'desc' => "RecPlug is a plugin which recurses through directories processing each file it finds.", 121 'inherits' => "yes", 122 'args' => $arguments }; 123 123 124 124 sub print_usage { -
trunk/gsdl/perllib/plugins/ReferPlug.pm
r3540 r4744 72 72 } 73 73 74 my $arguments = [ { 'name' => "process_exp", 75 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 76 'type' => "string", 77 'deft' => q^(?i)\.bib$^, 78 'reqd' => "no" } ]; 74 my $arguments = 75 [ { 'name' => "process_exp", 76 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 77 'type' => "string", 78 'deft' => &get_default_process_exp(), 79 'reqd' => "no" } ]; 79 80 80 81 my $options = { 'name' => "ReferPlug", -
trunk/gsdl/perllib/plugins/RogPlug.pm
r3737 r4744 36 36 } 37 37 38 my $options = { 'name' => "RogPlug", 39 'desc' => "Creates simple single-level documents from .rog or .mdb files.", 40 'inherits' => "Yes" }; 41 38 42 sub new { 39 43 my ($class) = @_; 40 44 $self = new BasPlug (); 45 46 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 47 my $option_list = $self->{'option_list'}; 48 push( @{$option_list}, $options ); 41 49 42 50 return bless $self, $class; -
trunk/gsdl/perllib/plugins/SRCPlug.pm
r3919 r4744 46 46 } 47 47 48 my $arguments = [ { 'name' => "process_exp", 49 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 50 'type' => "string", 51 'deft' => q^(Makefile.*|README.*|(?i)\.(c|cc|cpp|C|h|hpp|pl|pm|sh))$^, 52 'reqd' => "no" } , 53 { 'name' => "block_exp", 54 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 55 'type' => 'string', 56 'deft' => q^(?i)\.(o|obj|a|so|dll)$^, 57 'reqd' => "no" } , 58 { 'name' => "remove_prefix", 59 'desc' => "Remove this leading pattern from the filename (eg -remove_prefix /tmp/XX/src/). The default is to remove the whole path from the filename.", 60 'type' => 'string', 61 'reqd' => "no" } ]; 48 my $arguments = 49 [ { 'name' => "process_exp", 50 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 51 'type' => "string", 52 'deft' => &get_default_process_exp(), 53 'reqd' => "no" } , 54 { 'name' => "block_exp", 55 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. 
by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 56 'type' => 'string', 57 'deft' => &get_default_block_exp(), 58 'reqd' => "no" } , 59 { 'name' => "remove_prefix", 60 'desc' => "Remove this leading pattern from the filename (eg -remove_prefix /tmp/XX/src/). The default is to remove the whole path from the filename.", 61 'type' => 'string', 62 'deft' => "", 63 'reqd' => "no" } ]; 62 64 63 65 my $options = { 'name' => "SRCPlug", 64 66 'desc' => "Filename is currently used for Title ( optionally minus some prefix ). Current languages:\ntext: READMEs/Makefiles\nC/C++ (currently extracts #include statements and C++ class decls)\nPerl (currently only done as text)\nShell (currently only done as text)", 65 66 67 'inherits' => "yes", 68 'args' => $arguments }; 67 69 68 70 sub print_usage { -
trunk/gsdl/perllib/plugins/SplitPlug.pm
r3540 r4744 49 49 } 50 50 51 my $arguments = 52 [ { 'name' => "split_exp", 53 'desc' => "A perl regular expression to split input files into segments.", 54 'type' => "string", 55 'deft' => &get_default_split_exp(), 56 'reqd' => "no" } 57 ]; 58 51 59 my $options = { 'name' => "SplitPlug", 52 'desc' => "SplitPlug is a plugin for splitting input files into segments that will then be individually processed. This plugin should not be called directly. Instead, if you need to process input files that contain several documents, you should write a plugin with a process function that will handle one of those documents and have it inherit from SplitPlug. See ReferPlug for an example.", 53 'inherits' => "yes" }; 60 'desc' => "SplitPlug is a plugin for splitting input files into segments that will then be individually processed. This plugin should not be called directly. Instead, if you need to process input files that contain several documents, you should write a plugin with a process function that will handle one of those documents and have it inherit from SplitPlug. See ReferPlug for an example.", 61 'inherits' => "yes", 62 'args' => $arguments }; 54 63 55 64 -
trunk/gsdl/perllib/plugins/TEXTPlug.pm
r3932 r4744 39 39 } 40 40 41 my $arguments = [ { 'name' => "process_exp", 42 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 43 'type' => "string", 44 'deft' => q^(?i)\.te?xt$^, 45 'reqd' => "no" } , 46 { 'name' => "title_sub", 47 'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PSPlug to remove \"Page 1\" etc from text used as the title.", 48 'type' => "string", 49 'reqd' => "no" }]; 41 my $arguments = 42 [ { 'name' => "process_exp", 43 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 44 'type' => "string", 45 'deft' => &get_default_process_exp(), 46 'reqd' => "no" } , 47 { 'name' => "title_sub", 48 'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PSPlug to remove \"Page 1\" etc from text used as the title.", 49 'type' => "string", 50 'deft' => "", 51 'reqd' => "no" } ]; 50 52 51 53 my $options = { 'name' => "TEXTPlug", 52 53 54 54 'desc' => "Creates simple single-level document. Adds Title metadata of first line of text (up to 100 characters long).", 55 'inherits' => "yes", 56 'args' => $arguments }; 55 57 56 58 sub print_usage { … … 68 70 my $self = new BasPlug ($class, @_); 69 71 70 71 72 72 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 73 my $option_list = $self->{'option_list'}; 74 push( @{$option_list}, $options ); 73 75 74 76 if (!parsargv::parse(\@_, -
trunk/gsdl/perllib/plugins/UnknownPlug.pm
r2883 r4744 63 63 @ISA = ('BasPlug'); 64 64 } 65 66 my $arguments = 67 [ { 'name' => "assoc_field", 68 'desc' => "Name of the metadata field that will hold the associated file's name.", 69 'type' => "string", 70 'deft' => "", 71 'reqd' => "no" } , 72 { 'name' => "file_type", 73 'desc' => "Mime type of the file (e.g. image/gif)", 74 'type' => "string", 75 'deft' => "", 76 'reqd' => "no" } ]; 77 78 my $options = { 'name' => "UnknownPlug", 79 'desc' => "This is a simple Plugin for importing files in formats that Greenstone doesn't know anything about. A fictional document will be created for every such file, and the file itself will be passed to Greenstone as the \"associated file\" of the document.", 80 'inherits' => "yes", 81 'args' => $arguments }; 65 82 66 83 sub print_usage { … … 78 95 my $self = new BasPlug ($class, @_); 79 96 97 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 98 my $option_list = $self->{'option_list'}; 99 push( @{$option_list}, $options ); 100 80 101 if (!parsargv::parse(\@_, 81 102 q^assoc_field/.*/^, \$self->{'assoc_field'}, -
trunk/gsdl/perllib/plugins/W3ImgPlug.pm
r2996 r4744 122 122 @ISA = qw( HTMLPlug ); 123 123 } 124 125 my $aggressiveness_list = 126 [ { 'name' => "1", 127 'desc' => "Filename, path, ALT text only." }, 128 { 'name' => "2", 129 'desc' => "All of 1, plus caption where available." }, 130 { 'name' => "3", 131 'desc' => "All of 2, plus near paragraphs where available." }, 132 { 'name' => "4", 133 'desc' => "All of 3, plus previous headers (<h1>, <h2>...) where available." }, 134 { 'name' => "5", 135 'desc' => "All of 4, plus textual references where available." }, 136 { 'name' => "6", 137 'desc' => "All of 4, plus page metatags (title, keywords, etc)." }, 138 { 'name' => "7", 139 'desc' => "All of 6, 5 and 4 combined." }, 140 { 'name' => "8", 141 'desc' => "All of 7, plus repeat caption, filename, etc (raise ranking of more relevant results)." }, 142 { 'name' => "9", 143 'desc' => "All of 1, plus full text of source page." } ]; 144 145 my $arguments = 146 [ { 'name' => "aggressiveness", 147 'desc' => "Range of related text extraction techniques to use.", 148 'type' => "int", 149 'list' => $aggressiveness_list, 150 'deft' => "3", 151 'reqd' => "no" }, 152 { 'name' => "index_pages", 153 'desc' => "Index the pages along with the images. Otherwise reference the pages at the source URL.", 154 'type' => "flag", 155 'reqd' => "no" }, 156 { 'name' => "no_cache_images", 157 'desc' => "Don't cache images (point to URL of original)", 158 'type' => "flag", 159 'reqd' => "no" }, 160 { 'name' => "min_size", 161 'desc' => "Bytes. Skip images smaller than this.", 162 'type' => "int", 163 'deft' => "2000", 164 'reqd' => "no" }, 165 { 'name' => "min_width", 166 'desc' => "Pixels. Skip images narrower than this.", 167 'type' => "int", 168 'deft' => "50", 169 'reqd' => "no" }, 170 { 'name' => "min_height", 171 'desc' => "Pixels. Skip images shorter than this.", 172 'type' => "int", 173 'deft' => "50", 174 'reqd' => "no" }, 175 { 'name' => "thumb_size", 176 'desc' => "Max thumbnail size. 
Both width and height.", 177 'type' => "int", 178 'deft' => "100", 179 'reqd' => "no" }, 180 { 'name' => "convert_params", 181 'desc' => "Additional parameters for ImageMagicK convert on thumbnail creation. For example, '-raise' will give a three dimensional effect to thumbnail images.", 182 'type' => "string", 183 'deft' => "", 184 'reqd' => "no" }, 185 { 'name' => "min_near_text", 186 'desc' => "Minimum characters of near text or caption to extract.", 187 'type' => "int", 188 'deft' => "10", 189 'reqd' => "no" }, 190 { 'name' => "max_near_text", 191 'desc' => "Maximum characters near images to extract.", 192 'type' => "int", 193 'deft' => "400", 194 'reqd' => "no" }, 195 { 'name' => "smallpage_threshold", 196 'desc' => "Images on pages smaller than this (bytes) will have the page (title, keywords, etc) meta-data added.", 197 'type' => "int", 198 'deft' => "2048", 199 'reqd' => "no" }, 200 { 'name' => "textrefs_threshold", 201 'desc' => "Threshold for textual references. Lower values mean the algorithm is less strict.", 202 'type' => "int", 203 'deft' => "2", 204 'reqd' => "no" }, 205 { 'name' => "caption_length", 206 'desc' => "Maximum length of captions (in characters).", 207 'type' => "int", 208 'deft' => "80", 209 'reqd' => "no" }, 210 { 'name' => "neartext_length", 211 'desc' => "Target length of near text (in characters).", 212 'type' => "int", 213 'deft' => "300", 214 'reqd' => "no" }, 215 { 'name' => "document_text", 216 'desc' => "Add image text as document:text (otherwise IndexedText metadata field).", 217 'type' => "flag", 218 'reqd' => "no" } 219 ]; 220 221 my $options = { 'name' => "W3ImgPlug", 222 'desc' => "", 223 'inherits' => "yes", 224 'args' => $arguments }; 225 124 226 125 227 sub print_usage { … … 175 277 my $self = new HTMLPlug ($class, @_); 176 278 279 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 280 my $option_list = $self->{'option_list'}; 281 push( @{$option_list}, $options ); 282 177 283 if 
(!parsargv::parse(\@_, 178 284 q^aggressiveness/\d/3^, \$self->{'aggressiveness'}, … … 291 397 # etc/W3ImgPlug.cfg (XML) 292 398 # tag sets for captions and neartext 293 if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 10) {399 if ( $self->{'aggressiveness'} > 1 && $self->{'aggressiveness'} != 9 ) { 294 400 $self->{'delims'} = []; 295 401 $self->{'cdelims'} = []; … … 327 433 # get stop words for textual reference extraction 328 434 # TODO: warnings scroll off. Would be best to output them again at end of import 329 if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 10) {435 if ( $self->{'aggressiveness'} >=5 && $self->{'aggressiveness'} != 9 ) { 330 436 $self->{'stopwords'} = (); 331 437 $filepath = &util::filename_cat($ENV{'GSDLHOME'}, "etc", "packages", "phind", "stopword", "en", "brown.sw"); -
trunk/gsdl/perllib/plugins/WordPlug.pm
r3540 r4744 34 34 } 35 35 36 my $arguments = [ { 'name' => "process_exp", 37 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 38 'type' => "string", 39 'deft' => q^(?i)\.doc$^, 40 'reqd' => "no" } ]; 36 my $arguments = 37 [ { 'name' => "process_exp", 38 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 39 'type' => "string", 40 'deft' => &get_default_process_exp(), 41 'reqd' => "no" } ]; 41 42 42 43 my $options = { 'name' => "WordPlug", 43 'desc' => "",44 45 44 'desc' => "A plugin for importing Microsoft Word documents.", 45 'inherits' => "yes", 46 'args' => $arguments }; 46 47 47 48 sub new { … … 50 51 my $self = new ConvertToPlug ($class, @_); 51 52 52 53 54 53 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 54 my $option_list = $self->{'option_list'}; 55 push( @{$option_list}, $options ); 55 56 56 57 # wvWare will always produce html files encoded as utf-8 -
trunk/gsdl/perllib/plugins/XMLPlug.pm
r3540 r4744 35 35 36 36 use XML::Parser; 37 my $arguments = [ { 'name' => "process_exp", 38 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 39 'type' => "string", 40 'deft' => q^(?i)\.xml$^, 41 'reqd' => "no" } ]; 37 38 my $arguments = 39 [ { 'name' => "process_exp", 40 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. For example, using '(?i).html?\$' matches all documents ending in .htm or .html (case-insensitive).", 41 'type' => "string", 42 'deft' => &get_default_process_exp(), 43 'reqd' => "no" } ]; 42 44 43 45 my $options = { 'name' => "XMLPlug", 44 'desc' => "",45 46 46 'desc' => "Base class for XML plugins.", 47 'inherits' => "yes", 48 'args' => $arguments }; 47 49 48 50 -
trunk/gsdl/perllib/plugins/ZIPPlug.pm
r3540 r4744 58 58 59 59 my $options = { 'name' => "ZIPPlug", 60 61 60 'desc' => "Plugin which handles compressed and/or archived input formats currently handled formats and file extensions are:\ngzip (.gz, .z, .tgz, .taz)\nbzip (.bz)\nbzip2 (.bz2)\nzip (.zip .jar)\ntar (.tar)\n\nThis plugin relies on the following utilities being present (if trying to process the corresponding formats):\ngunzip (for gzip)\nbunzip (for bzip)\nbunzip2 \nunzip (for zip)\ntar (for tar)", 61 'inherits' => "yes" }; 62 62 63 63 sub new { … … 65 65 my $self = new BasPlug ("ZIPPlug", @_); 66 66 67 68 69 67 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 68 my $option_list = $self->{'option_list'}; 69 push( @{$option_list}, $options ); 70 70 71 71 return bless $self, $class;
Note: See TracChangeset for help on using the changeset viewer.