Changeset 3540 for trunk/gsdl/perllib
- Timestamp:
- 2002-11-18T17:43:56+13:00 (22 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
- 35 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify/AZCompactList.pm
r3529 r3540 50 50 } 51 51 52 my $doclevel_list = 53 [ { 'name' => "top", 54 'desc' => "Whole document." } , 55 { 'name' => "section", 56 'desc' => "By sections." } 57 ]; 58 59 my $arguments = 60 [ { 'name' => "metadata", 61 'desc' => "Metadata field used for classification. List will be sorted by this element.", 62 'type' => "metadata", 63 'reqd' => "yes" } , 64 { 'name' => "buttonname", 65 'desc' => "Button name for this classification. Defaults to metadata name.", 66 'type' => "string", 67 'reqd' => "no" } , 68 { 'name' => "mingroup", 69 'desc' => "The smallest value that will cause a group in the hierarchy to form.", 70 'type' => "int", 71 'reqd' => "no" } , 72 { 'name' => "minnesting", 73 'desc' => "The smallest value that will cause a list to converted into nested list.", 74 'type' => "int", 75 'reqd' => "no" } , 76 { 'name' => "mincompact", 77 'desc' => "Used in compact list.", 78 'type' => "int", 79 'reqd' => "no" } , 80 { 'name' => "maxcompact", 81 'desc' => "Used in compact list.", 82 'type' => "int", 83 'reqd' => "no" } , 84 { 'name' => "doclevel", 85 'desc' => "Level to process document at.", 86 'type' => "enum", 87 'list' => $doclevel_list, 88 'reqd' => "no" } , 89 { 'name' => "onlyfirst", 90 'desc' => "Control whether all or only first metadata value used from array of metadata.", 91 'type' => "flag", 92 'reqd' => "no" } 93 ]; 94 95 my $options = 96 { 'name' => "AZCompactList", 97 'desc' => "Classifier plugin for sorting alphabetically", 98 'inherits' => "Yes", 99 'args' => $arguments }; 100 52 101 sub print_usage { 53 102 print STDERR " … … 72 121 my $self = new BasClas($class, @_); 73 122 123 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 124 my $option_list = $self->{'option_list'}; 125 push( @{$option_list}, $options ); 126 127 74 128 my ($metaname, $title, $removeprefix); 75 129 my $mingroup = 2; -
trunk/gsdl/perllib/classify/AZList.pm
r3510 r3540 35 35 @ISA = ('BasClas'); 36 36 } 37 38 my $arguments = [ { 'name' => "metadata", 39 'desc' => "Metadata field used for classification. List will be sorted by this element.", 40 'type' => "metadata", 41 'reqd' => "yes" } , 42 { 'name' => "buttonname", 43 'desc' => "Button name for this classification. Defaults to metadata name.", 44 'type' => "string", 45 'reqd' => "no" } , 46 { 'name' => "removeprefix", 47 'desc' => "A prefix to ignore in the Metadata values for the field when sorting.", 48 'type' => "string", 49 'reqd' => "no" } ]; 50 51 my $options = { 'name' => "AZList", 52 'desc' => "Classifier plugin for sorting alphabetically", 53 'inherits' => "Yes", 54 'args' => $arguments }; 37 55 38 56 sub print_usage { … … 55 73 my $class = shift (@_); 56 74 my $self = new BasClas($class, @_); 75 76 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 77 my $option_list = $self->{'option_list'}; 78 push( @{$option_list}, $options ); 57 79 58 80 my ($metaname, $title, $removeprefix); -
trunk/gsdl/perllib/classify/AZSectionList.pm
r2954 r3540 34 34 # to the classification 35 35 36 # 12/05/02 Added usage datastructure - John Thompson 37 36 38 package AZSectionList; 37 39 … … 43 45 } 44 46 47 my $arguments = 48 [ { 'name' => "metadata", 49 'desc' => "Metadata field used for classification. List will be sorted by this element.", 50 'type' => "metadata", 51 'reqd' => "yes" } , 52 { 'name' => "buttonname", 53 'desc' => "Button name for this classification. Defaults to metadata name.", 54 'type' => "string", 55 'reqd' => "no" } 56 ]; 57 58 my $options = 59 { 'name' => "AZSectionList", 60 'desc' => "Classifier plugin for sorting alphabetically. This is very similar to AZList except it sorts by section level metadata (excluding the top level) instead of just top level metadata. The only change is to the classify() subroutine which must now iterate through each section, adding each to the classification.", 61 'inherits' => "Yes", 62 'args' => $arguments }; 63 45 64 sub print_usage { 46 65 print STDERR " 47 usage: classify AZSectionList -metadata X[options]66 usage: classify AZSectionList [options] 48 67 options: 49 68 50 -metadata X (required) Metadata field used for classification.51 List will be sorted by this element.69 -metadata X (required) Metadata field used for classification, 70 list will be sorted by this element. 52 71 53 -buttonname X Button namefor this classification.54 defaults to metadataname.72 -buttonname X (OPTIONAL) Title field for this classification. 73 if not included title field will be Metaname. 55 74 56 -removeprefix regex A prefix to ignore in the Metadata values 57 for the field when sorting. 58 75 -removeprefix regex A prefix to ignore in the Metadata values 76 for the field when sorting. 59 77 This is very similar to AZList except it sorts by section level metadata 60 78 (excluding the top level) instead of just top level metadata. 
… … 65 83 my $class = shift (@_); 66 84 my $self = new AZList($class, @_); 85 86 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 87 my $option_list = $self->{'option_list'}; 88 push( @{$option_list}, $options ); 67 89 68 90 return bless $self, $class; … … 91 113 # if this section doesn't contain the metadata element we're 92 114 # sorting by we won't include it in this classification 115 93 116 if (defined $metavalue && $metavalue ne "") { 94 117 if ($self->{'removeprefix'}) { 95 118 $metavalue =~ s/^$self->{'removeprefix'}//; 96 119 } 97 98 120 if ($self->{'metaname'} eq 'Creator') { 99 121 &sorttools::format_string_name_english (\$metavalue); -
trunk/gsdl/perllib/classify/BasClas.pm
r1885 r3540 50 50 # display it. 51 51 52 # 09/05/02 Added usage datastructure - John Thompson 53 52 54 use parsargv; 55 56 my $verbosity_list = 57 [ { 'name' => "0", 58 'desc' => "" } , 59 { 'name' => "1", 60 'desc' => "" } , 61 { 'name' => "2", 62 'desc' => "" } , 63 { 'name' => "3", 64 'desc' => "" } 65 ]; 66 67 my $arguments = 68 [ { 'name' => "verbosity", 69 'desc' => "", 70 'type' => "enum", 71 'list' => $verbosity_list, 72 'deft' => "2", 73 'reqd' => "no" } ]; 74 75 my $options = 76 { 'name' => "BasClas", 77 'desc' => "Base class for all the classifiers.", 78 'inherits' => "No", 79 'args' => $arguments }; 80 81 sub print_xml_usage { 82 my $self = shift (@_); 83 print STDERR "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n"; 84 $self->print_xml(); 85 } 86 87 sub print_xml { 88 my $self = shift (@_); 89 my $option_list = $self->{'option_list'}; 90 my $option = pop( @{$option_list} ); 91 if(defined $option) 92 { 93 print STDERR "<ClassInfo>\n"; 94 print STDERR " <Name>$option->{'name'}</Name>\n"; 95 print STDERR " <Desc>$option->{'desc'}</Desc>\n"; 96 print STDERR " <Inherits>$option->{'inherits'}</Inherits>\n"; 97 print STDERR " <Arguments>\n"; 98 if(defined $option->{'args'}) 99 { 100 my $args = $option->{'args'}; 101 my $x; 102 foreach $x ( @{$args} ) 103 { 104 print STDERR " <Option>\n"; 105 print STDERR " <Name>$x->{'name'}</Name>\n"; 106 print STDERR " <Desc>$x->{'desc'}</Desc>\n"; 107 print STDERR " <Type>$x->{'type'}</Type>\n"; 108 print STDERR " <Required>$x->{'reqd'}</Required>\n"; 109 if(defined $x->{'list'}) 110 { 111 print STDERR " <List>\n"; 112 my $list = $x->{'list'}; 113 my $y; 114 foreach $y ( @{$list} ) 115 { 116 print STDERR " <Value>\n"; 117 print STDERR " <Name>$y->{'name'}</Name>\n"; 118 print STDERR " <Desc>$y->{'desc'}</Desc>\n"; 119 print STDERR " </Value>\n"; 120 } 121 # Special case of 'input_encoding' 122 if( $x->{'name'} =~ m/^input_encoding$/i ) { 123 my $e = $encodings::encodings; 124 foreach my $enc (sort {$e->{$a}->{'name'} cmp 
$e->{$b}->{'name'}} keys (%$e)) { 125 print STDERR " <Value>\n"; 126 print STDERR " <Name>$enc</Name>\n"; 127 print STDERR " <Desc>$e->{$enc}->{'name'}</Desc>\n"; 128 print STDERR " </Value>\n"; 129 } 130 } 131 print STDERR " </List>\n"; 132 } 133 if(defined $x->{'deft'}) 134 { 135 print STDERR " <Default>$x->{'deft'}</Default>\n"; 136 } 137 print STDERR " </Option>\n"; 138 } 139 } 140 if(defined $option_list) { 141 $self->print_xml(); 142 } 143 144 print STDERR " </Arguments>\n"; 145 print STDERR "</ClassInfo>\n"; 146 } 147 } 53 148 54 149 sub print_general_usage { … … 79 174 $self->{'outhandle'} = STDERR; 80 175 176 $self->{'option_list'} = [ $options ]; 177 81 178 # general options available to all classifiers 82 179 if (!parsargv::parse(\@_, -
trunk/gsdl/perllib/classify/Browse.pm
r2489 r3540 24 24 ########################################################################### 25 25 26 # 12/05/02 Added usage datastructure - John Thompson 27 26 28 use BasClas; 27 29 package Browse; … … 32 34 @ISA = ('BasClas'); 33 35 } 36 37 my $options = 38 { 'name' => "Browse", 39 'desc' => "", 40 'inherits' => "Yes" }; 34 41 35 42 sub print_usage { … … 43 50 my $class = shift (@_); 44 51 my $self = new BasClas($class, @_); 45 52 53 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 54 my $option_list = $self->{'option_list'}; 55 push( @{$option_list}, $options ); 46 56 47 57 # classifier information -
trunk/gsdl/perllib/classify/DateList.pm
r2916 r3540 33 33 # jrm21 - added option "bymonth", which splits by year and month. 34 34 35 # 12/05/02 Added usage datastructure - John Thompson 36 35 37 package DateList; 36 38 … … 41 43 @ISA = ('BasClas'); 42 44 } 45 46 my $arguments = 47 [ { 'name' => "bymonth", 48 'desc' => "Classify by year and month.", 49 'type' => "flag", 50 'reqd' => "no" } 51 ]; 52 53 my $options = 54 { 'name' => "DateList", 55 'desc' => "Classifier plugin for sorting by date. Always sorts by 'Date' metadata. Date is assumed to be in the form yyyymmdd.", 56 'inherits' => "Yes", 57 'args' => $arguments }; 43 58 44 59 sub print_usage { … … 48 63 -bymonth [or bymonth=1] Classify by year and month 49 64 50 Classifier plugin for sorting by date, and assumes that 'Date' metadata 51 exists. Date is assumed to be in the form yyyymmdd (all digits). 52 By default dates are classified by year. 53 65 Classifier plugin for sorting by date. 66 Always sorts by 'Date' metadata. 67 Date is assumed to be in the form yyyymmdd (all digits). 68 By default dates are split by year - this should change. 69 70 Any errors are Dana's problem. 54 71 "; 55 72 } … … 58 75 my $class = shift (@_); 59 76 my $self = new BasClas($class, @_); 77 78 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 79 my $option_list = $self->{'option_list'}; 80 push( @{$option_list}, $options ); 60 81 61 82 $self->{'list'} = {}; -
trunk/gsdl/perllib/classify/HTML.pm
r2022 r3540 31 31 # url=url -- the url of the web page to link to 32 32 33 # 12/05/02 Added usage datastructure - John Thompson 34 33 35 package HTML; 34 36 … … 38 40 @ISA = ('BasClas'); 39 41 } 42 43 my $arguments = 44 [ { 'name' => "url", 45 'desc' => "The url of the web page to link to.", 46 'type' => "string", 47 'reqd' => "yes" } , 48 { 'name' => "buttonname", 49 'desc' => "The title field for this classification. If not included title field 'Browse'.", 50 'type' => "string", 51 'reqd' => "no" } 52 ]; 53 54 my $options = 55 { 'name' => "HTML", 56 'desc' => "Creates an empty classification that's simply a link to a web page.", 57 'inherits' => "Yes", 58 'args' => $arguments }; 40 59 41 60 sub print_usage { … … 55 74 my $class = shift (@_); 56 75 my $self = new BasClas($class, @_); 76 77 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 78 my $option_list = $self->{'option_list'}; 79 push( @{$option_list}, $options ); 57 80 58 81 my ($title, $url); -
trunk/gsdl/perllib/classify/Hierarchy.pm
r2973 r3540 41 41 # like an AZList classification) 42 42 43 # 12/05/02 Added usage datastructure - John Thompson 44 # 12/05/02 Modified new() so as not to die on error, only on init() - John Thompson 45 43 46 package Hierarchy; 44 47 … … 52 55 } 53 56 57 my $arguments = 58 [ { 'name' => "metadata", 59 'desc' => "Metadata field used for classification. List will be sorted by this element.", 60 'type' => "metadata", 61 'reqd' => "yes" } , 62 { 'name' => "buttonname", 63 'desc' => "Button name for this classification. Defaults to metadata name.", 64 'type' => "string", 65 'reqd' => "no" } , 66 { 'name' => "hfile", 67 'desc' => "The classification structure file.", 68 'type' => "string", 69 'reqd' => "yes" } , 70 { 'name' => "sort", 71 'desc' => "Metadata field to sort by (defaults to none).", 72 'type' => "string", 73 'reqd' => "no" } , 74 { 'name' => "hlist_at_top", 75 'desc' => "Display the first level of the classification horizontally.", 76 'type' => "flag", 77 'reqd' => "no" } 78 ]; 79 80 my $options = 81 { 'name' => "Hierarchy", 82 'desc' => "Classifier plugin for generating hierarchical classifications", 83 'inherits' => "Yes" , 84 'args' => $arguments }; 85 54 86 sub print_usage { 55 87 print STDERR " … … 76 108 my $class = shift (@_); 77 109 my $self = new BasClas($class, @_); 78 110 111 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 112 my $option_list = $self->{'option_list'}; 113 push( @{$option_list}, $options ); 114 79 115 my $sortname = "Title"; 80 116 my ($hfile, $metadata, $title, $hlist_at_top); … … 88 124 "allow_extra_options")) { 89 125 90 print STDERR "\nIncorrect options passed to $class, check your collect.cfg file\n"; 91 &print_usage(); 92 die "\n"; 126 $self->{'construction_error'} = "Incorrect options passed to $class, check your collect.cfg file."; 93 127 } 94 128 95 129 if (!$metadata) { 96 &print_usage; 97 print STDERR "\nHierarchy error: no metadata supplied\n"; 98 die "\n"; 130 $self->{'construction_error'} = 
"Hierarchy error: no metadata supplied."; 99 131 } 100 132 … … 103 135 $sortname = undef if $sortname =~ /^nosort$/; 104 136 137 my $subjectfile; 138 105 139 if (!$hfile) { 106 &print_usage;107 print STDERR "\nHierarchy error: No -hfile supplied\n"; 108 die "\n"; 109 } 110 111 my $subjectfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $hfile); 112 if (!-e $subjectfile) { 113 my $collfile = $subjectfile;114 $subjectfile = &util::filename_cat($ENV{'GSDLHOME'},"etc", $hfile);115 if (!-e $subjectfile) {116 my $outhandle = $self->{'outhandle'};117 &print_usage;118 print STDERR "\nHierarchy Error: Can't locate subject file $hfile\n";119 print STDERR "This file should be in $collfile or $subjectfile\n";120 die "\n";121 }122 140 $self->{'construction_error'} = "Hierarchy error: No -hfile supplied."; 141 } 142 else 143 { 144 $subjectfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $hfile); 145 if (!-e $subjectfile) { 146 my $collfile = $subjectfile; 147 $subjectfile = &util::filename_cat($ENV{'GSDLHOME'},"etc", $hfile); 148 if (!-e $subjectfile) { 149 my $outhandle = $self->{'outhandle'}; 150 &print_usage; 151 print STDERR "\nHierarchy Error: Can't locate subject file $hfile\n"; 152 print STDERR "This file should be in $collfile or $subjectfile\n"; 153 die "\n"; 154 } 155 } 156 } 123 157 124 158 $self->{'descriptorlist'} = {}; # first field in subject file … … 135 169 sub init { 136 170 my $self = shift (@_); 171 172 if(defined $self->{'construction_error'} || !defined $self->{'metaname'} || !defined $self->{'subjectfile'}) { 173 print STDERR "Error: " , $self->{'construction_error'} , "\n"; 174 &print_usage; 175 die "\n"; 176 } 137 177 138 178 # read in the subject file … … 230 270 $classifyinfo->{'Title'} = $title; 231 271 $classifyinfo->{'classifytype'} = $classifytype; 232 233 272 return $classifyinfo; 234 273 } 235 274 236 275 $classifyinfo->{'contains'} = [] unless defined $classifyinfo->{'contains'}; 237 238 276 my $offset = 0; 239 277 foreach $thing 
(@{$classifyinfo->{'contains'}}) { 240 278 $offset ++ if defined $thing->{'OID'}; 241 279 } 242 243 while (scalar(@{$classifyinfo->{'contains'}}) < ($headOID+$offset)) { 280 281 while (scalar(@{$classifyinfo->{'contains'}}) < ($headOID+$offset)) { 244 282 push (@{$classifyinfo->{'contains'}}, $self->get_entry("", $classifytype)); 245 283 } -
trunk/gsdl/perllib/classify/List.pm
r2022 r3540 38 38 # if metadata is also not included title will be 'List' 39 39 40 # 12/05/02 Added usage datastructure - John Thompson 41 40 42 use BasClas; 41 43 package List; … … 46 48 @ISA = ('BasClas'); 47 49 } 50 51 my $arguments = 52 [ { 'name' => "metadata", 53 'desc' => "Metadata field used for classification. List will be sorted by this element.", 54 'type' => "metadata", 55 'reqd' => "yes" } , 56 { 'name' => "buttonname", 57 'desc' => "Button name for this classification. Defaults to metadata name.", 58 'type' => "string", 59 'reqd' => "no" } , 60 { 'name' => "sort", 61 'desc' => "Sort documents in list by this metadata field. By default it will sort by Metaname, or (if this is not set) in build (random) order.", 62 'type' => "string", 63 'reqd' => "no" } 64 ]; 65 66 my $options = 67 { 'name' => "List", 68 'desc' => "Simple list classifier plugin.", 69 'inherits' => "Yes", 70 'args' => $arguments }; 71 48 72 49 73 sub print_usage { … … 67 91 my $class = shift (@_); 68 92 my $self = new BasClas($class, @_); 93 94 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 95 my $option_list = $self->{'option_list'}; 96 push( @{$option_list}, $options ); 69 97 70 98 my ($metaname, $title, $sortname, $list); -
trunk/gsdl/perllib/classify/Phind.pm
r3536 r3540 31 31 # Type "classinfo.pl Phind" at the command line for a summary. 32 32 33 # 12/05/02 Added usage datastructure - John Thompson 34 33 35 package Phind; 34 36 … … 82 84 } 83 85 86 my $arguments = 87 [ { 'name' => "text", 88 'desc' => "The text used to build the phrase hierarchy (default: 'section:Title,section:text').", 89 'type' => "string", 90 'reqd' => "no" } , 91 { 'name' => "title", 92 'desc' => "The metadata field used to describe each document (default: 'Title').", 93 'type' => "metadata", 94 'reqd' => "no" } , 95 { 'name' => "button", 96 'desc' => "The label for the classifier screen and button in navigation bar (default: 'Phrase').", 97 'type' => "string", 98 'reqd' => "no" } , 99 { 'name' => "language", 100 'desc' => "Language or languages to use building hierarchy. Languages are identified by two-letter country codes like en (English), es (Spanish), and fr (French). Language is a regular expression, so 'en|fr' (English or French) and '..' (match any language) are valid (default: 'en').", 101 'type' => "language", 102 'reqd' => "no" } , 103 { 'name' => "savephrases", 104 'desc' => "If set, the phrase infomation will be stored in the given file as text. It is probably a good idea to use an absolute path (default: not set).", 105 'type' => "string", 106 'reqd' => "no" } , 107 { 'name' => "suffixmode", 108 'desc' => "The smode parameter to the phrase extraction program. 
A value of 0 means that stopwords are ignored, and of 1 means that stopwords are used (default: 1).", 109 'type' => "int", 110 'reqd' => "no" } , 111 { 'name' => "thesaurus", 112 'desc' => "Name of a thesaurus stored in Phind format in the collection's etc directory (default: not set).", 113 'type' => "string", 114 'reqd' => "no" } , 115 { 'name' => "untidy", 116 'desc' => "Don't remove working files.", 117 'type' => "flag", 118 'reqd' => "no" } 119 ]; 120 121 my $options = 122 { 'name' => "Phind", 123 'desc' => "The Phind clasifier plugin.", 124 'inherits' => "Yes", 125 'args' => $arguments }; 126 84 127 sub print_usage { 85 128 print STDERR " … … 114 157 (default: 1) 115 158 116 -thesaurus Name Name of a thesaurus stored in phind format in the159 -thesaurus Name Name of a thesaurus stored in Phind format in the 117 160 collection's etc directory. 118 161 (default: not set) … … 136 179 my $class = shift (@_); 137 180 my $self = new BasClas($class, @_); 181 182 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 183 my $option_list = $self->{'option_list'}; 184 push( @{$option_list}, $options ); 138 185 139 186 my $out = $self->{'outhandle'}; … … 283 330 284 331 # Extract the text from every section 285 # (In Phind, document:text and section:text are equivalent)332 # (In phind, document:text and section:text are equivalent) 286 333 if ($field eq "text") { 287 334 $data = ""; … … 331 378 # 332 379 # When get_classify_info is called, the clauses and docs.txt files have 333 # already been constructed in the phind directory. This function will380 # already been constructed in the Phind directory. This function will 334 381 # translate them into compressed, indexed MGPP files that can be read by 335 382 # the phindcgi script. 
It will also register our classifier so that it … … 355 402 } 356 403 357 # Construct Phind indexes404 # Construct phind indexes 358 405 my $suffixmode = $self->{'suffixmode'}; 359 406 my ($command, $status); … … 363 410 print $out "\nExtracting vocabulary and statistics\n" if $verbosity; 364 411 &extract_vocabulary($self); 365 412 366 413 # Use the suffix program to generate the phind/phrases file 367 414 print $out "\nExtracting phrases from processed text (with suffix)\n" if $verbosity; 368 415 &execute("suffix \"$phinddir\" $suffixmode $verbosity", $verbosity, $out); 369 370 416 371 417 # check that we generated some files. It's not necessarily an error if … … 375 421 print $out "\nNo phrases found for Phind classifier!\n"; 376 422 return; 377 } 423 } 378 424 379 425 # Create the phrase file and put phrase numbers in phind/phrases … … 445 491 return &convert_gml_to_tokens_EN($text); 446 492 } 493 447 494 if ($language_exp =~ /zh/) { 448 495 return &convert_gml_to_tokens_ZH($text); 449 } 496 } 450 497 451 498 $_ = $text; … … 477 524 # 2. Split the remaining text into space-delimited tokens 478 525 479 # Convert entities to their UTF8 equivalents480 s/&([^;]+);/& ghtml::getcharequiv($1,1)/gse;526 # Convert any HTML special characters (like ") to their UTF8 equivalent 527 s/&([^;]+);/&unicode::ascii2utf8(\&ghtml::getcharequiv($1,1))/gse; 481 528 482 529 # Split text at word boundaries … … 541 588 return $_; 542 589 } 590 543 591 # A version of convert_gml_to_tokens that is fine-tuned to the English language. 544 592 … … 641 689 if ($status != 0) { 642 690 print STDERR "Phind - Error executing '$command': $!\n"; 643 exit($status); # this causes the build to fail...691 exit($status); # this causes the build to fail... 644 692 } 645 693 } -
trunk/gsdl/perllib/classify/SectionList.pm
r2022 r3540 28 28 # itself 29 29 30 # 12/05/02 Added usage datastructure - John Thompson 31 30 32 package SectionList; 31 33 … … 36 38 @ISA = ('List'); 37 39 } 40 41 my $arguments = 42 [ { 'name' => "metadata", 43 'desc' => "Metadata field used for classification. List will be sorted by this element.", 44 'type' => "metadata", 45 'reqd' => "yes" } , 46 { 'name' => "buttonname", 47 'desc' => "Button name for this classification. Defaults to metadata name.", 48 'type' => "string", 49 'reqd' => "no" } , 50 { 'name' => "sort", 51 'desc' => "Sort documents in list by this metadata field. By default it will sort by Metaname, or (if this is not set) in build (random) order.", 52 'type' => "string", 53 'reqd' => "no" } 54 ]; 55 56 my $options = 57 { 'name' => "SectionList", 58 'desc' => "Same as List classifier but includes all sections of document (excluding top level) rather than just top level document itself.", 59 'inherits' => "Yes", 60 'args' => $arguments }; 38 61 39 62 sub print_usage { … … 60 83 my $class = shift (@_); 61 84 my $self = new List($class, @_); 85 86 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 87 my $option_list = $self->{'option_list'}; 88 push( @{$option_list}, $options ); 62 89 63 90 return bless $self, $class; -
trunk/gsdl/perllib/plugins/ArcPlug.pm
r1424 r3540 28 28 # when an import is done), processing each file it finds 29 29 30 # 12-05-02 Added usage datastructure - John Thompson 31 30 32 package ArcPlug; 31 33 … … 39 41 } 40 42 43 my $options = 44 { 'name' => "ArcPlug", 45 'desc' => "Plugin which recurses through an archives.inf file (i.e. the file generated in the archives directory when an import is done), processing each file it finds.", 46 'inherits' => "Yes" }; 47 41 48 sub new { 42 49 my ($class) = @_; 43 50 my $self = new BasPlug ("ArcPlug", @_); 51 52 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 53 my $option_list = $self->{'option_list'}; 54 push( @{$option_list}, $options ); 44 55 45 56 return bless $self, $class; -
trunk/gsdl/perllib/plugins/BasPlug.pm
r3515 r3540 42 42 use ghtml; 43 43 44 my $unicode_list = 45 [ { 'name' => "auto", 46 'desc' => "Use text categorization algorithm to automatically identify the encoding of each source document. This will be slower than explicitly setting the encoding but will work where more than one encoding is used within the same collection." } , 47 { 'name' => "ascii", 48 'desc' => "Plain 7 bit ascii. This may be a bit faster than using iso_8859_1. Beware of using this on a collection of documents that may contain characters outside the plain 7 bit ascii set though (e.g. German or French documents containing accents), use iso_8859_1 instead." }, 49 { 'name' => "utf8", 50 'desc' => "either utf8 or unicode -- automatically detected." }, 51 { 'name' => "unicode", 52 'desc' => "just unicode" } ]; 53 54 my $arguments = 55 [ { 'name' => "process_exp", 56 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 57 'type' => "string", 58 'deft' => "", 59 'reqd' => "no" }, 60 { 'name' => "block_exp", 61 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 62 'type' => 'string', 63 'deft' => "", 64 'reqd' => "no" }, 65 { 'name' => "input_encoding", 66 'desc' => "The encoding of the source documents. Documents will be converted from these encodings and stored internally as utf8. 
The default input_encoding is 'auto'.", 67 'type' => "enum", 68 'list' => $unicode_list, 69 'reqd' => "no" , 70 'deft' => "auto" } , 71 { 'name' => "default_encoding", 72 'desc' => "Use this encoding if -input_encoding is set to 'auto' and the text categorization algorithm fails to extract the encoding or extracts an encoding unsupported by Greenstone. The default is iso_8859_1.", 73 'type' => "flag", 74 'reqd' => "no" }, 75 { 'name' => "extract_language", 76 'desc' => "Identify the language of each document and set 'Language' metadata. Note that this will be done automatically if -input_encoding is 'auto'.", 77 'type' => "flag", 78 'reqd' => "no" }, 79 { 'name' => "default_language", 80 'desc' => "If Greenstone fails to work out what language a document is the 'Language' metadata element will be set to this value. The default is 'en' (ISO 639 language symbols are used: en = English). Note that if -input_encoding is not set to 'auto' and -extract_language is not set, all documents will have their 'Language' metadata set to this value.", 81 'type' => "language", 82 'deft' => "en", 83 'reqd' => "no" }, 84 { 'name' => "extract_acronyms", 85 'desc' => "Extract acronyms from within text and set as metadata.", 86 'type' => "flag", 87 'reqd' => "no" }, 88 { 'name' => "markup_acronyms", 89 'desc' => "Add acronym metadata into document text.", 90 'type' => "flag", 91 'reqd' => "no" }, 92 { 'name' => "first", 93 'desc' => "Comma separated list of first sizes to extract from the text into a metadata field. The field is called 'FirstNNN'.", 94 'type' => "string", 95 'reqd' => "no" }, 96 { 'name' => "extract_email", 97 'desc' => "Extract email addresses as metadata.", 98 'type' => "flag", 99 'reqd' => "no" }, 100 { 'name' => "extract_historical_years", 101 'desc' => "Extract time-period information from historical documents. This is stored as metadata with the document. 
There is a search interface for this metadata, which you can include in your collection by adding the statement, \"format QueryInterface DateSearch\" to your collection configuration file.", 102 'type' => "flag", 103 'reqd' => "no" }, 104 { 'name' => "maximum_year", 105 'desc' => "The maximum historical date to be used as metadata (in a Common Era date, such as 1950).", 106 'type' => "int", 107 'reqd' => "no"}, 108 { 'name' => "maximum_century", 109 'desc' => "The maximum named century to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century).", 110 'type' => "int", 111 'reqd' => "no" }, 112 { 'name' => "no_bibliography", 113 'desc' => "Do not try and block bibliographic dates when extracting historical dates.", 114 'type' => "flag", 115 'reqd' => "no"}, 116 { 'name' => "cover_image", 117 'desc' => "Will look for a prefix.jpg file (where prefix is the same prefix as the file being processed) and associate it as a cover image.", 118 'type' => "flag", 119 'reqd' => "no" } ]; 120 121 my $options = { 'name' => "BasPlug", 122 'desc' => "Base class for all the import plugins.", 123 'inherits' => "No", 124 'args' => $arguments, 125 'process_exp' => "", 126 'block_exp' => "" }; 127 128 sub print_xml_usage { 129 my $self = shift (@_); 130 print STDERR "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n"; 131 $self->print_xml(); 132 } 133 134 sub print_xml { 135 my $self = shift (@_); 136 my $option_list = $self->{'option_list'}; 137 my $option = pop( @{$option_list} ); 138 if(defined $option) 139 { 140 print STDERR "<PlugInfo>\n"; 141 print STDERR " <Name>$option->{'name'}</Name>\n"; 142 print STDERR " <Desc>$option->{'desc'}</Desc>\n"; 143 print STDERR " <Inherits>$option->{'inherits'}</Inherits>\n"; 144 print STDERR " <Arguments>\n"; 145 if(defined $option->{'args'}) 146 { 147 my $args = $option->{'args'}; 148 my $x; 149 foreach $x ( @{$args} ) 150 { 151 print STDERR " <Option>\n"; 152 print STDERR " <Name>$x->{'name'}</Name>\n"; 153 
print STDERR " <Desc>$x->{'desc'}</Desc>\n"; 154 print STDERR " <Type>$x->{'type'}</Type>\n"; 155 print STDERR " <Required>$x->{'reqd'}</Required>\n"; 156 if(defined $x->{'list'}) 157 { 158 print STDERR " <List>\n"; 159 my $list = $x->{'list'}; 160 my $y; 161 foreach $y ( @{$list} ) 162 { 163 print STDERR " <Value>\n"; 164 print STDERR " <Name>$y->{'name'}</Name>\n"; 165 print STDERR " <Desc>$y->{'desc'}</Desc>\n"; 166 print STDERR " </Value>\n"; 167 } 168 # Special case of 'input_encoding' 169 if( $x->{'name'} =~ m/^input_encoding$/i ) { 170 my $e = $encodings::encodings; 171 foreach my $enc (sort {$e->{$a}->{'name'} cmp $e->{$b}->{'name'}} keys (%$e)) { 172 print STDERR " <Value>\n"; 173 print STDERR " <Name>$enc</Name>\n"; 174 print STDERR " <Desc>$e->{$enc}->{'name'}</Desc>\n"; 175 print STDERR " </Value>\n"; 176 } 177 } 178 print STDERR " </List>\n"; 179 } 180 if(defined $x->{'deft'}) 181 { 182 print STDERR " <Default>$x->{'deft'}</Default>\n"; 183 } 184 print STDERR " </Option>\n"; 185 } 186 } 187 if(defined $option_list) { 188 $self->print_xml(); 189 } 190 191 print STDERR " </Arguments>\n"; 192 print STDERR "</PlugInfo>\n"; 193 } 194 } 195 44 196 sub print_general_usage { 45 197 my ($plugin_name) = @_; … … 154 306 $self->{'num_blocked'} = 0; 155 307 $self->{'num_archives'} = 0; 308 309 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 310 $self->{'option_list'} = [ $options ]; 156 311 157 312 # general options available to all plugins -
trunk/gsdl/perllib/plugins/BibTexPlug.pm
r3426 r3540 51 51 } 52 52 53 my $arguments = 54 [ { 'name' => "process_exp", 55 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 56 'type' => "string", 57 'reqd' => "no" , 58 'deft' => q^(?i)\.bib$^ } 59 ]; 60 61 my $options = 62 { 'name' => "BibTexPlug", 63 'desc' => "BibTexPlug reads bibliography files in BibTex format. BibTexPlug creates a document object for every reference a the file. It is a subclass of SplitPlug, so if there are multiple records, all are read.", 64 'inherits' => "Yes", 65 'args' => $arguments }; 66 53 67 # This plugin processes files with the suffix ".bib" 54 68 sub get_default_process_exp { … … 59 73 sub get_default_split_exp { 60 74 return q^\n+(?=@)^; 75 } 76 sub new { 77 my $class = shift (@_); 78 my $self = new SplitPlug ($class, @_); 79 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 80 my $option_list = $self->{'option_list'}; 81 push( @{$option_list}, $options ); 82 return bless $self, $class; 61 83 } 62 84 … … 368 390 369 391 $text =~ s/([\w\d\.\-]+@[\w\d\.\-]+)/<a href=\"mailto:$1\">$1<\/a>/g; 370 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1 ">$1<\/a>/g;392 $text =~ s/(http:\/\/[\w\d\.\-]+[\/\w\d\.\-]*)/<a href=\"$1\">$1<\/a>/g; 371 393 372 394 return $text; -
trunk/gsdl/perllib/plugins/BookPlug.pm
r2356 r3540 49 49 # use this plugin instead of HBPlug. 50 50 51 # 12/05/02 Added usage datastructure - John Thompson 52 51 53 package BookPlug; 52 54 … … 57 59 @ISA = ('BasPlug'); 58 60 } 61 62 my $arguments = 63 [ { 'name' => "process_exp", 64 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 65 'type' => "string", 66 'reqd' => "no", 67 'deft' => q^(?i)\.hb$^} , 68 { 'name' => "block_exp", 69 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 70 'type' => "string", 71 'reqd' => "no", 72 'deft' => q^\.jpg$^} 73 ]; 74 75 my $options = 76 { 'name' => "BookPlug", 77 'desc' => "Creates multi-level document from document containing <<TOC>> level tags. Metadata for each section is taken from any other tags on the same line as the <<TOC>>. e.g. <<Title>>xxxx<</Title>> sets Title metadata. Everything else between TOC tags is treated as simple html (i.e. no processing of html links or any other HTMLPlug type stuff is done). Expects input files to have a .hb file extension by default (this can be changed by adding a -process_exp option a file with the same name as the hb file but a .jpg extension is taken as the cover image (jpg files are blocked by this plugin). BookPlug is a simplification (and extension) of the HBPlug used by the Humanity Library collections. 
BookPlug is faster as it expects the input files to be cleaner (The input to the HDL collections contains lots of excess html tags around <<TOC>> tags, uses <<I>> tags to specify images, and simply takes all text between <<TOC>> tags and start of text to be Title metadata). If you're marking up documents to be displayed in the same way as the HDL collections, use this plugin instead of HBPlug.", 78 'inherits' => "Yes", 79 'args' => $arguments }; 59 80 60 81 sub new { 61 82 my ($class) = @_; 62 83 my $self = new BasPlug ("BookPlug", @_); 63 84 85 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 86 my $option_list = $self->{'option_list'}; 87 push( @{$option_list}, $options ); 88 64 89 return bless $self, $class; 65 90 } -
trunk/gsdl/perllib/plugins/ConvertToPlug.pm
r3350 r3540 48 48 # @ISA = ('BasPlug'); #, 'HTMLPlug', 'TEXTPlug'); 49 49 } 50 51 my $convert_to_list = 52 [ { 'name' => "html", 53 'desc' => "" }, 54 { 'name' => "text", 55 'desc' => "" } 56 ]; 57 58 my $arguments = 59 [ { 'name' => "convert_to", 60 'desc' => "Plugin converts to TEXT or HTML (default html).", 61 'type' => "enum", 62 'reqd' => "no", 63 'list' => $convert_to_list, 64 'deft' => "html"} 65 ]; 66 67 my $options = 68 { 'name' => "ConvertToPlug", 69 'desc' => "The plugin is inherited by such plugins as WordPlug and PDFPlug. It facilitates the conversion of these document types to either HTML or TEXT by setting up variable that instruct ConvertToBasPlug how to work. It works by dynamically inheriting HTMLPlug or TEXTPlug based on the plugin argument 'convert_to'. If the argument is not present, the default is to inherit HTMLPlug.", 70 'inherits' => "Yes", 71 'args' => $arguments }; 72 50 73 51 74 sub print_usage { … … 122 145 $self->{'metadata_fields'} .= ",GENERATOR"; 123 146 } 147 148 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 149 my $option_list = $self->{'option_list'}; 150 push( @{$option_list}, $options ); 124 151 125 152 foreach my $key (keys %$args) { -
trunk/gsdl/perllib/plugins/EMAILPlug.pm
r3524 r3540 61 61 # * RFC 2047 - MIME (part 3) Message Header Extensions 62 62 # * RFC 1806 - Content Dispositions (ie inline/attachment) 63 64 # 12/05/02 Added usage datastructure - John Thompson 65 63 66 package EMAILPlug; 64 67 … … 85 88 } 86 89 90 my $arguments = 91 [ { 'name' => "process_exp", 92 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 93 'type' => "string", 94 'reqd' => "no", 95 'deft' => q^(?i)\.hb$^} , 96 { 'name' => "block_exp", 97 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 98 'type' => "string", 99 'reqd' => "no", 100 'deft' => q^\.jpg$^} 101 ]; 102 103 my $options = 104 { 'name' => "EMAILPlug", 105 'desc' => "Email plug reads email files. These are named with a simple number (i.e. as they appear in maildir folders) or with the extension .mbx (for mbox mail file format).\nDocument text: The document text consists of all the text after the first blank line in the document.\nMetadata (not Dublin Core!):\n\t\$Headers All the header content\n\t\$Subject Subject: header\n\t\$To To: header\n\t\$From From: header\n\t\$FromName Name of sender (where available)\n\t\$FromAddr E-mail address of sender\n\t\$DateText Date: header\n\t\$Date Date: header in GSDL format (eg: 19990924)", 106 'inherits' => "Yes", 107 'args' => $arguments }; 108 87 109 # Create a new EMAILPlug object with which to parse a file. 
88 110 # Accomplished by creating a new BasPlug and using bless to … … 92 114 my ($class) = @_; 93 115 my $self = new BasPlug ($class, @_); 116 117 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 118 my $option_list = $self->{'option_list'}; 119 push( @{$option_list}, $options ); 94 120 95 121 if (!parsargv::parse(\@_, -
trunk/gsdl/perllib/plugins/FOXPlug.pm
r2327 r3540 28 28 # This general plugin should be overridden for a particular database to process 29 29 # the appropriate fields in the file. 30 31 # 12/05/02 Added usage datastructure - John Thompson 30 32 31 33 package FOXPlug; … … 43 45 } 44 46 47 my $options = { 'name' => "FOXPlug", 48 'desc' => "Plugin to process a Foxbase dbt file. This plugin provides the basic functionality to read in the dbt and dbf files and process each record. This general plugin should be overridden for a particular database to process the appropriate fields in the file.", 49 'inherits' => "yes" }; 50 45 51 sub new { 46 52 my ($class) = @_; 47 53 $self = new BasPlug (); 54 55 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 56 my $option_list = $self->{'option_list'}; 57 push( @{$option_list}, $options ); 48 58 49 59 return bless $self, $class; -
trunk/gsdl/perllib/plugins/GAPlug.pm
r2925 r3540 24 24 ########################################################################### 25 25 26 # Processes Greenstone Archive XML documents. Note that this plugin does no 27 # syntax checking (though the XML::Parser module tests for well-formedness). 28 # It's assumed that the Archive files conform to their DTD. 26 # Processes GreenstoneArchive XML documents. Note that this plugin does no 27 # syntax checking (though the XML::Parser module tests for 28 # well-formedness). It's assumed that the GreenstoneArchive files conform 29 # to their DTD. 30 31 # 12/05/02 Added usage datastructure - John Thompson 29 32 30 33 package GAPlug; … … 36 39 } 37 40 41 my $options = { 'name' => "GAPlug", 42 'desc' => "Processes GreenstoneArchive XML documents. Note that this plugin does no syntax checking (though the XML::Parser module tests for well-formedness). It's assumed that the GreenstoneArchive files conform to their DTD.", 43 'inherits' => "yes" }; 44 38 45 sub new { 39 46 my $class = shift (@_); 40 47 my $self = new XMLPlug ($class, @_); 48 49 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 50 my $option_list = $self->{'option_list'}; 51 push( @{$option_list}, $options ); 41 52 42 53 $self->{'section'} = ""; … … 156 167 157 168 1; 169 170 -
trunk/gsdl/perllib/plugins/GMLPlug.pm
r2795 r3540 27 27 # assumes that gml tags are all in lower-case. 28 28 29 # 12/05/02 Added usage datastructure - John Thompson 30 29 31 package GMLPlug; 30 32 … … 37 39 } 38 40 41 my $options = { 'name' => "GMLPlug", 42 'desc' => "Plugin which processes a GML format document assumes that gml tags are all in lower-case.", 43 'inherits' => "yes" }; 44 39 45 sub new { 40 46 my ($class) = @_; 41 47 my $self = new BasPlug ("GMLPlug", @_); 48 49 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 50 my $option_list = $self->{'option_list'}; 51 push( @{$option_list}, $options ); 42 52 43 53 return bless $self, $class;} -
trunk/gsdl/perllib/plugins/HBPlug.pm
r2327 r3540 38 38 # Humanity Library collections 39 39 40 # 12/05/02 Added usage datastructure - John Thompson 41 40 42 package HBPlug; 41 43 … … 50 52 } 51 53 54 my $options = { 'name' => "HBPlug", 55 'desc' => "Plugin which processes an HTML book directory. This plugin is used by the Humanity Library collections and does not handle input encodings other than ascii or extended ascii. This code is kind of ugly and could no doubt be made to run faster, by leaving it in this state I hope to encourage people to make their collections use HBSPlug instead ;-)\n\nUse HBSPlug if creating a new collection and marking up files like the Humanity Library collections. HBSPlug accepts all input encodings but expects the marked up files to be cleaner than those used by the Humanity Library collections", 56 'inherits' => "yes" }; 57 52 58 sub new { 53 59 my ($class) = @_; 54 60 my $self = new BasPlug ("HBPlug", @_); 61 62 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 63 my $option_list = $self->{'option_list'}; 64 push( @{$option_list}, $options ); 55 65 56 66 return bless $self, $class; … … 67 77 die "ERROR: HBPlug can handle only iso_8859_1 or ascii encodings.\n" . 68 78 $self->{'input_encoding'} . " is not an acceptable input_encoding value\n"; 69 } 79 } 70 80 } 71 81 -
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r3539 r3540 47 47 @ISA = ('BasPlug'); 48 48 } 49 50 my $arguments = [ { 'name' => "process_exp", 51 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 52 'type' => "string", 53 'deft' => q^(?i)(\.html?|\.shtml|\.shm|\.asp|\.php|\.cgi|.+\?.+=.*)$^ }, 54 { 'name' => "block_exp", 55 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 56 'type' => 'string', 57 'deft' => q^(?i)\.(gif|jpe?g|png|css)$^ }, 58 { 'name' => "nolinks", 59 'desc' => "Don't make any attempt to trap links (setting this flag may improve speed of building/importing but any relative links within documents will be broken).", 60 'type' => "flag" }, 61 { 'name' => "keep_head", 62 'desc' => "Don't remove headers from html files.", 63 'type' => "flag" }, 64 { 'name' => "no_metadata", 65 'desc' => "Don't attempt to extract any metadata from files.", 66 'type' => "flag" }, 67 { 'name' => "metadata_fields", 68 'desc' => "Comma separated list of metadata fields to attempt to extract. Defaults to 'Title'. Use 'tag<tagname>' to have the contents of the first <tagname > pair put in a metadata element called 'tagname'. Capitalise this as you want the metadata capitalised in Greenstone, since the tag extraction is case insensitive.", 69 'type' => "metadatum", 70 'deft' => "" }, 71 { 'name' => "hunt_creator_metadata", 72 'desc' => "Find as much metadata as possible on authorship and place it in the 'Creator' field. 
Requires the -metadata_fields flag.", 73 'type' => "flag" }, 74 { 'name' => "file_is_url", 75 'desc' => "Set if input filenames make up url of original source documents e.g. if a web mirroring tool was used to create the import directory structure.", 76 'type' => "flag" }, 77 { 'name' => "assoc_files", 78 'desc' => "Perl regular expression of file extensions to associate with html documents. Defaults to '(?i)\.(jpe?g|gif|png|css)\$'", 79 'type' => "string", 80 'deft' => q^(?i)\.(jpe?g|gif|png|css)\$^ }, 81 { 'name' => "rename_assoc_files", 82 'desc' => "Renames files associated with documents (e.g. images). Also creates much shallower directory structure (useful when creating collections to go on cd-rom).", 83 'type' => "flag" } , 84 { 'name' => "title_sub", 85 'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PDFPlug to remove \"Page 1\", etc from text used as the title.", 86 'type' => "string" } , 87 { 'name' => "description_tags", 88 'desc' => "Split document into sub-sections where <Section> tags occur. Note that by setting this option you implicitly set -no_metadata, as all metadata should be included within the <Section> tags. Also, '-keep_head' will have no effect when this option is set.", 89 'type' => "flag" } ]; 90 91 my $options = { 'name' => "HTMLPlug", 92 'desc' => "This plugin processes HTML files", 93 'inherits' => "yes", 94 'args' => $arguments }; 49 95 50 96 sub print_usage { … … 92 138 my $self = new BasPlug ($class, @_); 93 139 140 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 141 my $option_list = $self->{'option_list'}; 142 push( @{$option_list}, $options ); 143 94 144 if (!parsargv::parse(\@_, 95 145 q^nolinks^, \$self->{'nolinks'}, -
trunk/gsdl/perllib/plugins/ImagePlug.pm
r3517 r3540 33 33 34 34 35 my $arguments = [ { 'name' => "process_exp", 36 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 37 'type' => "string", 38 'deft' => q^(?i)(\.jpe?g|\.gif|\.png|\.bmp|\.xbm|\.tif?f)$^, 39 'reqd' => "no" }, 40 { 'name' => "noscaleup", 41 'desc' => "Don't scale up small images when making thumbnails.", 42 'type' => "flag", 43 'reqd' => "no" }, 44 { 'name' => "thumbnailsize", 45 'desc' => "Make thumbnails of size nxn.", 46 'type' => "int", 47 'reqd' => "no" }, 48 { 'name' => "thumbnailtype", 49 'desc' => "Make thumbnails in format 's'.", 50 'type' => "string", 51 'reqd' => "no" }, 52 { 'name' => "screenviewsize", 53 'desc' => "If set, makes an image of size n for screen display and sets Screen, ScreenSize, ScreenWidth and ScreenHeight metadata. By default it is not set.", 54 'type' => "int", 55 'reqd' => "no" }, 56 { 'name' => "screenviewtype", 57 'desc' => "If -screenviewsize is set, this sets the screen display image type. Defaults to jpg.", 58 'type' => "string", 59 'deft' => "jpg", 60 'reqd' => "no" }, 61 { 'name' => "convertto", 62 'desc' => "Convert main image to.", 63 'type' => "string", 64 'reqd' => "no" }, 65 { 'name' => "minimumsize", 66 'desc' => "Ignore images smaller than n bytes.", 67 'type' => "int", 68 'reqd' => "no" } ]; 69 70 my $options = { 'name' => "ImagePlug", 71 'desc' => "", 72 'inherits' => "yes", 73 'args' => $arguments }; 74 75 35 76 sub print_usage { 36 77 my ($plugin_name) = @_; … … 65 106 my $self = new BasPlug ("ImagePlug", @_); 66 107 108 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 109 my $option_list = $self->{'option_list'}; 110 push( @{$option_list}, $options ); 111 67 112 if (!parsargv::parse(\@_, 68 113 q^noscaleup^, \$self->{'noscaleup'}, -
trunk/gsdl/perllib/plugins/IndexPlug.pm
r1482 r3540 50 50 # named 'Subject'. 51 51 52 # 12/05/02 Added usage datastructure - John Thompson 53 52 54 package IndexPlug; 53 55 … … 62 64 } 63 65 66 my $options = { 'name' => "IndexPlug", 67 'desc' => "This recursive plugin processes an index.txt file. The index.txt file should contain the list of files to be included in the collection followed by any extra metadata to be associated with each file.\n\nThe index.txt file should be formatted as follows: The first line may be a key (beginning with key:) to name the metadata fields (e.g. key: Subject Organization Date). The following lines will contain a filename followed by the value that metadata entry is to be set to. (e.g. 'irma/iw097e 3.2 unesco 1993' will associate the metadata Subject=3.2, Organization=unesco, and Date=1993 with the file irma/iw097e if the above key line was used)\n\nNote that if any of the metadata fields use the Hierarchy classifier plugin then the value they're set to should correspond to the first field (the descriptor) in the appropriate classification file.\n\nMetadata values may be named separately using a tag (e.g. >Subject<3.2) and this will override any name given to them by the key line. If there's no key line any unnamed metadata value will be named 'Subject'..", 68 'inherits' => "yes" }; 69 64 70 sub new { 65 71 my ($class) = @_; 66 72 my $self = new BasPlug ("IndexPlug", @_); 73 74 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 75 my $option_list = $self->{'option_list'}; 76 push( @{$option_list}, $options ); 67 77 68 78 return bless $self, $class; -
trunk/gsdl/perllib/plugins/PDFPlug.pm
r3411 r3540 32 32 } 33 33 34 my $arguments = [ { 'name' => "process_exp", 35 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 36 'type' => "string", 37 'deft' => q^(?i)\.pdf$^, 38 'reqd' => "no" }, 39 { 'name' => "block_exp", 40 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 41 'type' => 'string', 42 'deft' => q^^ } 43 ]; 44 45 my $options = { 'name' => "PDFPlug", 46 'desc' => "Reasonably with-it pdf plugin.", 47 'inherits' => "yes", 48 'args' => $arguments }; 49 34 50 sub new { 35 51 my $class = shift (@_); … … 62 78 $self->{'use_sections'}=1; 63 79 } 80 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 81 my $option_list = $self->{'option_list'}; 82 push( @{$option_list}, $options ); 64 83 65 84 return bless $self, $class; -
trunk/gsdl/perllib/plugins/PSPlug.pm
r2979 r3540 24 24 ########################################################################### 25 25 26 # 12/05/02 Added usage datastructure - John Thompson 27 26 28 package PSPlug; 27 29 … … 33 35 } 34 36 37 my $arguments = [ { 'name' => "process_exp", 38 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 39 'type' => "string", 40 'deft' => q^(?i)\.ps$^, 41 'reqd' => "no" }, 42 { 'name' => "block_exp", 43 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 44 'type' => 'string', 45 'deft' => q^(?i)\.(eps)$^ } 46 ]; 47 48 my $options = { 'name' => "PSPlug", 49 'desc' => "This might look VERY similar to the PDF plugin.", 50 'inherits' => "yes", 51 'args' => $arguments }; 52 35 53 sub new { 36 54 my $class = shift (@_); … … 39 57 40 58 my $self = new ConvertToPlug ($class, "-convert_to", "text", @_ , "-title_sub", '^(Page\s+\d+)?(\s*1\s+)?'); 59 60 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 61 my $option_list = $self->{'option_list'}; 62 push( @{$option_list}, $options ); 41 63 42 64 if (!parsargv::parse(\@_, -
trunk/gsdl/perllib/plugins/RTFPlug.pm
r2979 r3540 25 25 ########################################################################### 26 26 27 # 12/05/02 Added usage datastructure - John Thompson 28 27 29 package RTFPlug; 28 30 … … 31 33 sub BEGIN { 32 34 @ISA = ('ConvertToPlug'); 35 } 36 37 my $arguments = [ { 'name' => "process_exp", 38 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 39 'type' => "string", 40 'deft' => q^(?i)\.rtf$^, 41 'reqd' => "no" } 42 ]; 43 44 my $options = { 'name' => "RTFPlug", 45 'desc' => "Plugin for importing Rich Text Format files.", 46 'inherits' => "yes", 47 'args' => $arguments }; 48 49 sub new { 50 my $class = shift (@_); 51 my $self = new ConvertToPlug ($class, @_); 52 53 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 54 my $option_list = $self->{'option_list'}; 55 push( @{$option_list}, $options ); 56 57 return bless $self, $class; 33 58 } 34 59 … … 42 67 43 68 my $outhandle = $self->{'outhandle'}; 44 print $outhandle "RTFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n" 69 print $outhandle "RTFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n" 45 70 if $self->{'verbosity'} > 1; 46 71 -
trunk/gsdl/perllib/plugins/RecPlug.pm
r3116 r3540 106 106 use XML::Parser; 107 107 108 my $arguments = [ { 'name' => "block_exp", 109 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 110 'type' => "string", 111 'deft' => "CVS", 112 'reqd' => "no" }, 113 { 'name' => "use_metadata_files", 114 'desc' => "Read metadata from metadata XML files.", 115 'type' => "flag", 116 'reqd' => "no" } ]; 117 118 my $options = { 'name' => "RecPlug", 119 'desc' => "RecPlug is a plugin which recurses through directories processing 120 # each file it finds. For detailed comments edit <GSDLHOME>/perllib/plugins/RecPlug.pm .", 121 'inherits' => "yes", 122 'args' => $arguments }; 123 108 124 sub print_usage { 109 125 my ($plugin_name) = @_; … … 124 140 $self = new BasPlug ($class, @_); 125 141 142 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 143 my $option_list = $self->{'option_list'}; 144 push( @{$option_list}, $options ); 145 126 146 if (!parsargv::parse(\@_, 127 147 q^use_metadata_files^, \$self->{'use_metadata_files'}, -
trunk/gsdl/perllib/plugins/ReferPlug.pm
r1676 r3540 25 25 # 26 26 ########################################################################### 27 28 27 29 28 # ReferPlug reads bibliography files in Refer format. … … 62 61 # 63 62 63 # 12/05/02 Added usage datastructure - John Thompson 64 64 65 65 package ReferPlug; 66 66 67 67 use SplitPlug; 68 69 68 70 69 # ReferPlug is a sub-class of BasPlug. … … 73 72 } 74 73 74 my $arguments = [ { 'name' => "process_exp", 75 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 76 'type' => "string", 77 'deft' => q^(?i)\.bib$^, 78 'reqd' => "no" } ]; 79 80 my $options = { 'name' => "ReferPlug", 81 'desc' => "ReferPlug reads bibliography files in Refer format.\nBy Gordon W. Paynter (gwp\@cs.waikato.ac.nz), November 2000\n\nLoosely based on hcibib2Plug by Steve Jones (stevej\@cs.waikato.ac.nz). Which was based on EMAILPlug by Gordon Paynter (gwp\@cs.waikato.ac.nz). Which was based on old versions of HTMLplug and HCIBIBPlugby by Stefan Boddie and others -- it's hard to tell what came from where, now.\n\nReferPlug creates a document object for every reference in the file. 
It is a subclass of SplitPlug, so if there are multiple records, all are read.\n\nDocument text:\n\tThe document text consists of the reference in Refer format.\nMetadata:\n\t\$Creator \%A Author name\n\t\$Title \%T Title of article of book\n\t\$Journal \%J Title of Journal\n\t\$Booktitle \%B Title of book containing the publication\n\t\$Report \%R Type of Report, paper or thesis\n\t\$Volume \%V Volume Number of Journal\n\t\$Number \%N Number of Journal within Volume\n\t\$Editor \%E Editor name\n\t\$Pages \%P Page Number of article\n\t\$Publisher \%I Name of Publisher\n\t\$Publisheraddr \%C Publisher's address\n\t\$Date \%D Date of publication\n\t\$Keywords \%K Keywords associated with publication\n\t\$Abstract \%X Abstract of publication\n\t\$Copyright\t\%* Copyright information for the article", 82 'inherits' => "yes", 83 'args' => $arguments }; 84 75 85 # This plugin processes files with the suffix ".bib" 76 86 sub get_default_process_exp { … … 83 93 } 84 94 95 sub new { 96 my $class = shift (@_); 97 my $self = new SplitPlug ($class, @_); 98 99 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 100 my $option_list = $self->{'option_list'}; 101 push( @{$option_list}, $options ); 102 103 return bless $self, $class; 104 } 85 105 86 106 # The process function reads a single bibliogrphic record and stores -
trunk/gsdl/perllib/plugins/SRCPlug.pm
r2657 r3540 35 35 # Shell (currently only done as text) 36 36 37 # 12/05/02 Added usage datastructure - John Thompson 37 38 38 39 package SRCPlug; … … 45 46 } 46 47 48 my $arguments = [ { 'name' => "process_exp", 49 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 50 'type' => "string", 51 'deft' => q^(Makefile.*|README.*|(?i)\.(c|cc|cpp|C|h|hpp|pl|pm|sh))$^, 52 'reqd' => "no" } , 53 { 'name' => "block_exp", 54 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 55 'type' => 'string', 56 'deft' => q^(?i)\.(o|obj|a|so|dll)$^, 57 'reqd' => "no" } , 58 { 'name' => "remove_prefix", 59 'desc' => "Remove this leading pattern from the filename (eg -remove_prefix /tmp/XX/src/). The default is to remove the whole path from the filename.", 60 'type' => 'string', 61 'reqd' => "no" } ]; 62 63 my $options = { 'name' => "SRCPlug", 64 'desc' => "Filename is currently used for Title ( optionally minus some prefix ). Current languages:\ntext: READMEs/Makefiles\nC/C++ (currently extracts #include statements and C++ class decls)\nPerl (currently only done as text)\nShell (currently only done as text)", 65 'inherits' => "yes", 66 'args' => $arguments }; 47 67 48 68 sub print_usage { … … 60 80 my ($class) = @_; 61 81 my $self = new BasPlug ($class, @_); 82 83 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 84 my $option_list = $self->{'option_list'}; 85 push( @{$option_list}, $options ); 62 86 63 87 if (!parsargv::parse(\@_, -
trunk/gsdl/perllib/plugins/SplitPlug.pm
r3537 r3540 49 49 } 50 50 51 my $options = { 'name' => "SplitPlug", 52 'desc' => "SplitPlug is a plugin for splitting input files into segments that will then be individually processed. This plugin should not be called directly. Instead, if you need to process input files that contain several documents, you should write a plugin with a process function that will handle one of those documents and have it inherit from SplitPlug. See ReferPlug for an example.", 53 'inherits' => "yes" }; 54 55 51 56 sub new { 52 57 my ($class) = @_; 53 58 $self = new BasPlug($class, @_); 54 59 60 61 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 62 my $option_list = $self->{'option_list'}; 63 push( @{$option_list}, $options ); 64 55 65 if (!parsargv::parse(\@_, 56 66 q^split_exp/.*/^, \$self->{'split_exp'}, -
trunk/gsdl/perllib/plugins/TEXTPlug.pm
r3037 r3540 27 27 # of first line of text (up to 100 characters long). 28 28 29 # 12/05/02 Added usage datastructure - John Thompson 30 29 31 package TEXTPlug; 30 32 … … 36 38 @ISA = ('BasPlug'); 37 39 } 40 41 my $arguments = [ { 'name' => "process_exp", 42 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 43 'type' => "string", 44 'deft' => q^(?i)\.te?xt$^, 45 'reqd' => "no" } , 46 { 'name' => "title_sub", 47 'desc' => "Substitution expression to modify string stored as Title. Used by, for example, PSPlug to remove \"Page 1\" etc from text used as the title.", 48 'type' => "string", 49 'reqd' => "no" }]; 50 51 my $options = { 'name' => "TEXTPlug", 52 'desc' => "Creates simple single-level document. Adds Title metadata of first line of text (up to 100 characters long).", 53 'inherits' => "yes", 54 'args' => $arguments }; 38 55 39 56 sub print_usage { … … 50 67 my ($class) = @_; 51 68 my $self = new BasPlug ($class, @_); 69 70 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 71 my $option_list = $self->{'option_list'}; 72 push( @{$option_list}, $options ); 52 73 53 74 if (!parsargv::parse(\@_, … … 87 108 $title =~ /^\s+/s; 88 109 if (defined $self->{'title_sub'} && 89 110 $self->{'title_sub'}) {$title =~ s/$self->{'title_sub'}//;} 90 111 $title =~ /^\s*([^\n]*)/s; $title=$1; 91 112 if (length($title) > 100) { -
trunk/gsdl/perllib/plugins/WordPlug.pm
r3400 r3540 24 24 ########################################################################### 25 25 26 # 12/05/02 Added usage datastructure - John Thompson 27 26 28 package WordPlug; 27 29 … … 32 34 } 33 35 36 my $arguments = [ { 'name' => "process_exp", 37 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 38 'type' => "string", 39 'deft' => q^(?i)\.doc$^, 40 'reqd' => "no" } ]; 41 42 my $options = { 'name' => "WordPlug", 43 'desc' => "", 44 'inherits' => "yes", 45 'args' => $arguments }; 46 34 47 sub new { 35 48 my $class = shift (@_); 36 49 37 50 my $self = new ConvertToPlug ($class, @_); 51 52 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 53 my $option_list = $self->{'option_list'}; 54 push( @{$option_list}, $options ); 38 55 39 56 # wvWare will always produce html files encoded as utf-8 … … 57 74 58 75 my $outhandle = $self->{'outhandle'}; 59 print $outhandle "WordPlug: passing $_[3] on to $self->{'converted_to'}Plug\n" 76 print $outhandle "WordPlug: passing $_[3] on to $self->{'converted_to'}Plug\n" 60 77 if $self->{'verbosity'} > 1; 61 78 -
trunk/gsdl/perllib/plugins/XMLPlug.pm
r3107 r3540 35 35 36 36 use XML::Parser; 37 my $arguments = [ { 'name' => "process_exp", 38 'desc' => "A perl regular expression to match against filenames. Matching filenames will be processed by this plugin. Each plugin has its own default process_exp. e.g HTMLPlug defaults to '(?i)\.html?\$' i.e. all documents ending in .htm or .html (case-insensitive).", 39 'type' => "string", 40 'deft' => q^(?i)\.xml$^, 41 'reqd' => "no" } ]; 42 43 my $options = { 'name' => "XMLPlug", 44 'desc' => "", 45 'inherits' => "yes", 46 'args' => $arguments }; 47 37 48 38 49 my ($self); … … 43 54 $self = new BasPlug ($class, @_); 44 55 56 57 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 58 my $option_list = $self->{'option_list'}; 59 push( @{$option_list}, $options ); 60 45 61 my $parser = new XML::Parser('Style' => 'Stream', 46 62 'Handlers' => {'Char' => \&Char, -
trunk/gsdl/perllib/plugins/ZIPPlug.pm
r2795 r3540 43 43 # tar (for tar) 44 44 45 # 12/05/02 Added usage datastructure - John Thompson 46 45 47 package ZIPPlug; 46 48 … … 55 57 } 56 58 59 my $options = { 'name' => "ZIPPlug", 60 'desc' => "Plugin which handles compressed and/or archived input formats currently handled formats and file extensions are:\ngzip (.gz, .z, .tgz, .taz)\nbzip (.bz)\nbzip2 (.bz2)\nzip (.zip .jar)\ntar (.tar)\n\nThis plugin relies on the following utilities being present (if trying to process the corresponding formats):\ngunzip (for gzip)\nbunzip (for bzip)\nbunzip2 \nunzip (for zip)\ntar (for tar)", 61 'inherits' => "yes" }; 62 57 63 sub new { 58 64 my ($class) = @_; 59 65 my $self = new BasPlug ("ZIPPlug", @_); 66 67 # 14-05-02 To allow for proper inheritance of arguments - John Thompson 68 my $option_list = $self->{'option_list'}; 69 push( @{$option_list}, $options ); 60 70 61 71 return bless $self, $class;
Note: See TracChangeset for help on using the changeset viewer.