Changeset 1242
- Timestamp: 2000-06-27T09:36:01+12:00 (24 years ago)
- Location: trunk/gsdl/perllib
- Files: 1 added, 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/perllib/plugins/BasPlug.pm
r1229 r1242 29 29 use multiread; 30 30 use cnseg; 31 use acronym; 31 32 use strict; 33 use doc; 32 34 33 35 sub print_usage { 34 print STDERR "\nOne of your plugins uses an incorrect general option (general options are those\n"; 36 my ($plugin_name) = @_; 37 38 print STDERR "\nThe $plugin_name plugin uses an incorrect general option (general options are those\n"; 35 39 print STDERR "available to all plugins). Check your collect.cfg configuration file.\n"; 36 40 37 print STDERR "\n usage: plugin plugin-name [options]\n\n";41 print STDERR "\n usage: plugin $plugin_name [options]\n\n"; 38 42 print STDERR " currently supported general options are:\n"; 39 print STDERR " -input_encoding The encoding of the source documents. Documents will be\n"; 40 print STDERR " converted from these encodings and stored internally as\n"; 41 print STDERR " utf8. The default input_encoding is Latin1. Accepted values\n"; 42 print STDERR " are:\n"; 43 print STDERR " iso_8859_1 (extended ascii)\n"; 44 print STDERR " Latin1 (the same as iso-8859-1)\n"; 45 print STDERR " ascii (7 bit ascii -- may be faster than Latin1 as no\n"; 46 print STDERR " conversion is neccessary)\n"; 47 print STDERR " gb (GB or GBK simplified Chinese)\n"; 48 print STDERR " iso_8859_6 (8 bit Arabic)\n"; 49 print STDERR " windows_1256 (Windows codepage 1256 (Arabic))\n"; 50 print STDERR " Arabic (the same as windows_1256)\n"; 51 print STDERR " utf8 (either utf8 or unicode -- automatically detected)\n"; 52 print STDERR " unicode (just unicode -- doesn't currently do endian\n"; 53 print STDERR " detection)\n\n"; 43 print STDERR " -input_encoding The encoding of the source documents. Documents will be\n"; 44 print STDERR " converted from these encodings and stored internally as\n"; 45 print STDERR " utf8. The default input_encoding is Latin1. 
Accepted values\n"; 46 print STDERR " are:\n"; 47 print STDERR " iso_8859_1 (extended ascii)\n"; 48 print STDERR " Latin1 (the same as iso-8859-1)\n"; 49 print STDERR " ascii (7 bit ascii -- may be faster than Latin1 as no\n"; 50 print STDERR " conversion is neccessary)\n"; 51 print STDERR " gb (GB or GBK simplified Chinese)\n"; 52 print STDERR " iso_8859_6 (8 bit Arabic)\n"; 53 print STDERR " windows_1256 (Windows codepage 1256 (Arabic))\n"; 54 print STDERR " Arabic (the same as windows_1256)\n"; 55 print STDERR " utf8 (either utf8 or unicode -- automatically detected)\n"; 56 print STDERR " unicode (just unicode -- doesn't currently do endian\n"; 57 print STDERR " detection)\n"; 58 print STDERR " -process_exp A perl regular expression to match against filenames.\n"; 59 print STDERR " Matching filenames will be processed by this plugin.\n"; 60 print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n"; 61 print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n"; 62 print STDERR " .htm or .html (case-insensitive).\n"; 63 print STDERR " -block_exp Files matching this regular expression will be blocked from\n"; 64 print STDERR " being passed to any further plugins in the list. This has no\n"; 65 print STDERR " real effect other than to prevent lots of warning messages\n"; 66 print STDERR " about input files you don't care about. Each plugin may or may\n"; 67 print STDERR " not have a default block_exp. e.g. 
by default HTMLPlug blocks\n"; 68 print STDERR " any files with .gif, .jpg, .jpeg, .png, .pdf, .rtf or .css\n"; 69 print STDERR " file extensions.\n"; 70 print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n\n"; 54 71 } 55 72 56 73 sub new { 57 74 my $class = shift (@_); 75 my $plugin_name = shift (@_); 58 76 59 77 my $self = {}; … … 61 79 62 80 # general options available to all plugins 63 if (!parsargv::parse(\@_, "input_encoding/$encodings/Latin1", \$self->{'input_encoding'}, 81 if (!parsargv::parse(\@_, 82 qq^input_encoding/$encodings/Latin1^, \$self->{'input_encoding'}, 83 q^process_exp/.*/^, \$self->{'process_exp'}, 84 q^block_exp/.*/^, \$self->{'block_exp'}, 85 q^extract_acronyms^, \$self->{'extract_acronyms'}, 64 86 "allow_extra_options")) { 65 &print_usage( );87 &print_usage($plugin_name); 66 88 die "\n"; 67 89 } 68 90 69 91 return bless $self, $class; 92 } 93 94 # initialize BasPlug options 95 # if init() is overridden in a sub-class, remember to call BasPlug::init() 96 sub init { 97 my $self = shift (@_); 98 my ($verbosity) = @_; 99 100 # verbosity is passed through from the processor 101 $self->{'verbosity'} = $verbosity; 102 103 # set process_exp and block_exp to defaults unless they were 104 # explicitly set 105 if ((!$self->is_recursive()) && 106 (!defined $self->{'process_exp'}) || ($self->{'process_exp'} eq "")) { 107 108 $self->{'process_exp'} = $self->get_default_process_exp (); 109 if ($self->{'process_exp'} eq "") { 110 warn ref($self) . 
" Warning: Non-recursive plugin has no process_exp so will have no effect\n"; 111 } 112 } 113 114 if ((!defined $self->{'block_exp'}) || ($self->{'block_exp'} eq "")) { 115 $self->{'block_exp'} = $self->get_default_block_exp (); 116 } 70 117 } 71 118 … … 79 126 } 80 127 81 # return 1 if this class might recurse using $pluginfo 128 # this function should be overridden to return 1 129 # in recursive plugins 82 130 sub is_recursive { 83 131 my $self = shift (@_); 84 132 85 die "BasPlug::is_recursive function must be implemented in sub classes\n"; 86 } 87 88 # return number of files processed, undef if can't process 133 return 0; 134 } 135 136 sub get_default_block_exp { 137 my $self = shift (@_); 138 139 return ""; 140 } 141 142 sub get_default_process_exp { 143 my $self = shift (@_); 144 145 return ""; 146 } 147 148 # The BasPlug read() function. This function does all the right things 149 # to make general options work for a given plugin. It calls the process() 150 # function which does all the work specific to a plugin (like the old 151 # read functions used to do). Most plugins should define their own 152 # process() function and let this read() function keep control. 
153 # 154 # Return number of files processed, undef if can't process 89 155 # Note that $base_dir might be "" and that $file might 90 156 # include directories 157 91 158 sub read { 92 159 my $self = shift (@_); 93 160 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; 94 161 95 die "BasPlug::read function must be implemented in sub classes\n"; 96 97 return undef; # will never get here 162 if ($self->is_recursive()) { 163 die "BasPlug::read function must be implemented in sub-class for recursive plugins\n"; 164 } 165 166 my $filename = &util::filename_cat($base_dir, $file); 167 return 0 if $filename =~ /$self->{'block_exp'}/; 168 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) { 169 return undef; 170 } 171 my $plugin_name = ref ($self); 172 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 173 174 # create a new document 175 my $doc_obj = new doc ($file, "indexed_doc"); 176 my $cursection = 177 178 # read in file ($text will be in utf8) 179 my $text = ""; 180 $self->read_file ($filename, \$text); 181 182 if ($text !~ /\w/) { 183 print STDERR "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'}; 184 return 0; 185 } 186 187 # include any metadata passed in from previous plugins 188 # note that this metadata is associated with the top level section 189 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata); 190 191 # do plugin specific processing of doc_obj 192 $self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj); 193 194 # add text 195 $doc_obj->add_utf8_text ($cursection, $text); 196 197 # do any automatic metadata extraction 198 $self->auto_extract_metadata ($doc_obj); 199 200 # add an OID 201 $doc_obj->set_OID(); 202 203 # process the document 204 $processor->process($doc_obj); 205 206 return 1; # processed the file 207 } 208 209 sub process { 210 my $self = shift (@_); 211 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_; 212 213 die 
"Basplug::process function must be implemented in sub-class\n"; 98 214 } 99 215 … … 157 273 } 158 274 275 # extract acronyms (and hopefully other stuff soon too). 276 sub auto_extract_metadata { 277 my $self = shift (@_); 278 my ($doc_obj) = @_; 279 280 if ($self->{'extract_acronyms'}) { 281 my $thissection = $doc_obj->get_top_section(); 282 while (defined $thissection) { 283 my $text = $doc_obj->get_text($thissection); 284 $self->extract_acronyms (\$text, $doc_obj, $thissection) if $text =~ /./; 285 $thissection = $doc_obj->get_next_section ($thissection); 286 } 287 } 288 } 289 290 sub extract_acronyms { 291 my $self = shift (@_); 292 my ($textref, $doc_obj, $thissection) = @_; 293 294 my $acro_array = &acronym::acronyms($textref); 295 296 foreach my $acro (@$acro_array) { 297 298 #do the normal acronym 299 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string()); 300 print "found " . $acro->to_string() . "\n"; 301 302 # do the KWIC (Key Word In Context) acronym 303 my @kwic = $acro->to_string_kwic(); 304 foreach my $kwic (@kwic) { 305 $doc_obj->add_utf8_metadata($thissection, "AcronymKWIC", $kwic); 306 print "found (KWIC)" . $kwic . "\n"; 307 } 308 } 309 } 310 159 311 1;
Note: See TracChangeset for help on using the changeset viewer.