Changeset 1844 for trunk/gsdl/perllib/plugins/BasPlug.pm
- Timestamp:
- 2001-01-19T10:35:13+13:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r1838 r1844 34 34 use diagnostics; 35 35 use DateExtract; 36 use iso639; 37 38 # if textcat returns an encoding that isn't in this list 39 # we'll print a warning and use the default encoding instead 40 %supported_encodings = ( 41 "ascii" => "", 42 "iso_8859_1" => "", 43 "windows_1252" => "", 44 "iso_8859_2" => "", 45 "windows_1250" => "", 46 "iso_8859_3" => "", 47 "iso_8859_4" => "", 48 "iso_8859_5" => "", 49 "windows_1251" => "", 50 "koi8_r" => "", 51 "koi8_u" => "", 52 "iso_8859_6" => "", 53 "windows_1256" => "", 54 "iso_8859_7" => "", 55 "windows_1253" => "", 56 "iso_8859_8" => "", 57 "windows_1255" => "", 58 "iso_8859_9" => "", 59 "windows_1254" => "", 60 "gb" => "" 61 ); 36 62 37 63 sub print_general_usage { … … 39 65 40 66 print STDERR "\n usage: plugin $plugin_name [options]\n\n"; 41 print STDERR " -input_encoding The encoding of the source documents. Documents will be\n"; 42 print STDERR " converted from these encodings and stored internally as\n"; 43 print STDERR " utf8. The default input_encoding is ascii. Accepted values\n"; 44 print STDERR " are:\n"; 45 print STDERR " iso_8859_1 (extended ascii)\n"; 46 print STDERR " Latin1 (the same as iso-8859-1)\n"; 47 print STDERR " ascii (7 bit ascii -- may be faster than Latin1 as no\n"; 48 print STDERR " conversion is neccessary)\n"; 49 print STDERR " gb (GB or GBK simplified Chinese)\n"; 50 print STDERR " iso_8859_6 (8 bit Arabic)\n"; 51 print STDERR " windows_1256 (Windows codepage 1256 (Arabic))\n"; 52 print STDERR " Arabic (the same as windows_1256)\n"; 53 print STDERR " utf8 (either utf8 or unicode -- automatically detected)\n"; 54 print STDERR " unicode (just unicode -- doesn't currently do endian\n"; 55 print STDERR " detection)\n"; 56 print STDERR " windows_1251 (Windows codepage 1251 (Cyrillic))\n"; 67 57 68 print STDERR " -process_exp A perl regular expression to match against filenames.\n"; 58 69 print STDERR " Matching filenames will be processed by this plugin.\n"; 59 70 print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n"; 60 71 print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n"; 61 print STDERR " .htm or .html (case-insensitive).\n"; 72 print STDERR " .htm or .html (case-insensitive).\n\n"; 73 62 74 print STDERR " -block_exp Files matching this regular expression will be blocked from\n"; 63 75 print STDERR " being passed to any further plugins in the list. This has no\n"; … … 66 78 print STDERR " not have a default block_exp. e.g. by default HTMLPlug blocks\n"; 67 79 print STDERR " any files with .gif, .jpg, .jpeg, .png, .rtf or .css\n"; 68 print STDERR " file extensions.\n"; 80 print STDERR " file extensions.\n\n"; 81 82 83 print STDERR " -input_encoding The encoding of the source documents. Documents will be\n"; 84 print STDERR " converted from these encodings and stored internally as\n"; 85 print STDERR " utf8. The default input_encoding is 'auto'. Accepted values\n"; 86 print STDERR " are:\n"; 87 88 print STDERR " auto: Use text categorization algorithm to automatically\n"; 89 print STDERR " identify the encoding of each source document. This\n"; 90 print STDERR " will be slower than explicitly setting the encoding\n"; 91 print STDERR " but will work where more than one encoding is used\n"; 92 print STDERR " within the same collection.\n"; 93 94 print STDERR " ascii: Plain 7 bit ascii. This may be a little faster than\n"; 95 print STDERR " using iso_8859_1. Beware of using 'ascii' on a collection\n"; 96 print STDERR " of documents that may contain characters outside of plain\n"; 97 print STDERR " 7 bit ascii though (e.g. German or French documents\n"; 98 print STDERR " containing accents), use iso_8859_1 instead.\n"; 99 100 print STDERR " utf8: either utf8 or unicode -- automatically detected\n"; 101 print STDERR " unicode: just unicode\n"; 102 103 print STDERR " iso_8859_1: Latin1 (western european languages)\n"; 104 print STDERR " windows_1252: Windows codepage 1252 (WinLatin1)\n"; 105 106 print STDERR " iso_8859_2: Latin2 (central and eastern european languages)\n"; 107 print STDERR " windows_1250: Windows codepage 1250 (WinLatin2)\n"; 108 109 print STDERR " iso_8859_3: Latin3\n"; 110 111 print STDERR " iso_8859_4: Latin4\n"; 112 113 print STDERR " iso_8859_5: Cyrillic\n"; 114 print STDERR " windows_1251: Windows codepage 1251 (WinCyrillic)\n"; 115 print STDERR " koi8_r: Cyrillic - Russian\n"; 116 print STDERR " koi8_u: Cyrillic - Ukrainian\n"; 117 118 print STDERR " iso_8859_6: Arabic\n"; 119 print STDERR " windows_1256: Windows codepage 1256 (WinArabic)\n"; 120 121 print STDERR " iso_8859_7: Greek\n"; 122 print STDERR " windows_1253: Windows codepage 1253 (WinGreek)\n"; 123 124 print STDERR " iso_8859_8: Hebrew\n"; 125 print STDERR " windows_1255: Windows codepage 1255 (WinHebrew)\n"; 126 127 print STDERR " iso_8859_9: Latin5\n"; 128 print STDERR " windows_1254: Windows codepage 1254 (WinTurkish)\n"; 129 130 print STDERR " gb: GB or GBK simplified Chinese\n\n"; 131 132 print STDERR " -default_encoding If -input_encoding is set to 'auto' and the text categorization\n"; 133 print STDERR " algorithm fails to extract the encoding or extracts an encoding\n"; 134 print STDERR " that is not supported by Greenstone, this encoding will be used\n"; 135 print STDERR " instead. The default is iso_8859_1\n\n"; 136 137 print STDERR " -extract_language Identify the language of each document and set 'Language' metadata. Note\n"; 138 print STDERR " that this will be done automatically if -input_encoding is 'auto'.\n"; 139 print STDERR " -default_language If Greenstone fails to work out what language a document is the\n"; 140 print STDERR " 'Language' metadata element will be set to this value. The default\n"; 141 print STDERR " is 'en' (ISO 639 language symbols should be used - en = English).\n"; 142 print STDERR " Note that if -input_encoding is not set to 'auto' and -extract_language\n"; 143 print STDERR " is not set, all documents will have their 'Language' metadata set to\n"; 144 print STDERR " this value.\n\n"; 145 69 146 print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n\n"; 70 print STDERR " -markup_acronyms Added acronym metadata into document text\n\n"; 71 print STDERR " -extract_langauge Identify the language of the text and set as metadata\n\n"; 72 print STDERR " -first Comma seperated list of first sizes to extract from the text \n"; 73 print STDERR " into a metadata field. The fields are called 'FirstNNN'.\n"; 147 148 print STDERR " -markup_acronyms Add acronym metadata into document text\n\n"; 149 150 print STDERR " -first Comma seperated list of first sizes to extract from the text\n"; 151 print STDERR " into a metadata field. The fields are called 'FirstNNN'.\n\n"; 152 74 153 print STDERR " -extract_email Extract email addresses as metadata\n\n"; 154 75 155 print STDERR " -extract_date Extract dates pertaining to the content of documents about history\n\n"; 76 156 } … … 86 166 my $class = shift (@_); 87 167 my $plugin_name = shift (@_); 88 89 168 my $self = {}; 90 my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode|windows_1251)\$"; 169 170 my $enc = "^("; 171 map {$enc .= "|$_";} keys %supported_encodings; 172 my $denc = $enc . "|utf8|unicode)\$"; 173 $enc .= "|utf8|unicode|auto)\$"; 174 91 175 $self->{'outhandle'} = STDERR; 92 176 my $year = (localtime)[5]+1900; … … 94 178 # general options available to all plugins 95 179 if (!parsargv::parse(\@_, 96 qq^input_encoding/$encodings/ascii^, \$self->{'input_encoding'},97 180 q^process_exp/.*/^, \$self->{'process_exp'}, 98 181 q^block_exp/.*/^, \$self->{'block_exp'}, 182 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'}, 183 qq^default_encoding/$denc/iso_8859_1^, \$self->{'default_encoding'}, 99 184 q^extract_acronyms^, \$self->{'extract_acronyms'}, 100 185 q^extract_email^, \$self->{'extract_email'}, 101 186 q^markup_acronyms^, \$self->{'markup_acronyms'}, 102 187 q^extract_language^, \$self->{'extract_language'}, 188 q^default_language/.{2}/en^, \$self->{'default_language'}, 103 189 q^first/.*/^, \$self->{'first'}, 104 190 q^extract_date^, \$self->{'date_extract'}, 105 "maximum_date/\\d{4}/$year", \$self->{'max_year'},191 qq^maximum_date/\\d{4}/$year^, \$self->{'max_year'}, 106 192 q^no_bibliography^, \$self->{'no_biblio'}, 107 "maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1", 108 \$self->{'max_century'}, 193 qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'}, 109 194 "allow_extra_options")) { 110 195 … … 145 230 $self->{'block_exp'} = $self->get_default_block_exp (); 146 231 } 147 148 # handle input_encoding aliases149 $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";150 $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";151 232 } 152 233 … … 204 285 } 205 286 287 my $outhandle = $self->{'outhandle'}; 288 206 289 my $filename = &util::filename_cat($base_dir, $file); 207 290 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/; … … 211 294 my $plugin_name = ref ($self); 212 295 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 213 296 297 my ($language, $encoding); 298 if ($self->{'input_encoding'} eq "auto") { 299 # use textcat to automatically work out the input encoding and language 300 ($language, $encoding) = $self->get_language_encoding ($filename); 301 302 } elsif ($self->{'extract_language'}) { 303 # use textcat to get language metadata 304 ($language, $extracted_encoding) = $self->get_language_encoding ($filename); 305 $encoding = $self->{'input_encoding'}; 306 307 if ($extracted_encoding != $encoding && $self->{'verbosity'}) { 308 print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but "; 309 print $outhandle "appears to be encoded as $extracted_encoding."; 310 } 311 312 } else { 313 $language = $self->{'default_language'}; 314 $encoding = $self->{'input_encoding'}; 315 } 316 214 317 # create a new document 215 318 my $doc_obj = new doc ($filename, "indexed_doc"); 319 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); 320 $doc_obj->set_source_encoding ($encoding); 321 216 322 217 323 # read in file ($text will be in utf8) 218 324 my $text = ""; 219 $self->read_file ($filename, \$text); 220 221 if ($text !~ /\w/) { 222 my $outhandle = $self->{'outhandle'}; 325 $self->read_file ($filename, $encoding, \$text); 326 327 if (!length ($text)) { 223 328 print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'}; 224 329 return 0; … … 260 365 sub read_file { 261 366 my $self = shift (@_); 262 my ($filename, $ textref) = @_;367 my ($filename, $encoding, $textref) = @_; 263 368 264 369 if (!-r $filename) 265 370 { 266 print STDERR "Read permission denied for $filename\n"; 371 my $outhandle = $self->{'outhandle'}; 372 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'}; 267 373 return; 268 374 } … … 272 378 open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n"; 273 379 274 if ($ self->{'input_encoding'}eq "ascii") {380 if ($encoding eq "ascii") { 275 381 undef $/; 276 382 $$textref = <FILE>; … … 279 385 my $reader = new multiread(); 280 386 $reader->set_handle ('BasPlug::FILE'); 281 $reader->set_encoding ($ self->{'input_encoding'});387 $reader->set_encoding ($encoding); 282 388 $reader->read_file ($textref); 283 389 284 if ($ self->{'input_encoding'}eq "gb") {390 if ($encoding eq "gb") { 285 391 # segment the Chinese words 286 392 $$textref = &cnseg::segment($$textref); … … 289 395 290 396 close FILE; 397 } 398 399 # Uses textcat to work out the encoding and language of the text in 400 # $filename. All html tags are removed before processing. 401 # returns an array containing "language" and "encoding" 402 sub get_language_encoding { 403 my $self = shift (@_); 404 my ($filename) = @_; 405 my $outhandle = $self->{'outhandle'}; 406 407 # read in file 408 open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n"; 409 undef $/; 410 my $text = <FILE>; 411 $/ = "\n"; 412 close FILE; 413 414 # remove all HTML tags 415 $text =~ s/<[^>]*>//sg; 416 417 # get the language/encoding 418 my @results = textcat::classify($text); 419 420 # foreach $i (@results) { 421 # print STDERR "i: $i\n"; 422 # } 423 424 if (scalar @results != 1) { 425 if ($self->{'input_encoding'} ne 'auto') { 426 if ($self->{'extract_language'} && $self->{'verbosity'}) { 427 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - "; 428 print $outhandle "defaulting to $self->{'default_language'}\n"; 429 } 430 return ($self->{'default_language'}, $self->{'input_encoding'}); 431 432 } else { 433 if ($self->{'verbosity'}) { 434 print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - "; 435 print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n"; 436 } 437 return ($self->{'default_language'}, $self->{'default_encoding'}); 438 } 439 } 440 441 # format language/encoding 442 my ($language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/; 443 $language = $iso639::toiso639{lc($language)}; 444 die "Invalid language\n" if !defined $language; 445 446 if (!defined $encoding) { 447 # if textcat returned no encoding info it is assumed to be iso_8859_1 448 $encoding = "iso_8859_1"; 449 } else { 450 # convert to the format we expect 451 $encoding =~ s/windows/windows_/; 452 $encoding =~ s/iso8859/iso_8859/; 453 $encoding =~ s/^gb.*$/gb/; 454 } 455 456 if (!defined $supported_encodings{$encoding}) { 457 if ($self->{'verbosity'}) { 458 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - "; 459 print $outhandle "using $self->{'default_encoding'}\n"; 460 } 461 $encoding = $self->{'default_encoding'}; 462 } 463 464 return ($language, $encoding); 291 465 } 292 466 … … 351 525 352 526 print $outhandle " extracting email addresses ...\n" 353 if ($self->{'verbosity'} > =2);527 if ($self->{'verbosity'} > 2); 354 528 355 529 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|[a-z][a-z]))/g); … … 362 536 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address); 363 537 print $outhandle " extracting $address\n" 364 if ($self->{'verbosity'} > =3);538 if ($self->{'verbosity'} > 3); 365 539 } 366 540 } 367 541 print $outhandle " done extracting email addresses.\n" 368 if ($self->{'verbosity'} > =2);542 if ($self->{'verbosity'} > 2); 369 543 370 544 } … … 437 611 } 438 612 439 440 # Identify the language of a section and add it to the metadata441 sub extract_language {442 my $self = shift (@_);443 my ($textref, $doc_obj, $thissection) = @_;444 445 # remove all HTML tags446 my $text = $$textref;447 $text =~ s/<P[^>]*>/\n/sgi;448 $text =~ s/<H[^>]*>/\n/sgi;449 $text =~ s/<[^>]*>//sgi;450 $text =~ tr/\n/\n/s;451 452 # get the language453 my @results = textcat::classify($text);454 @results = ("unknown") if ($#results > 2);455 456 # create language string and remove encoding information457 my $language = join(" or ", @results);458 $language =~ s/\-\w+//g;459 $doc_obj->add_utf8_metadata($thissection, "Language", $language);460 # print "Language: ", time, "-> $language\n";461 462 }463 464 613 # extract acronyms from a section in a document. progress is 465 614 # reported to outhandle based on the verbosity. both the Acronym … … 472 621 473 622 print $outhandle " extracting acronyms ...\n" 474 if ($self->{'verbosity'} > =2);623 if ($self->{'verbosity'} > 2); 475 624 476 625 my $acro_array = &acronym::acronyms($textref); … … 496 645 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string()); 497 646 print $outhandle " adding ". $acro->to_string() . "\n" 498 if ($self->{'verbosity'} > =3);647 if ($self->{'verbosity'} > 3); 499 648 500 649 } 501 650 } 502 651 print $outhandle " done extracting acronyms. \n" 503 if ($self->{'verbosity'} > =2);652 if ($self->{'verbosity'} > 2); 504 653 } 505 654 … … 510 659 511 660 print $outhandle " marking up acronyms ...\n" 512 if ($self->{'verbosity'} > =2);661 if ($self->{'verbosity'} > 2); 513 662 514 663 #self is passed in to check for verbosity ... … … 516 665 517 666 print $outhandle " done marking up acronyms. \n" 518 if ($self->{'verbosity'} > =2);667 if ($self->{'verbosity'} > 2); 519 668 520 669 return $text; … … 522 671 523 672 1; 524 525 526
Note:
See TracChangeset
for help on using the changeset viewer.