Changeset 1844
- Timestamp:
- 2001-01-19T10:35:13+13:00 (23 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/doc.pm
r1732 r1844 130 130 } 131 131 132 sub set_source_encoding { 133 my $self = shift (@_); 134 my ($source_encoding) = @_; 135 136 $self->set_metadata_element ($self->get_top_section(), 137 "gsdlsourceencoding", 138 $source_encoding); 139 } 140 141 # returns the source_encoding as it was provided 142 sub get_source_encoding { 143 my $self = shift (@_); 144 145 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourceencoding"); 146 } 147 132 148 sub _escape_text { 133 149 my ($text) = @_; -
trunk/gsdl/perllib/docsave.pm
r1454 r1844 239 239 "gsdlassocfile", 240 240 "$afile:$assoc_file->[2]:$dir"); 241 } els e{241 } elsif ($self->{'verbosity'} > 2) { 242 242 print $outhandle "docsave::process couldn't copy the associated file " . 243 243 "$assoc_file->[0] to $afile\n"; -
trunk/gsdl/perllib/multiread.pm
r1838 r1844 26 26 # encodings currently supported are 27 27 # 28 # utf8 - either utf8 or unicode (automatically detected) 29 # unicode - just unicode (doesn't currently do endian detection) 30 # gb - GB 31 # iso_8859_1 - extended ascii (iso-8859-1) 32 # iso_8859_6 - 8 bit arabic (iso-8859-6) 33 # windows_1256 - Windows codepage 1256 (Arabic) 34 # windows_1251 - Windows codepage 1251 (Cyrillic) 28 # utf8 - either utf8 or unicode (automatically detected) 29 # unicode - just unicode (doesn't currently do endian detection) 30 # gb - GB 31 # iso_8859_[1-9] - 8 bit extended ascii encodings 32 # windows_125[0-6] - Windows codepages 1250 to 1256 35 33 36 34 package multiread; … … 172 170 173 171 if ($self->{'encoding'} eq "iso_8859_1") { 174 # Latin 1 extended ascii (ISO-8859-1) 172 # special case for iso_8859_1 as &ascii2utf8($char) is faster than 173 # &unicode2utf8(iso2unicode('1', $char)) 175 174 return undef if (eof ($handle)); 176 175 return &unicode::ascii2utf8 (getc ($handle)); 177 176 } 178 179 if ($self->{'encoding'} eq "iso_8859_6") { 180 # 8 bit Arabic (IOS-8859-6) 181 return undef if (eof ($handle)); 182 return &unicode::unicode2utf8(&unicode::arabic2unicode (getc ($handle))); 183 } 184 185 if ($self->{'encoding'} eq "windows_1256") { 186 # Windows 1256 (Arabic) 187 return undef if (eof ($handle)); 188 return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", getc ($handle))); 189 } 190 191 if ($self->{'encoding'} eq "windows_1251") { 192 # Windows 1251 (Cyrillic) 193 return undef if (eof ($handle)); 194 return &unicode::unicode2utf8(&unicode::windows2unicode ("1251", getc ($handle))); 177 178 if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) { 179 return undef if (eof ($handle)); 180 return &unicode::unicode2utf8(&unicode::iso2unicode ($1, getc($handle))); 181 } 182 183 if ($self->{'encoding'} =~ /windows_(\d{4})$/) { 184 return undef if (eof ($handle)); 185 return &unicode::unicode2utf8(&unicode::windows2unicode ($1, getc($handle))); 186 } 187 188 if ($self->{'encoding'} =~ /^koi8_[ru]$/) { 189 return undef if (eof ($handle)); 190 return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, getc($handle))); 195 191 } 196 192 … … 257 253 258 254 if ($self->{'encoding'} eq "iso_8859_1") { 259 # extended ascii (ISO-8859-1) 255 # special case for iso_8859_1 as &ascii2utf8($line) is faster than 256 # &unicode2utf8(iso2unicode('1', $line)) 260 257 my $line = ""; 261 258 if (defined ($line = <$handle>)) { … … 265 262 } 266 263 267 if ($self->{'encoding'} eq "iso_8859_6") { 268 # 8 bit arabic (ISO-8859-6) 269 my $line = ""; 270 if (defined ($line = <$handle>)) { 271 return &unicode::unicode2utf8(&unicode::arabic2unicode ($line)); 272 } 273 return undef; 274 } 275 276 if ($self->{'encoding'} eq "windows_1256") { 277 # Windows 1256 (Arabic) 278 my $line = ""; 279 if (defined ($line = <$handle>)) { 280 return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $line)); 281 } 282 return undef; 283 } 284 285 if ($self->{'encoding'} eq "windows_1251") { 286 # Windows 1251 (Cyrillic) 287 my $line = ""; 288 if (defined ($line = <$handle>)) { 289 return &unicode::unicode2utf8(&unicode::windows2unicode ("1251", $line)); 264 if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) { 265 my $line = ""; 266 if (defined ($line = <$handle>)) { 267 return &unicode::unicode2utf8(&unicode::iso2unicode ($1, $line)); 268 } 269 return undef; 270 } 271 272 if ($self->{'encoding'} =~ /windows_(\d{4})$/) { 273 my $line = ""; 274 if (defined ($line = <$handle>)) { 275 return &unicode::unicode2utf8(&unicode::windows2unicode ($1, $line)); 276 } 277 return undef; 278 } 279 280 if ($self->{'encoding'} =~ /^koi8_[ru]$/) { 281 my $line = ""; 282 if (defined ($line = <$handle>)) { 283 return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $line)); 290 284 } 291 285 return undef; … … 339 333 340 334 if ($self->{'encoding'} eq "iso_8859_1") { 335 # special case for iso_8859_1 as &ascii2utf8($text) is faster than 336 # &unicode2utf8(iso2unicode('1', $text)) 341 337 undef $/; 342 338 my $text = <$handle>; … … 346 342 } 347 343 348 if ($self->{'encoding'} eq "iso_8859_6") { 349 my $text = <$handle>; 350 undef $/; 351 $/ = "\n"; 352 $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode ($text)); 353 return; 354 } 355 356 if ($self->{'encoding'} eq "windows_1256") { 357 undef $/; 358 my $text = <$handle>; 359 $/ = "\n"; 360 $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $text)); 361 return; 362 } 363 364 if ($self->{'encoding'} eq "windows_1251") { 365 undef $/; 366 my $text = <$handle>; 367 $/ = "\n"; 368 $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1251", $text)); 369 return; 370 } 371 } 372 344 if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) { 345 undef $/; 346 my $text = <$handle>; 347 $/ = "\n"; 348 $$outputref .= &unicode::unicode2utf8(&unicode::iso2unicode ($1, $text)); 349 return; 350 } 351 352 if ($self->{'encoding'} =~ /windows_(\d{4})$/) { 353 undef $/; 354 my $text = <$handle>; 355 $/ = "\n"; 356 $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ($1, $text)); 357 return; 358 } 359 360 if ($self->{'encoding'} =~ /^koi8_[ru]$/) { 361 undef $/; 362 my $text = <$handle>; 363 $/ = "\n"; 364 $$outputref .= &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $text)); 365 return; 366 } 367 } 373 368 374 369 1; -
trunk/gsdl/perllib/plugins/BasPlug.pm
r1838 r1844 34 34 use diagnostics; 35 35 use DateExtract; 36 use iso639; 37 38 # if textcat returns an encoding that isn't in this list 39 # we'll print a warning and use the default encoding instead 40 %supported_encodings = ( 41 "ascii" => "", 42 "iso_8859_1" => "", 43 "windows_1252" => "", 44 "iso_8859_2" => "", 45 "windows_1250" => "", 46 "iso_8859_3" => "", 47 "iso_8859_4" => "", 48 "iso_8859_5" => "", 49 "windows_1251" => "", 50 "koi8_r" => "", 51 "koi8_u" => "", 52 "iso_8859_6" => "", 53 "windows_1256" => "", 54 "iso_8859_7" => "", 55 "windows_1253" => "", 56 "iso_8859_8" => "", 57 "windows_1255" => "", 58 "iso_8859_9" => "", 59 "windows_1254" => "", 60 "gb" => "" 61 ); 36 62 37 63 sub print_general_usage { … … 39 65 40 66 print STDERR "\n usage: plugin $plugin_name [options]\n\n"; 41 print STDERR " -input_encoding The encoding of the source documents. Documents will be\n"; 42 print STDERR " converted from these encodings and stored internally as\n"; 43 print STDERR " utf8. The default input_encoding is ascii. Accepted values\n"; 44 print STDERR " are:\n"; 45 print STDERR " iso_8859_1 (extended ascii)\n"; 46 print STDERR " Latin1 (the same as iso-8859-1)\n"; 47 print STDERR " ascii (7 bit ascii -- may be faster than Latin1 as no\n"; 48 print STDERR " conversion is neccessary)\n"; 49 print STDERR " gb (GB or GBK simplified Chinese)\n"; 50 print STDERR " iso_8859_6 (8 bit Arabic)\n"; 51 print STDERR " windows_1256 (Windows codepage 1256 (Arabic))\n"; 52 print STDERR " Arabic (the same as windows_1256)\n"; 53 print STDERR " utf8 (either utf8 or unicode -- automatically detected)\n"; 54 print STDERR " unicode (just unicode -- doesn't currently do endian\n"; 55 print STDERR " detection)\n"; 56 print STDERR " windows_1251 (Windows codepage 1251 (Cyrillic))\n"; 67 57 68 print STDERR " -process_exp A perl regular expression to match against filenames.\n"; 58 69 print STDERR " Matching filenames will be processed by this plugin.\n"; 59 70 print STDERR " Each plugin has its own default process_exp. e.g HTMLPlug\n"; 60 71 print STDERR " defaults to '(?i)\.html?\$' i.e. all documents ending in\n"; 61 print STDERR " .htm or .html (case-insensitive).\n"; 72 print STDERR " .htm or .html (case-insensitive).\n\n"; 73 62 74 print STDERR " -block_exp Files matching this regular expression will be blocked from\n"; 63 75 print STDERR " being passed to any further plugins in the list. This has no\n"; … … 66 78 print STDERR " not have a default block_exp. e.g. by default HTMLPlug blocks\n"; 67 79 print STDERR " any files with .gif, .jpg, .jpeg, .png, .rtf or .css\n"; 68 print STDERR " file extensions.\n"; 80 print STDERR " file extensions.\n\n"; 81 82 83 print STDERR " -input_encoding The encoding of the source documents. Documents will be\n"; 84 print STDERR " converted from these encodings and stored internally as\n"; 85 print STDERR " utf8. The default input_encoding is 'auto'. Accepted values\n"; 86 print STDERR " are:\n"; 87 88 print STDERR " auto: Use text categorization algorithm to automatically\n"; 89 print STDERR " identify the encoding of each source document. This\n"; 90 print STDERR " will be slower than explicitly setting the encoding\n"; 91 print STDERR " but will work where more than one encoding is used\n"; 92 print STDERR " within the same collection.\n"; 93 94 print STDERR " ascii: Plain 7 bit ascii. This may be a little faster than\n"; 95 print STDERR " using iso_8859_1. Beware of using 'ascii' on a collection\n"; 96 print STDERR " of documents that may contain characters outside of plain\n"; 97 print STDERR " 7 bit ascii though (e.g. German or French documents\n"; 98 print STDERR " containing accents), use iso_8859_1 instead.\n"; 99 100 print STDERR " utf8: either utf8 or unicode -- automatically detected\n"; 101 print STDERR " unicode: just unicode\n"; 102 103 print STDERR " iso_8859_1: Latin1 (western european languages)\n"; 104 print STDERR " windows_1252: Windows codepage 1252 (WinLatin1)\n"; 105 106 print STDERR " iso_8859_2: Latin2 (central and eastern european languages)\n"; 107 print STDERR " windows_1250: Windows codepage 1250 (WinLatin2)\n"; 108 109 print STDERR " iso_8859_3: Latin3\n"; 110 111 print STDERR " iso_8859_4: Latin4\n"; 112 113 print STDERR " iso_8859_5: Cyrillic\n"; 114 print STDERR " windows_1251: Windows codepage 1251 (WinCyrillic)\n"; 115 print STDERR " koi8_r: Cyrillic - Russian\n"; 116 print STDERR " koi8_u: Cyrillic - Ukrainian\n"; 117 118 print STDERR " iso_8859_6: Arabic\n"; 119 print STDERR " windows_1256: Windows codepage 1256 (WinArabic)\n"; 120 121 print STDERR " iso_8859_7: Greek\n"; 122 print STDERR " windows_1253: Windows codepage 1253 (WinGreek)\n"; 123 124 print STDERR " iso_8859_8: Hebrew\n"; 125 print STDERR " windows_1255: Windows codepage 1255 (WinHebrew)\n"; 126 127 print STDERR " iso_8859_9: Latin5\n"; 128 print STDERR " windows_1254: Windows codepage 1254 (WinTurkish)\n"; 129 130 print STDERR " gb: GB or GBK simplified Chinese\n\n"; 131 132 print STDERR " -default_encoding If -input_encoding is set to 'auto' and the text categorization\n"; 133 print STDERR " algorithm fails to extract the encoding or extracts an encoding\n"; 134 print STDERR " that is not supported by Greenstone, this encoding will be used\n"; 135 print STDERR " instead. The default is iso_8859_1\n\n"; 136 137 print STDERR " -extract_language Identify the language of each document and set 'Language' metadata. Note\n"; 138 print STDERR " that this will be done automatically if -input_encoding is 'auto'.\n"; 139 print STDERR " -default_language If Greenstone fails to work out what language a document is the\n"; 140 print STDERR " 'Language' metadata element will be set to this value. The default\n"; 141 print STDERR " is 'en' (ISO 639 language symbols should be used - en = English).\n"; 142 print STDERR " Note that if -input_encoding is not set to 'auto' and -extract_language\n"; 143 print STDERR " is not set, all documents will have their 'Language' metadata set to\n"; 144 print STDERR " this value.\n\n"; 145 69 146 print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n\n"; 70 print STDERR " -markup_acronyms Added acronym metadata into document text\n\n"; 71 print STDERR " -extract_langauge Identify the language of the text and set as metadata\n\n"; 72 print STDERR " -first Comma seperated list of first sizes to extract from the text \n"; 73 print STDERR " into a metadata field. The fields are called 'FirstNNN'.\n"; 147 148 print STDERR " -markup_acronyms Add acronym metadata into document text\n\n"; 149 150 print STDERR " -first Comma seperated list of first sizes to extract from the text\n"; 151 print STDERR " into a metadata field. The fields are called 'FirstNNN'.\n\n"; 152 74 153 print STDERR " -extract_email Extract email addresses as metadata\n\n"; 154 75 155 print STDERR " -extract_date Extract dates pertaining to the content of documents about history\n\n"; 76 156 } … … 86 166 my $class = shift (@_); 87 167 my $plugin_name = shift (@_); 88 89 168 my $self = {}; 90 my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode|windows_1251)\$"; 169 170 my $enc = "^("; 171 map {$enc .= "|$_";} keys %supported_encodings; 172 my $denc = $enc . "|utf8|unicode)\$"; 173 $enc .= "|utf8|unicode|auto)\$"; 174 91 175 $self->{'outhandle'} = STDERR; 92 176 my $year = (localtime)[5]+1900; … … 94 178 # general options available to all plugins 95 179 if (!parsargv::parse(\@_, 96 qq^input_encoding/$encodings/ascii^, \$self->{'input_encoding'},97 180 q^process_exp/.*/^, \$self->{'process_exp'}, 98 181 q^block_exp/.*/^, \$self->{'block_exp'}, 182 qq^input_encoding/$enc/auto^, \$self->{'input_encoding'}, 183 qq^default_encoding/$denc/iso_8859_1^, \$self->{'default_encoding'}, 99 184 q^extract_acronyms^, \$self->{'extract_acronyms'}, 100 185 q^extract_email^, \$self->{'extract_email'}, 101 186 q^markup_acronyms^, \$self->{'markup_acronyms'}, 102 187 q^extract_language^, \$self->{'extract_language'}, 188 q^default_language/.{2}/en^, \$self->{'default_language'}, 103 189 q^first/.*/^, \$self->{'first'}, 104 190 q^extract_date^, \$self->{'date_extract'}, 105 "maximum_date/\\d{4}/$year", \$self->{'max_year'},191 qq^maximum_date/\\d{4}/$year^, \$self->{'max_year'}, 106 192 q^no_bibliography^, \$self->{'no_biblio'}, 107 "maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1", 108 \$self->{'max_century'}, 193 qq^maximum_century/-?\\d{1,2}( ?B\\.C\\.E\\.)?/-1^, \$self->{'max_century'}, 109 194 "allow_extra_options")) { 110 195 … … 145 230 $self->{'block_exp'} = $self->get_default_block_exp (); 146 231 } 147 148 # handle input_encoding aliases149 $self->{'input_encoding'} = "iso_8859_1" if $self->{'input_encoding'} eq "Latin1";150 $self->{'input_encoding'} = "windows_1256" if $self->{'input_encoding'} eq "Arabic";151 232 } 152 233 … … 204 285 } 205 286 287 my $outhandle = $self->{'outhandle'}; 288 206 289 my $filename = &util::filename_cat($base_dir, $file); 207 290 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/; … … 211 294 my $plugin_name = ref ($self); 212 295 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up 213 296 297 my ($language, $encoding); 298 if ($self->{'input_encoding'} eq "auto") { 299 # use textcat to automatically work out the input encoding and language 300 ($language, $encoding) = $self->get_language_encoding ($filename); 301 302 } elsif ($self->{'extract_language'}) { 303 # use textcat to get language metadata 304 ($language, $extracted_encoding) = $self->get_language_encoding ($filename); 305 $encoding = $self->{'input_encoding'}; 306 307 if ($extracted_encoding != $encoding && $self->{'verbosity'}) { 308 print $outhandle "$plugin_name: WARNING: $file was read using $encoding encoding but "; 309 print $outhandle "appears to be encoded as $extracted_encoding."; 310 } 311 312 } else { 313 $language = $self->{'default_language'}; 314 $encoding = $self->{'input_encoding'}; 315 } 316 214 317 # create a new document 215 318 my $doc_obj = new doc ($filename, "indexed_doc"); 319 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); 320 $doc_obj->set_source_encoding ($encoding); 321 216 322 217 323 # read in file ($text will be in utf8) 218 324 my $text = ""; 219 $self->read_file ($filename, \$text); 220 221 if ($text !~ /\w/) { 222 my $outhandle = $self->{'outhandle'}; 325 $self->read_file ($filename, $encoding, \$text); 326 327 if (!length ($text)) { 223 328 print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'}; 224 329 return 0; … … 260 365 sub read_file { 261 366 my $self = shift (@_); 262 my ($filename, $ textref) = @_;367 my ($filename, $encoding, $textref) = @_; 263 368 264 369 if (!-r $filename) 265 370 { 266 print STDERR "Read permission denied for $filename\n"; 371 my $outhandle = $self->{'outhandle'}; 372 print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'}; 267 373 return; 268 374 } … … 272 378 open (FILE, $filename) || die "BasPlug::read_file could not open $filename for reading ($!)\n"; 273 379 274 if ($ self->{'input_encoding'}eq "ascii") {380 if ($encoding eq "ascii") { 275 381 undef $/; 276 382 $$textref = <FILE>; … … 279 385 my $reader = new multiread(); 280 386 $reader->set_handle ('BasPlug::FILE'); 281 $reader->set_encoding ($ self->{'input_encoding'});387 $reader->set_encoding ($encoding); 282 388 $reader->read_file ($textref); 283 389 284 if ($ self->{'input_encoding'}eq "gb") {390 if ($encoding eq "gb") { 285 391 # segment the Chinese words 286 392 $$textref = &cnseg::segment($$textref); … … 289 395 290 396 close FILE; 397 } 398 399 # Uses textcat to work out the encoding and language of the text in 400 # $filename. All html tags are removed before processing. 401 # returns an array containing "language" and "encoding" 402 sub get_language_encoding { 403 my $self = shift (@_); 404 my ($filename) = @_; 405 my $outhandle = $self->{'outhandle'}; 406 407 # read in file 408 open (FILE, $filename) || die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n"; 409 undef $/; 410 my $text = <FILE>; 411 $/ = "\n"; 412 close FILE; 413 414 # remove all HTML tags 415 $text =~ s/<[^>]*>//sg; 416 417 # get the language/encoding 418 my @results = textcat::classify($text); 419 420 # foreach $i (@results) { 421 # print STDERR "i: $i\n"; 422 # } 423 424 if (scalar @results != 1) { 425 if ($self->{'input_encoding'} ne 'auto') { 426 if ($self->{'extract_language'} && $self->{'verbosity'}) { 427 print $outhandle "BasPlug: WARNING: language could not be extracted from $filename - "; 428 print $outhandle "defaulting to $self->{'default_language'}\n"; 429 } 430 return ($self->{'default_language'}, $self->{'input_encoding'}); 431 432 } else { 433 if ($self->{'verbosity'}) { 434 print $outhandle "BASPlug: WARNING: language/encoding could not be extracted from $filename - "; 435 print $outhandle "defaulting to $self->{'default_language'}/$self->{'default_encoding'}\n"; 436 } 437 return ($self->{'default_language'}, $self->{'default_encoding'}); 438 } 439 } 440 441 # format language/encoding 442 my ($language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/; 443 $language = $iso639::toiso639{lc($language)}; 444 die "Invalid language\n" if !defined $language; 445 446 if (!defined $encoding) { 447 # if textcat returned no encoding info it is assumed to be iso_8859_1 448 $encoding = "iso_8859_1"; 449 } else { 450 # convert to the format we expect 451 $encoding =~ s/windows/windows_/; 452 $encoding =~ s/iso8859/iso_8859/; 453 $encoding =~ s/^gb.*$/gb/; 454 } 455 456 if (!defined $supported_encodings{$encoding}) { 457 if ($self->{'verbosity'}) { 458 print $outhandle "BasPlug: WARNING: $filename appears to be encoded in an unsupported encoding ($encoding) - "; 459 print $outhandle "using $self->{'default_encoding'}\n"; 460 } 461 $encoding = $self->{'default_encoding'}; 462 } 463 464 return ($language, $encoding); 291 465 } 292 466 … … 351 525 352 526 print $outhandle " extracting email addresses ...\n" 353 if ($self->{'verbosity'} > =2);527 if ($self->{'verbosity'} > 2); 354 528 355 529 my @email = ($$textref =~ m/([-a-z0-9\.@+_=]+@(?:[-a-z0-9]+\.)+(?:com|org|edu|mil|int|[a-z][a-z]))/g); … … 362 536 $doc_obj->add_utf8_metadata ($thissection, "emailAddress", $address); 363 537 print $outhandle " extracting $address\n" 364 if ($self->{'verbosity'} > =3);538 if ($self->{'verbosity'} > 3); 365 539 } 366 540 } 367 541 print $outhandle " done extracting email addresses.\n" 368 if ($self->{'verbosity'} > =2);542 if ($self->{'verbosity'} > 2); 369 543 370 544 } … … 437 611 } 438 612 439 440 # Identify the language of a section and add it to the metadata441 sub extract_language {442 my $self = shift (@_);443 my ($textref, $doc_obj, $thissection) = @_;444 445 # remove all HTML tags446 my $text = $$textref;447 $text =~ s/<P[^>]*>/\n/sgi;448 $text =~ s/<H[^>]*>/\n/sgi;449 $text =~ s/<[^>]*>//sgi;450 $text =~ tr/\n/\n/s;451 452 # get the language453 my @results = textcat::classify($text);454 @results = ("unknown") if ($#results > 2);455 456 # create language string and remove encoding information457 my $language = join(" or ", @results);458 $language =~ s/\-\w+//g;459 $doc_obj->add_utf8_metadata($thissection, "Language", $language);460 # print "Language: ", time, "-> $language\n";461 462 }463 464 613 # extract acronyms from a section in a document. progress is 465 614 # reported to outhandle based on the verbosity. both the Acronym … … 472 621 473 622 print $outhandle " extracting acronyms ...\n" 474 if ($self->{'verbosity'} > =2);623 if ($self->{'verbosity'} > 2); 475 624 476 625 my $acro_array = &acronym::acronyms($textref); … … 496 645 $doc_obj->add_utf8_metadata($thissection, "Acronym", $acro->to_string()); 497 646 print $outhandle " adding ". $acro->to_string() . "\n" 498 if ($self->{'verbosity'} > =3);647 if ($self->{'verbosity'} > 3); 499 648 500 649 } 501 650 } 502 651 print $outhandle " done extracting acronyms. \n" 503 if ($self->{'verbosity'} > =2);652 if ($self->{'verbosity'} > 2); 504 653 } 505 654 … … 510 659 511 660 print $outhandle " marking up acronyms ...\n" 512 if ($self->{'verbosity'} > =2);661 if ($self->{'verbosity'} > 2); 513 662 514 663 #self is passed in to check for verbosity ... … … 516 665 517 666 print $outhandle " done marking up acronyms. \n" 518 if ($self->{'verbosity'} > =2);667 if ($self->{'verbosity'} > 2); 519 668 520 669 return $text; … … 522 671 523 672 1; 524 525 526 -
trunk/gsdl/perllib/plugins/HTMLPlug.pm
r1699 r1844 377 377 $value =~ s/\"$//; 378 378 $value =~ s/\s+/ /gs; 379 print "adding Creator of $value\n";380 379 $doc_obj->add_utf8_metadata($section, "Creator", $value); 381 380 print $outhandle " extracted Creator metadata \"$value\"\n" 382 if ($self->{'verbosity'} > =2);381 if ($self->{'verbosity'} > 2); 383 382 next; 384 383 } … … 405 404 $doc_obj->add_utf8_metadata($section, $field, $value); 406 405 print $outhandle " extracted \"$field\" metadata \"$value\"\n" 407 if ($self->{'verbosity'} > =2);406 if ($self->{'verbosity'} > 2); 408 407 next; 409 408 } … … 427 426 $doc_obj->add_utf8_metadata ($section, $field, $title); 428 427 print $outhandle " extracted \"$field\" metadata \"$title\"\n" 429 if ($self->{'verbosity'} > =2);428 if ($self->{'verbosity'} > 2); 430 429 next; 431 430 } … … 445 444 $doc_obj->add_utf8_metadata ($section, $field, $tmptext); 446 445 print $outhandle " extracted \"$field\" metadata \"$tmptext\"\n" 447 if ($self->{'verbosity'} > =2);446 if ($self->{'verbosity'} > 2); 448 447 next; 449 448 } … … 467 466 $doc_obj->add_utf8_metadata ($section, $tag, $word); 468 467 print $outhandle " extracted \"$tag\" metadata \"$word\"\n" 469 if ($self->{'verbosity'} > =2);468 if ($self->{'verbosity'} > 2); 470 469 } 471 470 } -
trunk/gsdl/perllib/unicode.pm
r1227 r1844 49 49 } 50 50 51 # arabic2unicode takes an 8 bit Arabic string (ISO-8859-6) 52 # and returns a unicode array 53 sub arabic2unicode { 54 my ($in) = @_; 55 my $out = []; 56 57 my $i = 0; 58 my $len = length($in); 59 while ($i < $len) { 60 my $c = ord(substr ($in, $i, 1)); 61 $c += (1567-191) if ($c >= 0x80); 62 push (@$out, $c); 63 $i++; 64 } 65 66 return $out; 67 } 68 69 # windows2unicode takes a windows encoding (e.g. Windows 1256 (Arabic)) 51 # windows2unicode takes a windows encoded string (e.g. Windows 1256 (Arabic)) 70 52 # and returns a unicode array. These encodings are similar to but not 71 53 # identical to the corresponding ISO-8859 encodings. 54 # 55 # $encoding should be the code page name (e.g. '1252') 72 56 # 73 57 # The map files for these encodings should be in unicode/MAPPINGS/WINDOWS … … 78 62 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS", 79 63 "WINDOWS", "$encoding.TXT"); 64 return $out unless &loadmapping ($encoding, $mapfile); 65 66 my $i = 0; 67 my $len = length($in); 68 while ($i < $len) { 69 my $c = ord(substr ($in, $i, 1)); 70 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80); 71 push (@$out, $c); 72 $i++; 73 } 74 75 return $out; 76 } 77 78 # iso2unicode takes an iso-8859 encoded string (e.g. iso-8859-6 (Arabic)) 79 # and returns a unicode array. This function is much like windows2unicode() 80 # except that only characters >= 0xA0 are read from the mapping file (since 81 # all characters below that are the same for all iso-8859 character sets 82 # and therefore already the same as unicode). 83 # 84 # Note that while this function will work for iso-8859-1 (latin 1) it'll be 85 # much faster to use ascii2unicode() or ascii2utf8() 86 # 87 # $encoding should be 1,2,3...,9 depending on which breed of iso-8859 the 88 # encoding is 89 # 90 # The map files for these encodings should be in unicode/MAPPINGS/ISO_8859 91 sub iso2unicode { 92 my ($encoding, $in) = @_; 93 my $out = []; 94 95 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS", 96 "ISO_8859", "$encoding.TXT"); 97 return $out unless &loadmapping ($encoding, $mapfile); 98 99 my $i = 0; 100 my $len = length($in); 101 while ($i < $len) { 102 my $c = ord(substr ($in, $i, 1)); 103 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0); 104 push (@$out, $c); 105 $i++; 106 } 107 108 return $out; 109 } 110 111 # cyrillic2unicode is basically identical to windows2unicode, the only 112 # difference being that the map files live in unicode/MAPPINGS/CYRILLIC 113 # 114 # values for $encoding may be 'koi8_r' or 'koi8_u' 115 sub cyrillic2unicode { 116 my ($encoding, $in) = @_; 117 my $out = []; 118 119 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS", 120 "CYRILLIC", "$encoding.txt"); 80 121 return $out unless &loadmapping ($encoding, $mapfile); 81 122
Note:
See TracChangeset
for help on using the changeset viewer.