Changeset 34122
- Timestamp: 2020-05-26T01:13:33+12:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/NutchTextDumpPlugin.pm
r34121 r34122 35 35 # both sorted by ex.srcURL, and an ex.Title classifier. 36 36 # For the ex.srcDomain classifier, set removeprefix to: https?\:\/\/(www\.)? 37 # An alternative is to build that List classifier on ex.basicDomain instead of ex.srcDomain. 37 38 # Finally, in the "display" format statement, add the following before the "wrappedSectionText" to 38 39 # display the most relevant metadata of each record: … … 77 78 # </div> 78 79 79 # TODO: remove illegible values for metadata _rs_ and _csh_ in the example below before80 # + DONE: remove illegible values for metadata _rs_ and _csh_ in the example below before 80 81 # committing, in case their encoding affects the loading/reading in of this perl file. 81 82 # … … 102 103 # metadata CharEncodingForConversion : utf-8 103 104 # metadata OriginalCharEncoding : utf-8 104 # metadata _rs_ : �105 # metadata _csh_ : 105 # metadata _rs_ : 106 # metadata _csh_ : 106 107 # text:start: 107 108 # Te Kura Kaupapa Māori o Te Whānau Tahi He mihi He mihi Te Kaupapa Ngā Tāngata Te Kākano Te Pihinga Te Tipuranga Te Puāwaitanga Te Tari Te Poari Matua Whakapā mai He mihi He mihi Te Kaupapa Ngā Tāngata Te Kākano Te Pihinga Te Tipuranga Te Puāwaitanga Te Tari Te Poari Matua Whakapā mai TE KURA KAUPAPA MĀORI O TE WHĀNAU TAHI He mihi Kei te mōteatea tonu nei ngā mahara ki te huhua kua mene atu ki te pō, te pōuriuri, te pōtangotango, te pō oti atu rā. Kua rite te wāhanga ki a rātou, hoki mai ki te ao tūroa nei Ko Io Matua Kore te pūtaketanga, te pūkaea, te pūtātara ka rangona whānuitia e te ao. Ko tāna ko ngā whetū, te marama, te haeata ki a Tamanui te rā. He atua i whakateretere mai ai ngā waka i tawhiti nui, i tawhiti roa, i tawhiti mai rā anō. Kei nga ihorei, kei ngā wahapū, kei ngā pukumahara, kei ngā kanohi kai mātārae o tō tātou nei kura Aho Matua, Te Kura Kaupapa Māori o Te Whanau Tahi. Anei rā te maioha ki a koutou katoa e pūmau tonu ki ngā wawata me ngā whakakitenga i whakatakotoria e ngā poupou i te wā i a rātou. 
Ka whakanuia hoki te toru tekau tau o tēnei kura mai i tōna orokohanga timatanga tae noa ki tēnei wā Ka pūmau tōnu mātou ki te whakatauki o te kura e mea ana "Poipoia ō tātou nei pūmanawa" Takiritia tonutia te ra ki runga i Te Kura Kaupapa Maori o Te Whanau Tahi . Back to Top " Poipoia ō tātou nei pūmanawa -  Making our potential a reality "  © Te Kura Kaupapa Māori o Te Whānau Tahi, 2019 Cart ( 0 ) … … 157 158 # - encoding = utf-8, changed to "utf8" as required by copied to_utf8(str) method. Why does it not convert 158 159 # the string parameter but fails in decode() step? Is it because the string is already in UTF8? 160 # - Problem converting text with encoding in full set of nutch dump.txt when there encoding is windows-1252. 161 # - TODOs 162 # 159 163 # - Should I add metadata as "ex."+meta or as meta? e.g. ex.srcURL or srcURL? 160 164 # - Want to read in keep_urls_file, maintaining a hashmap of its URLs, only on import, isn't that correct? 161 165 # Then how can I initialise this only once and only during import? constructor and init() methods are called during buildcol too. 162 166 # For now, I've done it in can_proc_this_file() but there must be a more appropriate place and correct way to do this? 163 # - TODOs164 167 # - why can't I do doc_obj->get_meta_element($section, "ex.srcURL") but have to pass "srcURL" and 1 to ignore 165 168 # namespace?
182 184 183 sub BEGIN { 185 184 @NutchTextDumpPlugin::ISA = ('SplitTextFile'); … … 216 215 my ($pluginlist,$inputargs,$hashArgOptLists) = @_; 217 216 push(@$pluginlist, $class); 218 217 219 218 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments}); 220 219 push(@{$hashArgOptLists->{"OptList"}},$options); 221 220 222 221 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists); 223 222 224 223 if ($self->{'info_only'}) { 225 224 # don't worry about the options 226 225 return bless $self, $class; 227 226 } 228 227 229 228 $self->{'keep_urls_processed'} = 0; 230 229 $self->{'keep_urls'} = undef; 231 $self->{'type'} = ""; # TODO: value can be 'ascii' or other. Used in MARCPlugin.pm. Keep this field here?232 230 233 231 #return bless $self, $class; 234 232 $self = bless $self, $class; 235 233 236 234 # Can only call any methods on $self AFTER the bless operation above 237 235 #$self->setup_keep_urls(); # want to set up the keep_urls hashmap only once, so have to do it here (init is also called by buildcol) 238 236 239 237 return $self; 240 238 } … … 282 280 my $self = shift (@_); 283 281 284 285 286 287 288 289 290 282 my $verbosity = $self->{'verbosity'}; 283 my $outhandle = $self->{'outhandle'}; 284 my $failhandle = $self->{'failhandle'}; 285 286 $self->{'keep_urls_processed'} = 1; # flag to track whether this method has been called already during import 287 288 #print $outhandle "@@@@ In NutchTextDumpPlugin::setup_keep_urls()\n"; 291 289 292 293 294 295 296 297 298 299 290 if(!$self->{'keep_urls_file'}) { 291 my $msg = "NutchTextDumpPlugin INFO: No urls file provided.\n" . 
292 " No records will be filtered.\n"; 293 print $outhandle $msg if ($verbosity > 2); 294 295 return; 296 } 297 300 298 # read in the keep urls files 301 299 my $keep_urls_file = &util::locate_config_file($self->{'keep_urls_file'}); 302 300 if (!defined $keep_urls_file) 303 301 { 304 305 306 307 308 309 310 311 302 my $msg = "NutchTextDumpPlugin INFO: Can't locate urls file $keep_urls_file.\n" . 303 " No records will be filtered.\n"; 304 305 print $outhandle $msg; 306 307 $self->{'keep_urls'} = undef; 308 # TODO: Not a fatal error if $keep_urls_file can't be found: it just means all records 309 # in dump.txt will be processed? 312 310 } 313 311 else { 314 315 316 317 } 318 312 #$self->{'keep_urls'} = $self->parse_keep_urls_file($keep_urls_file, $outhandle); 313 #$self->{'keep_urls'} = {}; 314 $self->parse_keep_urls_file($keep_urls_file, $outhandle, $failhandle); 315 } 316 319 317 #if(defined $self->{'keep_urls'}) { 320 318 # print STDERR "@@@@ keep_urls hash map contains:\n"; 321 319 # map { print STDERR $_."=>".$self->{'keep_urls'}->{$_}."\n"; } keys %{$self->{'keep_urls'}}; 322 320 #} 323 321 324 322 } 325 323 … … 334 332 my $self = shift(@_); 335 333 my ($filename) = @_; 336 337 338 339 340 341 342 343 344 345 346 334 my $can_process_return_val = $self->SUPER::can_process_this_file(@_); 335 336 # We want to load in the keep_urls_file and create the keep_urls hashmap only once, during import 337 # Because the keep urls file can be large and it and the hashmap serve no purpose during buildcol.pl. 338 # Check whether we've already processed the file/built the hashmap, as we don't want to do this 339 # more than 1 time even within just the import cycle. 
340 if($can_process_return_val && !$self->{'keep_urls_processed'}) { #!defined $self->{'keep_urls'}) { 341 $self->setup_keep_urls(); 342 } 343 344 return $can_process_return_val; 347 345 348 346 } … … 350 348 sub parse_keep_urls_file { 351 349 my $self = shift (@_); 352 my ($urls_file, $outhandle, $failhandle) = @_; 350 my ($urls_file, $outhandle, $failhandle) = @_; 351 352 # https://www.caveofprogramming.com/perl-tutorial/perl-hashes-a-guide-to-associative-arrays-in-perl.html 353 # https://stackoverflow.com/questions/1817394/whats-the-difference-between-a-hash-and-hash-reference-in-perl 354 $self->{'keep_urls'} = {}; # hash reference init to {} 353 355 354 # https://www.caveofprogramming.com/perl-tutorial/perl-hashes-a-guide-to-associative-arrays-in-perl.html 355 # https://stackoverflow.com/questions/1817394/whats-the-difference-between-a-hash-and-hash-reference-in-perl 356 #my %urls_map = (); # hash init to () 357 $self->{'keep_urls'} = {}; # hash reference init to {} 358 359 # What if it is a very long file of URLs? Need to read a line at a time! 360 #my $contents = &FileUtils::readUTF8File($urls_file); # could just call $self->read_file() inherited from SplitTextFile's parent ReadTextFile 361 #my @lines = split(/(?:\r?\n)+/, $$textref); 362 363 # Open the file in UTF-8 mode https://stackoverflow.com/questions/2220717/perl-read-file-with-encoding-method 364 # and read in line by line into map 365 my $fh; 366 if (open($fh,'<:encoding(UTF-8)', $urls_file)) { 367 while (defined (my $line = <$fh>)) { 368 $line = &util::trim($line); #$line =~ s/^\s+|\s+$//g; # trim whitespace 369 if($line =~ m@^https?://@) { # add only URLs 370 #%urls_map{$line} = 1; # add the url to our perl hash 371 $self->{'keep_urls'}->{$line} = 1; 372 } 373 } 374 close $fh; 375 } else { 376 my $msg = "NutchTextDumpPlugin ERROR: Unable to open file keep_urls_file: \"" . 377 $self->{'keep_urls_file'} . "\".\n " . 
378 " No records will be filtered.\n"; 379 print $outhandle $msg; 380 print $failhandle $msg; 381 # Not fatal. TODO: should it be fatal when it can still process all URLs just because 382 # it can't find the specified keep-urls.txt file? 356 # What if it is a very long file of URLs? Need to read a line at a time! 357 #my $contents = &FileUtils::readUTF8File($urls_file); # could just call $self->read_file() inherited from SplitTextFile's parent ReadTextFile 358 #my @lines = split(/(?:\r?\n)+/, $$textref); 359 360 # Open the file in UTF-8 mode https://stackoverflow.com/questions/2220717/perl-read-file-with-encoding-method 361 # and read in line by line into map 362 my $fh; 363 if (open($fh,'<:encoding(UTF-8)', $urls_file)) { 364 while (defined (my $line = <$fh>)) { 365 $line = &util::trim($line); #$line =~ s/^\s+|\s+$//g; # trim whitespace 366 if($line =~ m@^https?://@) { # add only URLs 367 $self->{'keep_urls'}->{$line} = 1; # add the url to our perl hash 368 } 383 369 } 384 385 # if keep_urls hash is empty, ensure it is undefined from this point onward 386 # https://stackoverflow.com/questions/9444915/how-to-check-if-a-hash-is-empty-in-perl 387 my %urls_map = $self->{'keep_urls'}; 388 if(!keys %urls_map) { 389 $self->{'keep_urls'} = undef; 390 } 391 392 #return %urls_map; 393 } 394 370 close $fh; 371 } else { 372 my $msg = "NutchTextDumpPlugin ERROR: Unable to open file keep_urls_file: \"" . 373 $self->{'keep_urls_file'} . "\".\n " . 374 " No records will be filtered.\n"; 375 print $outhandle $msg; 376 print $failhandle $msg; 377 # Not fatal. TODO: should it be fatal when it can still process all URLs just because 378 # it can't find the specified keep-urls.txt file? 
379 } 380 381 # if keep_urls hash is empty, ensure it is undefined from this point onward 382 # https://stackoverflow.com/questions/9444915/how-to-check-if-a-hash-is-empty-in-perl 383 my %urls_map = $self->{'keep_urls'}; 384 if(!keys %urls_map) { 385 $self->{'keep_urls'} = undef; 386 } 387 388 } 389 390 # Accept "dump.txt" files (which are in numeric siteID folders), 391 # and txt files with numeric siteID, e.g. "01441.txt" 392 # if I preprocessed dump.txt files by renaming them this way. 395 393 sub get_default_process_exp { 396 394 my $self = shift (@_); 397 395 398 396 return q^(?i)((dump|\d+)\.txt)$^; 399 397 } … … 401 399 402 400 sub get_default_split_exp { 403 401 404 402 # prev line is either a new line or start of dump.txt 405 403 # current line should start with url protocol and contain " key: .... http(s)/" 406 404 # \r\n for msdos eol, \n for unix 407 405 408 #return q^($|\r?\n)https?://\w+\s+key:\s+\w+https?/^; 409 #return q^\r?\n(text:end:|metadata _csh_ :)\r?\n\r?\n^; 410 411 #return q^(\r?\n)*https?://\w+\s+key:\s+\w+https?/\s*\r?\n^; 412 413 414 #return q^(?:$|\r?\n\r?\n)(https?://.+?\skey:\s+.*?https?/)^; 415 416 417 #return q^($|\r?\n\r?\n)https?://^; 418 419 #return q^\r?\n(text:end:)\r?\n\r?\n^; 420 421 # return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^; 422 423 424 # split by default throws away delimiter 406 # The regex return value of this method is passed into a call to perl split. 407 # Perl's split(), by default throws away delimiter 425 408 # Any capturing group that makes up or is part of the delimiter becomes a separate element returned by split 426 409 # We want to throw away the empty newlines preceding the first line of a record "https? .... 
key: https?/" … … 429 412 # https://stackoverflow.com/questions/14907772/split-but-keep-delimiter 430 413 # - To skip the unwanted empty lines preceding the first line of a record use ?: in front of its capture group 431 414 # to discard that group: 432 415 # https://stackoverflow.com/questions/3512471/what-is-a-non-capturing-group-in-regular-expressions 433 416 # - Next use a positive look-ahead (?= in front of capture group, vs ?! for negative look ahead) 434 435 417 # to match but not capture the first line of a record (so the look-ahead matched is retained as the 418 # first line of the next record): 436 419 # https://stackoverflow.com/questions/14907772/split-but-keep-delimiter 437 420 # and http://www.regular-expressions.info/lookaround.html … … 439 422 # https://stackoverflow.com/questions/11898998/how-can-i-write-a-regex-which-matches-non-greedy 440 423 return q^(?:$|\r?\n\r?\n)(?=https?://.+?\skey:\s+.*?https?/)^; 441 442 } 443 444 # TODO: COPIED METHOD STRAIGHT FROM MarcPlugin.pm - move to a utility perl file? 424 425 } 426 427 # TODO: Copied method from MARCPlugin.pm and uncommented return statement when encoding = utf8 428 # Move to a utility perl file, since code is mostly shared? 445 429 # The bulk of this function is based on read_line in multiread.pm 446 430 # Unable to use read_line original because it expects to get its input 447 431 # from a file. Here the line to be converted is passed in as a string 432 433 # TODO: 434 # Is this function even applicable to NutchTextDumpPlugin? 435 # I get errors in this method when encoding is utf-8 in the decode step. 436 # I get warnings/errors somewhere in this file (maybe also at decode) when encoding is windows-1252. 448 437 449 438 sub to_utf8 … … 454 443 if ($encoding eq "utf8") { 455 444 # nothing needs to be done 456 #return $line;457 } elsif ($encoding eq "iso_8859_1" ) {445 return $line; 446 } elsif ($encoding eq "iso_8859_1" || $encoding eq "windows-1252") { # TODO: do this also for windows-1252? 
458 447 # we'll use ascii2utf8() for this as it's faster than going 459 448 # through convert2unicode() … … 462 451 } else { 463 452 464 465 453 # everything else uses unicode::convert2unicode 454 $line = &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line)); 466 455 } 467 456 # At this point $line is a binary byte string … … 469 458 # Unicode aware pattern matching can be used. 470 459 # For instance: 's/\x{0101}//g' or '[[:upper:]]' 471 460 472 461 return decode ("utf8", $line); 473 462 } … … 480 469 my $self = shift (@_); 481 470 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; 482 471 483 472 my $outhandle = $self->{'outhandle'}; 484 473 my $filename = &util::filename_cat($base_dir, $file); 485 474 486 475 my $cursection = $doc_obj->get_top_section(); 487 488 476 477 489 478 #print STDERR "---------------\nDUMP.TXT\n---------\n", $$textref, "\n------------------------\n"; 490 479 491 480 492 # (1) parse out the metadata of this record 493 my $metaname; 494 my $encoding; 495 my $title_meta; 496 497 my $line_index = 0; 498 my $text_start_index = -1; 499 my @lines = split(/(?:\r?\n)+/, $$textref); 500 501 foreach my $line (@lines) { 502 # first line is special and contains the URL (no metaname) 503 # and the inverted URL labelled with metaname "key" 504 if($line =~ m/^https?/ && $line =~ m/\s+key:\s+/) { 505 my @vals = split(/key:/, $line); 506 my $url = $vals[0]; 507 my $key = $vals[1]; 508 # trim whitespace https://perlmaven.com/trim 509 $url = &util::trim($url); #=~ s/^\s+|\s+$//g; 510 $key = &util::trim($key); #=~ s/^\s+|\s+$//g; 511 512 # if we have a keep_urls hash, then only process records of whitelisted urls 513 if(defined $self->{'keep_urls'} && !$self->{'keep_urls'}->{$url}) { 514 # URL not whitelisted, so stop processing this record 515 print STDERR "@@@@@@ INFO NutchTextDumpPlugin::process(): discarding record for URL not whitelisted: $url\n" 516 if $self->{'verbosity'} > 3; 517 return 0; 518 } else { 519 
print STDERR "@@@@@@ INFO NutchTextDumpPlugin::process(): processing record of whitelisted URL $url...\n" 520 if $self->{'verbosity'} > 3; 521 } 522 $doc_obj->add_utf8_metadata ($cursection, "ex.srcURL", $url); 523 $doc_obj->add_utf8_metadata ($cursection, "ex.key", $key); 524 525 # # let's also set the domain from the URL, as that will make a 526 # # more informative bookshelf label than siteID 527 # my $domain = $url; 528 # # remove protocol:// and everything after and including subsequent slash 529 # $domain =~ s@^https?://([^/]+).*@$1@; 530 # #$domain =~ s@^https?://@@; # remove protocol 531 # #$domain =~ s@/.*$@@; # now remove everything after first slash 532 # my $protocol = $url;# =~ s@(^https?).*$@@; 533 # $protocol =~ s@(^https?).*$@$1@; 534 # $domain = $protocol."://".$domain; 535 # #$domain =~ s@[\.\-]@@g; 536 # #$domain = "pinky"; 537 # $doc_obj->add_utf8_metadata ($cursection, "ex.srcDomain", $domain); 538 539 540 # let's also set the domain from the URL, as that will make a 541 # more informative bookshelf label than siteID 542 # For complete domain, keep protocol:// and every non-slash after, 543 # without requiring presence of subsequent slash 544 # https://stackoverflow.com/questions/3652527/match-regex-and-assign-results-in-single-line-of-code 545 # Can clean up protocol and www. 
in bookshelf's remove_prefix option 546 547 my ($domain, $basicDomain) = $url =~ m@(^https?://(?:www\.)?([^/]+)).*@; 548 549 # For domain, the following removes protocol:// and 550 # everything after and including subsequent slash, without requiring subsequent slash 551 #my ($domain, $protocol, $basicdomain) = $url =~ m@((^https?)://([^/]+)).*@; # Works 552 #my ($protocol, $basicdomain) = $url =~ m@(^https?)://([^/]+).*@; # Should work 553 #my $domain = $protocol."://".$basicdomain; 554 $doc_obj->add_utf8_metadata ($cursection, "ex.srcDomain", $domain); 555 $doc_obj->add_utf8_metadata ($cursection, "ex.basicDomain", $basicDomain); 556 481 # (1) parse out the metadata of this record 482 my $metaname; 483 my $encoding; 484 my $title_meta; 485 486 my $line_index = 0; 487 my $text_start_index = -1; 488 my @lines = split(/(?:\r?\n)+/, $$textref); 489 490 foreach my $line (@lines) { 491 # first line is special and contains the URL (no metaname) 492 # and the inverted URL labelled with metaname "key" 493 if($line =~ m/^https?/ && $line =~ m/\s+key:\s+/) { 494 my @vals = split(/key:/, $line); 495 # get url and key, and trim whitespace simultaneously 496 my $url = &util::trim($vals[0]); 497 my $key = &util::trim($vals[1]); 498 499 # if we have a keep_urls hash, then only process records of whitelisted urls 500 if(defined $self->{'keep_urls'} && !$self->{'keep_urls'}->{$url}) { 501 # URL not whitelisted, so stop processing this record 502 print STDERR "@@@@@@ INFO NutchTextDumpPlugin::process(): discarding record for URL not whitelisted: $url\n" 503 if $self->{'verbosity'} > 3; 504 return 0; 505 } else { 506 print STDERR "@@@@@@ INFO NutchTextDumpPlugin::process(): processing record of whitelisted URL $url...\n" 507 if $self->{'verbosity'} > 3; 508 } 509 $doc_obj->add_utf8_metadata ($cursection, "ex.srcURL", $url); 510 $doc_obj->add_utf8_metadata ($cursection, "ex.key", $key); 511 512 513 # let's also set the domain from the URL, as that will make a 514 # more informative 
bookshelf label than siteID 515 # For complete domain, keep protocol:// and every non-slash after. 516 # (This avoids requiring presence of subsequent slash) 517 # https://stackoverflow.com/questions/3652527/match-regex-and-assign-results-in-single-line-of-code 518 # Can clean up protocol and www. in List classifier's bookshelf's remove_prefix option 519 # or can build classifier on basicDomain instead. 520 521 my ($domain, $basicDomain) = $url =~ m@(^https?://(?:www\.)?([^/]+)).*@; 522 #my ($domain, $protocol, $basicdomain) = $url =~ m@((^https?)://([^/]+)).*@; # Works 523 $doc_obj->add_utf8_metadata ($cursection, "ex.srcDomain", $domain); 524 $doc_obj->add_utf8_metadata ($cursection, "ex.basicDomain", $basicDomain); 525 526 } 527 # check for full text 528 elsif ($line =~ m/text:start:/) { 529 $text_start_index = $line_index; 530 last; # if we've reached the full text portion, we're past the metadata portion of this record 531 } 532 elsif($line =~ m/^[^:]+:.+$/) { # look for meta #elsif($line =~ m/^[^:]+:[^:]+$/) { # won't allow protocol://url in metavalue 533 my @metakeyvalues = split(/:/, $line); # split on first : 534 535 my $metaname = shift(@metakeyvalues); 536 my $metavalue = join("", @metakeyvalues); 537 538 # skip "metadata _rs_" and "metadata _csh_" as these contain illegible characters for values 539 if($metaname !~ m/metadata\s+_(rs|csh)_/) { 540 541 # trim whitespace 542 $metaname = &util::trim($metaname); 543 $metavalue = &util::trim($metavalue); 544 545 if($metaname eq "title") { # TODO: what to do about "title: null" cases? 546 ##print STDERR "@@@@ Found title: $metavalue\n"; 547 #$metaname = "Title"; # will set "title" as "Title" metadata instead 548 # TODO: treat title metadata specially by using character encoding to store correctly? 
549 550 # Won't add Title metadata to docObj until after all meta is processed, 551 # when we'll know encoding and can process title meta 552 $title_meta = $metavalue; 553 $metavalue = ""; # will force ex.Title metadata to be added AFTER for loop 557 554 } 558 # check for full text 559 elsif ($line =~ m/text:start:/) { 560 $text_start_index = $line_index; 561 last; # if we've reached the full text portion, we're past the metadata portion of this record 555 elsif($metaname =~ m/CharEncodingForConversion/) { # TODO: or look for "OriginalCharEncoding"? 556 ##print STDERR "@@@@ Found encoding: $metavalue\n"; 557 $encoding = $metavalue; # TODO: should we use this to interpret the text and title in the correct encoding and convert to utf-8? 558 559 if($encoding eq "utf-8") { 560 $encoding = "utf8"; # method to_utf8() recognises "utf8" not "utf-8" 561 } else { 562 print STDERR "@@@@@@ WARNING NutchTextDumpPlugin::process(): Record's Nutch-assigned CharEncodingForConversion was not utf-8: $encoding\n"; 563 } 564 565 562 566 } 563 elsif($line =~ m/^[^:]+:.+$/) { # look for meta #elsif($line =~ m/^[^:]+:[^:]+$/) { # won't allow protocol://url in metavalue 564 my @metakeyvalues = split(/:/, $line); 565 #my $metaname = $metakeyvalues[0]; 566 #my $metavalue = $metakeyvalues[1]; 567 my $metaname = shift(@metakeyvalues); 568 my $metavalue = join("", @metakeyvalues); 569 570 # skip "metadata _rs_" and "metadata _csh_" as these contain illegible characters for values 571 if($metaname !~ m/metadata\s+_(rs|csh)_/) { 572 573 # trim whitespace 574 $metaname = &util::trim($metaname); #=~ s/^\s+|\s+$//g; 575 $metavalue = &util::trim($metavalue); #=~ s/^\s+|\s+$//g; 576 577 if($metaname eq "title") { # TODO: what to do about "title: null" cases? 578 ##print STDERR "@@@@ Found title: $metavalue\n"; 579 #$metaname = "Title"; # set this as ex.Title metadata 580 # TODO: treat title metadata specially by using character encoding to store correctly? 
581 582 # won't add Title metadata to docObj until after all meta is processed, when we'll know encoding and can process title meta 583 $title_meta = $metavalue; 584 $metavalue = ""; 585 } 586 elsif($metaname =~ m/CharEncodingForConversion/) { # TODO: or look for "OriginalCharEncoding"? 587 ##print STDERR "@@@@ Found encoding: $metavalue\n"; 588 $encoding = $metavalue; # TODO: should we use this to interpret the text and title in the correct encoding and convert to utf-8? 589 590 if($encoding eq "utf-8") { 591 $encoding = "utf8"; # method to_utf8() recognises "utf8" not "utf-8" 592 } else { 593 print STDERR "@@@@@@ WARNING NutchTextDumpPlugin::process(): Record's Nutch-assigned CharEncodingForConversion was not utf-8: $encoding\n"; 594 } 595 596 597 } 598 599 # move occurrences of "marker " or "metadata " strings at start of metaname to end 600 #$metaname =~ s/^(marker|metadata)\s+(.*)$/$2$1/; 601 # remove "marker " or "metadata " strings from start of metaname 602 $metaname =~ s/^(marker|metadata)\s+//; 603 # remove underscores and all remaining spaces in metaname 604 $metaname =~ s/[ _]//g; 605 606 # add meta to docObject if both metaname and metavalue are non-empty strings 607 if($metaname ne "" && $metavalue ne "") { # && $metaname ne "rs" && $metaname ne "csh") { 608 $doc_obj->add_utf8_metadata ($cursection, "ex.".$metaname, $metavalue); 609 #print STDERR "Added meta |$metaname| = |$metavalue|\n"; #if $metaname =~ m/ProtocolStatus/i; 610 } 611 612 } 613 } elsif ($line !~ m/^\s*$/) { # Not expecting any other type of non-empty line (or even empty lines) 614 print STDERR "NutchTextDump line not recognised as URL meta, other metadata or text content:\n\t$line\n"; 567 568 # move occurrences of "marker " or "metadata " strings at start of metaname to end 569 #$metaname =~ s/^(marker|metadata)\s+(.*)$/$2$1/; 570 # remove "marker " or "metadata " strings from start of metaname 571 $metaname =~ s/^(marker|metadata)\s+//; 572 # remove underscores and all remaining 
spaces in metaname 573 $metaname =~ s/[ _]//g; 574 575 # add meta to docObject if both metaname and metavalue are non-empty strings 576 if($metaname ne "" && $metavalue ne "") { # && $metaname ne "rs" && $metaname ne "csh") { 577 $doc_obj->add_utf8_metadata ($cursection, "ex.".$metaname, $metavalue); 578 #print STDERR "Added meta |$metaname| = |$metavalue|\n"; #if $metaname =~ m/ProtocolStatus/i; 615 579 } 616 580 617 $line_index++; 581 } 582 } elsif ($line !~ m/^\s*$/) { # Not expecting any other type of non-empty line (or even empty lines) 583 print STDERR "NutchTextDump line not recognised as URL meta, other metadata or text content:\n\t$line\n"; 618 584 } 619 585 620 586 $line_index++; 587 } 588 589 621 590 # Add fileFormat as the metadata 622 591 $doc_obj->add_metadata($cursection, "FileFormat", "NutchDumpTxt"); 623 624 # Correct title metadata using encoding, if we have $encoding at last 625 # $title_meta = $self->to_utf8($encoding, $title_meta) if $encoding; 626 # https://stackoverflow.com/questions/12994100/perl-encode-pm-cannot-decode-string-with-wide-character 627 # Error message: "Perl Encode.pm cannot decode string with wide character" 628 # "That error message is saying that you have passed in a string that has already been decoded 629 # (and contains characters above codepoint 255). You can't decode it again." 630 if($title_meta && $title_meta ne "" && $title_meta ne "null") { 631 $title_meta = $self->to_utf8($encoding, $title_meta) if ($encoding && $encoding ne "utf8"); 632 } else { # if we have "null" as title metadata, set it to the record URL? 
633 #my $srcURLs = $doc_obj->get_metadata($cursection, "ex.srcURL"); 634 #print STDERR "@@@@ null title to be replaced with ".$srcURLs->[0]."\n"; 635 #$title_meta = $srcURLs->[0] if (scalar @$srcURLs > 0); 636 my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL", 1); # TODO: why does ex.srcURL not work, nor srcURL without 3rd param 637 if(defined $srcURL) { 638 print STDERR "@@@@ null/empty title to be replaced with ".$srcURL."\n" 639 if $self->{'verbosity'} > 3; 640 $title_meta = $srcURL; 641 } 592 593 # Correct title metadata using encoding, if we have $encoding at last 594 # $title_meta = $self->to_utf8($encoding, $title_meta) if $encoding; 595 # https://stackoverflow.com/questions/12994100/perl-encode-pm-cannot-decode-string-with-wide-character 596 # Error message: "Perl Encode.pm cannot decode string with wide character" 597 # "That error message is saying that you have passed in a string that has already been decoded 598 # (and contains characters above codepoint 255). You can't decode it again." 599 if($title_meta && $title_meta ne "" && $title_meta ne "null") { 600 $title_meta = $self->to_utf8($encoding, $title_meta) if ($encoding); 601 } else { # if we have "null" as title metadata, set it to the record URL? 
602 #my $srcURLs = $doc_obj->get_metadata($cursection, "ex.srcURL"); 603 #print STDERR "@@@@ null title to be replaced with ".$srcURLs->[0]."\n"; 604 #$title_meta = $srcURLs->[0] if (scalar @$srcURLs > 0); 605 my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL", 1); # TODO: why does ex.srcURL not work, nor srcURL without 3rd param 606 if(defined $srcURL) { 607 print STDERR "@@@@ null/empty title to be replaced with ".$srcURL."\n" 608 if $self->{'verbosity'} > 3; 609 $title_meta = $srcURL; 642 610 } 643 $doc_obj->add_utf8_metadata ($cursection, "Title", $title_meta); 644 645 646 611 } 612 $doc_obj->add_utf8_metadata ($cursection, "Title", $title_meta); 613 614 615 # When importOption OIDtype = dirname, the base_OID will be that dirname 616 # which was crafted to be the siteID. However, because our siteID is all numeric, 617 # a D gets prepended to create baseOID. Remove the starting 'D' to get actual siteID. 647 618 my $siteID = $self->get_base_OID($doc_obj); 648 619 #print STDERR "BASE OID: " . $self->get_base_OID($doc_obj) . "\n"; 649 # remove the 'D' that was inserted by a superclass in front of the all-numeric siteID to become baseOID:650 620 $siteID =~ s/^D//; 651 621 $doc_obj->add_utf8_metadata ($cursection, "ex.siteID", $siteID); 652 653 654 # (2) parse out text of this record 655 # if($text_start_index != -1 && pop(@lines) =~ m/text:end:/) { # we only have text content if there were "text:start:" and "text:end:" markers. 656 # # TODO: are we guaranteed popped line is text:end: and not empty/newline? 657 # @lines = splice(@lines,0,$text_start_index+1); # just keep every line AFTER text:start:, have already removed (popped) "text:end:" 658 659 # # glue together remaining lines, if there are any, into textref 660 # # https://stackoverflow.com/questions/7406807/find-size-of-an-array-in-perl 661 # if(scalar (@lines) > 0) { 662 # # TODO: do anything with $encoding to convert line to utf-8? 
663 # foreach my $line (@lines) { 664 # $line = $self->to_utf8($encoding, $line) if $encoding; #if $encoding ne "utf-8"; 665 # $$textref .= $line."\n"; 666 # } 667 # } 668 # $$textref = "<pre>\n".$$textref."</pre>"; 669 # } else { 670 # print STDERR "WARNING: NutchTextDumpPlugin::process: had found a text start marker but not text end marker.\n"); 671 # $$textref = "<pre></pre>"; 672 # } 622 623 624 # (2) parse out text of this record 625 # if($text_start_index != -1 && pop(@lines) =~ m/text:end:/) { # we only have text content if there were "text:start:" and "text:end:" markers. 626 # # TODO: are we guaranteed popped line is text:end: and not empty/newline? 627 # @lines = splice(@lines,0,$text_start_index+1); # just keep every line AFTER text:start:, have already removed (popped) "text:end:" 673 628 674 my $no_text = 1; 675 if($text_start_index != -1) { # had found a "text:start:" marker, so we should have text content for this record 676 if($$textref =~ m/text:start:\r?\n(.*?)\r?\ntext:end:/) { 677 $$textref = $1; 678 if($$textref !~ m/^\s*$/) { 679 $$textref = $self->to_utf8($encoding, $$textref) if ($encoding && $encoding ne "utf8"); 680 $$textref = "<pre>\n".$$textref."\n</pre>"; 681 $no_text = 0; 682 } 683 } 629 # # glue together remaining lines, if there are any, into textref 630 # # https://stackoverflow.com/questions/7406807/find-size-of-an-array-in-perl 631 # if(scalar (@lines) > 0) { 632 # # TODO: do anything with $encoding to convert line to utf-8? 
633 # foreach my $line (@lines) { 634 # $line = $self->to_utf8($encoding, $line) if $encoding; #if $encoding ne "utf-8"; 635 # $$textref .= $line."\n"; 636 # } 637 # } 638 # $$textref = "<pre>\n".$$textref."</pre>"; 639 # } else { 640 # print STDERR "WARNING: NutchTextDumpPlugin::process: had found a text start marker but not text end marker.\n"; 641 # $$textref = "<pre></pre>"; 642 # } 643 644 # (2) parse out text of this record 645 my $no_text = 1; 646 if($text_start_index != -1) { # had found a "text:start:" marker, so we should have text content for this record 647 if($$textref =~ m/text:start:\r?\n(.*?)\r?\ntext:end:/) { 648 $$textref = $1; 649 if($$textref !~ m/^\s*$/) { 650 $$textref = $self->to_utf8($encoding, $$textref) if ($encoding); 651 $$textref = "<pre>\n".$$textref."\n</pre>"; 652 $no_text = 0; 653 } 684 654 } 685 if($no_text) { 686 $$textref = "<pre></pre>"; 687 } 688 689 # Debugging 690 # To avoid "wide character in print" messages for debugging, set binmode of handle to utf8/encoding 691 # https://stackoverflow.com/questions/15210532/use-of-use-utf8-gives-me-wide-character-in-print 692 # if ($self->{'verbosity'} > 3) { 693 # if($encoding && $encoding eq "utf8") { 694 # binmode STDERR, ':utf8'; 695 # } 696 697 # print STDERR "TITLE: $title_meta\n"; 698 # print STDERR "ENCODING = $encoding\n" if $encoding; 699 # #print STDERR "---------------\nTEXT CONTENT\n---------\n", $$textref, "\n------------------------\n"; 700 # } 701 702 655 } 656 if($no_text) { 657 $$textref = "<pre></pre>"; 658 } 659 660 # Debugging 661 # To avoid "wide character in print" messages for debugging, set binmode of handle to utf8/encoding 662 # https://stackoverflow.com/questions/15210532/use-of-use-utf8-gives-me-wide-character-in-print 663 # if ($self->{'verbosity'} > 3) { 664 # if($encoding && $encoding eq "utf8") { 665 # binmode STDERR, ':utf8'; 666 # } 667 668 # print STDERR "TITLE: $title_meta\n"; 669 # print STDERR "ENCODING = $encoding\n" if $encoding; 670 # #print 
STDERR "---------------\nTEXT CONTENT\n---------\n", $$textref, "\n------------------------\n"; 671 # } 672 673 703 674 $doc_obj->add_utf8_text($cursection, $$textref); 704 675
Note:
See TracChangeset
for help on using the changeset viewer.