Changeset 34129
- Timestamp:
- 2020-05-30T01:01:01+12:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/NutchTextDumpPlugin.pm
r34126 r34129 161 161 # 162 162 # CLEANUP: 163 # Remove MetadataRead functions and inheritance163 # + Remove MetadataRead functions and inheritance 164 164 # 165 165 # QUESTIONS: 166 166 # - encoding = utf-8, changed to "utf8" as required by copied to_utf8(str) method. Why does it not convert 167 167 # the string parameter but fails in decode() step? Is it because the string is already in UTF8? 168 # - Problem converting text with encoding in full set of nutch dump.txt when there encoding is windows-1252 .168 # - Problem converting text with encoding in full set of nutch dump.txt when there encoding is windows-1252 and Shift-JIS. 169 169 # - TODOs 170 170 # 171 # - Should I add metadata as "ex."+meta or as meta? e.g. ex.srcURL or srcURL? 172 # - Want to read in keep_urls_file, maintaining a hashmap of its URLs, only on import, isn't that correct? 173 # Then how can I initialise this only once and only during import? constructor and init() methods are called during buildcol too. 174 # For now, I've done it in can_proc_this_file() but there must be a more appropriate place and correct way to do this? 175 # - why can't I do doc_obj->get_meta_element($section, "ex.srcURL") but have to pass "srcURL" and 1 to ignore 176 # namespace? 177 # - in collectionConfig file I have to leave out ex. prefix for all but Title, why? 178 # - in GLI, browsing classifier sort_leaf options, "ex.srcURL" appears only as "ex.srcurl" (lowercased). Why? 179 # - On the other hand, in GLI's search indexes, both ex.srcurl and ex.srcURL appear. But only building 180 # with an index on ex.srcURL provides a search option in the search box. But then searching on an existing 181 # srcURL produces 0 results anyway. 182 # - Is this all because I am naming my ex metadata names wrongly? e.g. ex.srcURL, ex.siteID, ex.srcDomain. 183 # 171 184 172 # CHECK: 185 173 # - title fallback is URL. 186 # -util::tidy_up_OID() prints warning. 
SiteID is foldername and OIDtype=dirname, so fully numeric 174 # + util::tidy_up_OID() prints warning. SiteID is foldername and OIDtype=dirname, so fully numeric 187 175 # siteID to OID conversion results in warning message that siteID is fully numeric and gets 'D' prefixed. 188 176 # Is this warning still necessary? 189 177 # - Ask about binmode usage (for debugging) in this file 190 178 191 179 # To get all the isMRI results, I ran Robo-3T against our mongodb as … … 210 198 # into our collection. 211 199 # Remember to configure the NutchTextDumpPlugin with option "keep_urls_file" = isMRI_urls.txt to make use of this. 200 # 201 # + ex meta -> don't add with ex. prefix 202 # + check for and call to setup_keep_urls(): move into process() rather than doing this in a more convoluted way in can_process_this_file() 203 # + util::tidy_up_oid() -> print callstack to find why it's called on every segment 204 # X- binmode STDERR: work out what default mode on STDERR is and reset to that after printing debug messages in utf8 binmode 205 # - test collection to check various encodings with and without to_utf8() function - tested collection 00436 in collection cctest3. 206 # The srcURL .../divrey/shaar.htm (Identifier: D00436s184) is in Hebrew and described as being in char encoding iso-8859-8. 207 # But when I paste the build output when using NutchTextDumpPlugin.pm_debug_iso-8859-8 208 # into emacs, the text for this record reads and scrolls R to L in emacs. 209 # When previewing the text in the full text section in GS3, it reads L to R. 210 # The digits used in the text seem to match, occurring in reverse order from each other between emacs and GS3 preview. 211 # Building displays error messages if to_utf8() is called to decode this record's title meta or full text 212 # using the discovered encoding. 
212 213 213 214 sub BEGIN { … … 261 262 #return bless $self, $class; 262 263 $self = bless $self, $class; 263 264 # Can only call any methods on $self AFTER the bless operation above 265 #$self->setup_keep_urls(); # want to set up the keep_urls hashmap only once, so have to do it here (init is also called by buildcol) 266 264 # Can only call any $self->method(); AFTER the bless operation above, so from this point onward 267 265 return $self; 268 266 } 269 267 270 # sub init {271 # my $self = shift (@_);272 # my ($verbosity, $outhandle, $failhandle) = @_;273 274 # if(!$self->{'keep_urls_file'}) {275 # my $msg = "NutchTextDumpPlugin INFO: No urls file provided.\n" .276 # " No records will be filtered.\n";277 # print $outhandle $msg if ($verbosity > 2);278 279 # $self->SUPER::init(@_);280 # return;281 # }282 283 # # read in the keep urls files284 # my $keep_urls_file = &util::locate_config_file($self->{'keep_urls_file'});285 # if (!defined $keep_urls_file)286 # {287 # my $msg = "NutchTextDumpPlugin INFO: Can't locate urls file $keep_urls_file.\n" .288 # " No records will be filtered.\n";289 290 # print $outhandle $msg;291 292 # $self->{'keep_urls'} = undef;293 # # Not an error if there's no $keep_urls_file: it just means all records294 # # in dump.txt will be processed.295 # }296 # else {297 # #$self->{'keep_urls'} = $self->parse_keep_urls_file($keep_urls_file, $outhandle);298 # #$self->{'keep_urls'} = {};299 # $self->parse_keep_urls_file($keep_urls_file, $outhandle, $failhandle);300 # }301 302 ## if($self->{'keep_urls'} && $verbosity > 2) {303 # # print STDERR "@@@@ keep_urls hash map contains:\n";304 # # map { print STDERR $_."=>".$self->{'keep_urls'}->{$_}."\n"; } keys %{$self->{'keep_urls'}};305 ## }306 # $self->SUPER::init(@_);307 # }308 268 309 269 sub setup_keep_urls { … … 316 276 $self->{'keep_urls_processed'} = 1; # flag to track whether this method has been called already during import 317 277 318 #print $outhandle "@@@@ In 
NutchTextDumpPlugin::setup_keep_urls() \n";278 #print $outhandle "@@@@ In NutchTextDumpPlugin::setup_keep_urls() - this method should only be called once and only during import.pl\n"; 319 279 320 280 if(!$self->{'keep_urls_file'}) { … … 352 312 } 353 313 354 # TODO: This is an ugly way to do this anda non-intuitive place to do this. Is there a better way?355 # Overriding can_process_this_file() in order to avoid setting up the keep_urls hashmap during356 # buildcol.pl. We only want to setup the hash during import.357 # During buildcol, this method is called with directories and not files and this method will return358 # false as a result. So when it returns true, it will be import.pl, and we check whether we haven't359 # already setup the keep_urls map. If the keep urls file has not yet been processed, then we set up360 # the hashmap once.361 sub can_process_this_file {362 my $self = shift(@_);363 my ($filename) = @_;364 my $can_process_return_val = $self->SUPER::can_process_this_file(@_);365 366 # We want to load in the keep_urls_file and create the keep_urls hashmap only once, during import367 # Because the keep urls file can be large and it and the hashmap serve no purpose during buildcol.pl.368 # Check whether we've already processed the file/built the hashmap, as we don't want to do this369 # more than 1 time even within just the import cycle.370 if($can_process_return_val && !$self->{'keep_urls_processed'}) { #!defined $self->{'keep_urls'}) {371 $self->setup_keep_urls();372 }373 374 return $can_process_return_val;375 376 }377 314 378 315 sub parse_keep_urls_file { … … 508 445 my $self = shift (@_); 509 446 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_; 447 448 # Only load the urls from the keep_urls_file into a hash if we've not done so before. 
449 # Although this method is called on each dump.txt file found, and we want to only setup_keep_urls() 450 # once for a collection and only during import and not buildcol, it's best to do the check and setup_keep_urls() 451 # call here, because this subroutine, process(), is only called during import() and not during buildcol. 452 # During buildcol, can_process_this_file() is not called on dump.txt files but on folders (archives folder). 453 # Only if this plugin's called on can_process_this_file() is called on a dump.txt, will this process() be called 454 # on each segment of the dump.txt file 455 # So this is the best spot to ensure we've setup_keep_urls() here, if we haven't already: 456 457 if(!$self->{'keep_urls_processed'}) { 458 $self->setup_keep_urls(); 459 } 460 510 461 511 462 my $outhandle = $self->{'outhandle'}; 512 463 my $filename = &util::filename_cat($base_dir, $file); 513 464 465 514 466 my $cursection = $doc_obj->get_top_section(); 515 467 468 # https://perldoc.perl.org/functions/binmode.html 469 # "To mark FILEHANDLE as UTF-8, use :utf8 or :encoding(UTF-8) . :utf8 just marks the data as UTF-8 without further checking, 470 # while :encoding(UTF-8) checks the data for actually being valid UTF-8. More details can be found in PerlIO::encoding." 516 471 # https://stackoverflow.com/questions/27801561/turn-off-binmodestdout-utf8-locally 517 #binmode STDERR, ':utf8'; ## FOR DEBUGGING! To avoid "wide character in print" messages 472 # Is there anything useful here: 473 # https://perldoc.perl.org/PerlIO/encoding.html and https://stackoverflow.com/questions/21452621/binmode-encoding-handling-malformed-data 474 # https://stackoverflow.com/questions/1348639/how-can-i-reinitialize-perls-stdin-stdout-stderr 475 # https://metacpan.org/pod/open::layers 476 #binmode(STDERR, ':utf8'); ## FOR DEBUGGING! To avoid "wide character in print" messages, but modifies globally for process! 
518 477 519 478 #print STDERR "---------------\nDUMP.TXT\n---------\n", $$textref, "\n------------------------\n"; … … 550 509 if $self->{'verbosity'} > 3; 551 510 } 552 $doc_obj->add_utf8_metadata ($cursection, " ex.srcURL", $url);553 $doc_obj->add_utf8_metadata ($cursection, " ex.key", $key);511 $doc_obj->add_utf8_metadata ($cursection, "srcURL", $url); 512 $doc_obj->add_utf8_metadata ($cursection, "key", $key); 554 513 555 514 … … 564 523 my ($domain, $basicDomain) = $url =~ m@(^https?://(?:www\.)?([^/]+)).*@; 565 524 #my ($domain, $protocol, $basicdomain) = $url =~ m@((^https?)://([^/]+)).*@; # Works 566 $doc_obj->add_utf8_metadata ($cursection, " ex.srcDomain", $domain);567 $doc_obj->add_utf8_metadata ($cursection, " ex.basicDomain", $basicDomain);525 $doc_obj->add_utf8_metadata ($cursection, "srcDomain", $domain); 526 $doc_obj->add_utf8_metadata ($cursection, "basicDomain", $basicDomain); 568 527 569 528 } … … 603 562 $encoding = "utf8"; # method to_utf8() recognises "utf8" not "utf-8" 604 563 } else { 605 print STDERR "@@@@@@ WARNING NutchTextDumpPlugin::process(): Record's Nutch-assigned CharEncodingForConversion was not utf-8: $encoding\n";606 }607 564 my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL"); 565 print STDERR "@@@@@@ WARNING NutchTextDumpPlugin::process(): Record's Nutch-assigned CharEncodingForConversion was not utf-8 but $encoding\n\tfor record: $srcURL\n"; 566 } 608 567 609 568 } … … 618 577 # add meta to docObject if both metaname and metavalue are non-empty strings 619 578 if($metaname ne "" && $metavalue ne "") { # && $metaname ne "rs" && $metaname ne "csh") { 620 $doc_obj->add_utf8_metadata ($cursection, "ex.".$metaname, $metavalue); 579 # when no namespace is provided as here, adds as ex. meta. 
580 # Don't explicitly prefix ex., as things becomes convoluted when retrieving meta 581 $doc_obj->add_utf8_metadata ($cursection, $metaname, $metavalue); 621 582 #print STDERR "Added meta |$metaname| = |$metavalue|\n"; #if $metaname =~ m/ProtocolStatus/i; 622 583 } … … 635 596 636 597 # Correct title metadata using encoding, if we have $encoding at last 637 # $title_meta = $self->to_utf8($encoding, $title_meta) if $encoding;638 598 # https://stackoverflow.com/questions/12994100/perl-encode-pm-cannot-decode-string-with-wide-character 639 599 # Error message: "Perl Encode.pm cannot decode string with wide character" … … 643 603 #$title_meta = $self->to_utf8($encoding, $title_meta) if ($encoding); 644 604 } else { # if we have "null" as title metadata, set it to the record URL? 645 my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL" , 1); # TODO: why does ex.srcURL not work, nor srcURL without 3rd param605 my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL"); 646 606 my ($basicURL) = $srcURL =~ m@^https?://(?:www\.)?(.*)$@; # use basicURL for title instead of srcURL, else many docs get classified under "Htt" bucket for https 647 607 if(defined $srcURL) { … … 657 617 # which was crafted to be the siteID. However, because our siteID is all numeric, 658 618 # a D gets prepended to create baseOID. Remove the starting 'D' to get actual siteID. 659 my $siteID = $self->get_ base_OID($doc_obj);660 #print STDERR "BASE OID: " . $s elf->get_base_OID($doc_obj). "\n";619 my $siteID = $self->get_siteID($doc_obj, $file); 620 #print STDERR "BASE OID: " . $siteID . 
"\n"; 661 621 $siteID =~ s/^D//; 662 $doc_obj->add_utf8_metadata ($cursection, " ex.siteID", $siteID);622 $doc_obj->add_utf8_metadata ($cursection, "siteID", $siteID); 663 623 664 624 … … 686 646 my $no_text = 1; 687 647 if($text_start_index != -1) { # had found a "text:start:" marker, so we should have text content for this record 648 688 649 if($$textref =~ m/text:start:\r?\n(.*?)\r?\ntext:end:/) { 689 650 $$textref = $1; … … 718 679 } 719 680 720 681 sub get_siteID { 682 my $self = shift(@_); 683 my ($doc_obj, $file) = @_; 684 685 my $siteID; 686 if ($file =~ /(\d+).txt/) { 687 # file name without extension is site ID, e.g. 00001.txt 688 $siteID = $1; 689 #$siteID = $file; 690 #$siteID =~ s@\.txt$@@; 691 } 692 else { # if($doc_obj->{'OIDtype'} eq "dirname") or even otherwise, just use baseOID 693 # baseOID is the same as site ID when OIDtype is configured to dirname because docs are stored as 00001/dump.txt 694 # siteID has no real meaning in other cases 695 $siteID = $self->{'dirname_siteID'} || $self->get_base_OID($doc_obj); 696 697 } 698 if(!$self->{'siteID'} || $siteID ne $self->{'siteID'}) { 699 $self->{'siteID'} = $siteID; 700 } 701 return $self->{'siteID'}; 702 } 703 704 705 # SplitTextFile::get_base_OID() has the side-effect of calling SUPER::add_OID() 706 # inorder to initialise it. This then ultimately results in calling util::tidy_up_OID() to print warning messages 707 # about all-numeric baseOID requiring the D prefix prepended. 708 # When the base_OID is already set and we want to get the baseOID without that side-effect, because siteID = baseOID 709 # in cases where OIDtype=dirname. 710 # We don't want to recalculate baseOID for each segment, only once per dump.txt file as the superclass SplitTextFile 711 # did it. However, we need access to the baseOID from this plugin 712 # So we override this method to store the calculated baseOID in a variable for use and check if it's set before 713 # calling this method. 
714 # CANNOT override this method in the usual way though: to calculate baseOID once per dump.txt, store it and return 715 # the stored value for each segment because the superclass version of get_base_OID has a side-effect and needs to 716 # continue doing everything it usually does each time the superclass calls this method. 717 sub get_base_OID { 718 my $self = shift(@_); 719 my ($doc_obj) = @_; 720 721 722 # Let this method do what it always did, as it does more than return a value and has important side-effects! 723 # SplitTextPlugin calls this method once for every segment, not just for the base document, with the side-effect 724 # of calculating and adding the OID for each segment. 725 # Therefore, do not return the stored dirname_siteID if already set, as otherwise this method will have 726 # the ominous side-effect of "Warning: D00001s1 already exists with index status I" messages for every segment! 727 # Instead, when trying to work out $siteID (when OIDtype=dirname), check if $self->{'dirname_siteID'} already set 728 # and use that else call this method. 729 #if(!defined $self->{'dirname_siteID'}) { 730 $self->{'dirname_siteID'} = $self->SUPER::get_base_OID($doc_obj); # store for NutchTextDumpPlugin's internal use 731 #} 732 return $self->{'dirname_siteID'}; # return superclass return value as always 733 } 721 734 1;
Note:
See TracChangeset
for help on using the changeset viewer.