- Timestamp: 2020-06-05T19:51:38+12:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/NutchTextDumpPlugin.pm
r34131 r34137 143 143 no strict 'refs'; # allow filehandles to be variables and viceversa 144 144 145 146 # Seems to be 147 # nohup command 148 # Not: nohup command > bla.txt 2&>1 & 149 # nor even: nohup command & 150 # nohup.out (possibly both STDERR and STDOUT, do a quick test first and then delete nohup.out before re-running) 151 # in the folder the command is run 152 # Delete nohup.out when re-running command. 153 # Tripped up and unhappy only when commands require keyboard input at any stage. 154 # 155 # 145 156 # TODO: 157 # Use "od" to print out bytevalues of the dump.txt file to check _rs_ and _csh_ 158 # Also google Nutch about what those fields mean. 159 # od -a 160 # every byte as ASCII character 161 # od -ab 162 # ASCII and bytevalue: 163 # First comes byteoffset and then ascii character (sp for space). Line underneath the numeric byte values in hex of the individual characters. 164 # 146 165 # + 1. Split each dump.txt file into its individual records as individual docs 147 166 # + 2. Store the meta of each individual record/doc 148 167 # ?3. Name each doc, siteID.docID else HASH internal text. See EmailPlugin? 149 # -In SplitTextFile::read(), why is $segment which counts discarded docs too used to add record ID168 # + In SplitTextFile::read(), why is $segment which counts discarded docs too used to add record ID 150 169 # rather than $count which only counts included docs? I am referring to code: 151 170 # $self->add_OID($doc_obj, $id, $segment); 171 # Because we get persistent URLs, regardless of whitelist urls file content! 152 172 # The way I've solved this is by setting the OIDtype importOption. Not sure if this is what was required. 153 173 # + 4. Keep a map of all URLs seen - whitelist URLs. … … 171 191 172 192 # CHECK: 173 # - title fallback is URL.193 # + title fallback is URL. Remove domain/all folder prefix (unless nothing remains), convert underscores and hyphens to spaces. 174 194 # + util::tidy_up_OID() prints warning. 
SiteID is foldername and OIDtype=dirname, so fully numeric 175 195 # siteID to OID conversion results in warning message that siteID is fully numeric and gets 'D' prefixed. 176 196 # Is this warning still necessary? 177 197 # - Ask about binmode usage (for debugging) in this file 198 178 199 179 200 # To get all the isMRI results, I ran Robo-3T against our mongodb as … … 479 500 # https://stackoverflow.com/questions/1348639/how-can-i-reinitialize-perls-stdin-stdout-stderr 480 501 # https://metacpan.org/pod/open::layers 502 # if() { # Google: "what is perl choosing to make the default char encoding for the file handle". Does it take a hint from somewhere, like env vars? Look for env vars 503 # # is there a perl env var to use, to check char enc? If set to utf-8, do this 481 504 #binmode(STDERR, ':utf8'); ## FOR DEBUGGING! To avoid "wide character in print" messages, but modifies globally for process! 505 #} 506 # Then move this if-block to BEGIN blocks of all perl process files. 482 507 483 508 #print STDERR "---------------\nDUMP.TXT\n---------\n", $$textref, "\n------------------------\n"; … … 609 634 } else { # if we have "null" as title metadata, set it to the record URL? 
610 635 my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL"); 611 my ($basicURL) = $srcURL =~ m@^https?://(?:www\.)?(.*)$@; # use basicURL for title instead of srcURL, else many docs get classified under "Htt" bucket for https612 636 if(defined $srcURL) { 613 print STDERR "@@@@ null/empty title to be replaced with ".$basicURL."\n" 614 if $self->{'verbosity'} > 3; 615 $title_meta = $basicURL; 637 # Use the web page name without file ext for doc title, if web page name present, 638 # else use basicURL for title for title instead of srcURL, 639 # else many docs get classified under "Htt" bucket for https 640 641 my ($basicURL) = $srcURL =~ m@^https?://(?:www\.)?(.*)$@; 642 my ($pageName) = $basicURL =~ m@([^/]+)$@; 643 if (!$pageName) { 644 $pageName = $basicURL; 645 } else { 646 # remove any file extension 647 $pageName =~ s@\.[^\.]+@@; 648 # replace _ and - with spaces 649 $pageName =~ s@[_\-]@ @g; 650 } 651 652 print STDERR "@@@@ null/empty title for $basicURL to be replaced with: $pageName\n" 653 if $self->{'verbosity'} > 3; 654 $title_meta = $pageName; 616 655 } 617 656 } 657 618 658 $doc_obj->add_utf8_metadata ($cursection, "Title", $title_meta); 619 659
Note: See TracChangeset for help on using the changeset viewer.