Timestamp:
2020-06-05T19:51:38+12:00
Author:
ak19
Message:

Have only been able to incorporate one of Dr Bainbridge's improvements so far: when there's no title meta, the first title fallback is no longer basicURL but the web page name without its file extension, e.g. domain.com/path/my-web-page.html will get the title 'my web page'. Only if that works out to be the empty string do we resort to basicURL again for the title.
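In isolation, the fallback described above amounts to something like the following minimal Perl sketch. It is illustrative only: the standalone script form and the helper name title_from_url are assumptions made here, not the plugin's API, and the real code in NutchTextDumpPlugin.pm works on each record's srcURL metadata as shown in the diff below.

    #!/usr/bin/perl
    use strict;
    use warnings;

    # Sketch of the title fallback: strip the scheme and "www.", take the last
    # path component, drop its file extension, and turn _ and - into spaces.
    # Falls back to the basic URL when no page name remains.
    sub title_from_url {
        my ($src_url) = @_;
        my ($basic_url) = $src_url =~ m@^https?://(?:www\.)?(.*)$@;
        $basic_url = $src_url unless defined $basic_url;

        my ($page_name) = $basic_url =~ m@([^/]+)$@;
        return $basic_url if !$page_name;      # e.g. "domain.com/" ends in a slash

        $page_name =~ s@\.[^\.]+$@@;           # remove the trailing file extension
        $page_name =~ s@[_\-]@ @g;             # underscores/hyphens become spaces
        return $page_name ne '' ? $page_name : $basic_url;
    }

    print title_from_url('https://www.domain.com/path/my-web-page.html'), "\n";   # my web page
    print title_from_url('https://domain.com/'), "\n";                            # domain.com/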

File:
1 edited

Legend:

Unmodified (line numbers in both the r34131 and r34137 columns)
Added (line number in the r34137 column only)
Removed (line number in the r34131 column only)
  • main/trunk/greenstone2/perllib/plugins/NutchTextDumpPlugin.pm

    r34131 r34137  
    143 143    no strict 'refs'; # allow filehandles to be variables and vice versa
    144 144    
        145    
        146    # Seems to be:
        147    #   nohup command
        148    # Not: nohup command > bla.txt 2>&1 &
        149    # nor even: nohup command &
        150    # Output goes to nohup.out (possibly both STDERR and STDOUT; do a quick test first and then delete nohup.out before re-running)
        151    # in the folder the command is run from.
        152    # Delete nohup.out when re-running the command.
        153    # Tripped up and unhappy only when commands require keyboard input at any stage.
        154    #
        155    #
    145 156    # TODO:
        157    # Use "od" to print out the byte values of the dump.txt file to check _rs_ and _csh_
        158    # Also google Nutch about what those fields mean.
        159    # od -a
        160    #   every byte as an ASCII character
        161    # od -ab
        162    #   ASCII and byte value:
        163    #   First comes the byte offset, then the ASCII characters (sp for space). The line underneath has the numeric byte values (octal) of the individual characters.
        164    #
    146 165    # + 1. Split each dump.txt file into its individual records as individual docs
    147 166    # + 2. Store the meta of each individual record/doc
    148 167    # ? 3. Name each doc siteID.docID, else HASH the internal text. See EmailPlugin?
    149        # - In SplitTextFile::read(), why is $segment, which counts discarded docs too, used to add the record ID
        168    # + In SplitTextFile::read(), why is $segment, which counts discarded docs too, used to add the record ID
    150 169    # rather than $count, which only counts included docs? I am referring to the code:
    151 170    #   $self->add_OID($doc_obj, $id, $segment);
        171    # Because we get persistent URLs, regardless of the whitelist urls file content!
    152 172    # The way I've solved this is by setting the OIDtype importOption. Not sure if this is what was required.
    153 173    # + 4. Keep a map of all URLs seen - whitelist URLs.
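The TODO above suggests running od over dump.txt to check the byte values around the _rs_ and _csh_ field markers. A rough equivalent can also be scripted in Perl; the sketch below is illustrative only (the dump.txt path is a placeholder) and is not part of the plugin:

    #!/usr/bin/perl
    # Minimal sketch: print each byte of a file as offset, character and numeric
    # value, roughly what "od -ab" shows ("sp" marks a space, as in od -a output).
    use strict;
    use warnings;

    my $file = shift @ARGV || 'dump.txt';    # placeholder path
    open(my $fh, '<:raw', $file) or die "Cannot open $file: $!";

    my $offset = 0;
    while (read($fh, my $byte, 1)) {
        my $val  = ord($byte);
        my $char = $val == 32                ? 'sp'
                 : ($val > 32 && $val < 127) ? $byte
                 :                             sprintf('\x%02x', $val);
        printf "%8d  %-4s  oct %03o  hex %02x\n", $offset, $char, $val, $val;
        $offset++;
    }
    close($fh);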
     
    171 191    
    172 192    # CHECK:
    173        # - title fallback is URL.
        193    # + title fallback is URL. Remove the domain and all folder prefixes (unless nothing remains), convert underscores and hyphens to spaces.
    174 194    # + util::tidy_up_OID() prints a warning. SiteID is the folder name and OIDtype=dirname, so the fully numeric
    175 195    # siteID to OID conversion results in a warning message that the siteID is fully numeric and gets a 'D' prefixed.
    176 196    # Is this warning still necessary?
    177 197    # - Ask about binmode usage (for debugging) in this file
        198    
    178 199    
    179 200    # To get all the isMRI results, I ran Robo-3T against our mongodb as
     
    479 500        # https://stackoverflow.com/questions/1348639/how-can-i-reinitialize-perls-stdin-stdout-stderr
    480 501        # https://metacpan.org/pod/open::layers
        502        # if() { # Google: "what is perl choosing to make the default char encoding for the file handle". Does it take a hint from somewhere, like env vars? Look for env vars.
        503        #     # Is there a perl env var to use to check the char encoding? If it is set to utf-8, do this:
    481 504        #binmode(STDERR, ':utf8'); ## FOR DEBUGGING! To avoid "wide character in print" messages, but modifies globally for the process!
        505        #}
        506        # Then move this if-block to the BEGIN blocks of all perl process files.
    482 507        
    483 508        #print STDERR "---------------\nDUMP.TXT\n---------\n", $$textref, "\n------------------------\n";
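The commented-out block above asks whether Perl takes a hint from an environment variable before picking an encoding layer, and proposes wrapping the binmode call in an if-block placed in BEGIN blocks. One way that could look is sketched below; consulting LC_ALL/LC_CTYPE/LANG as the hint is an assumption, not something the plugin currently does:

    # Hypothetical BEGIN block: only push the ':utf8' layer onto STDERR when the
    # locale environment variables suggest a UTF-8 terminal. The variables checked
    # here (LC_ALL, LC_CTYPE, LANG) are an assumption, not NutchTextDumpPlugin code.
    BEGIN {
        my $locale = $ENV{'LC_ALL'} || $ENV{'LC_CTYPE'} || $ENV{'LANG'} || '';
        if ($locale =~ /utf-?8/i) {
            # Avoids "Wide character in print" warnings during debugging, but note
            # this still changes STDERR globally for the whole process.
            binmode(STDERR, ':utf8');
        }
    }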
     
    609 634        } else { # if we have "null" as title metadata, set it to the record URL?
    610 635            my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL");
    611                my ($basicURL) = $srcURL =~ m@^https?://(?:www\.)?(.*)$@; # use basicURL for the title instead of srcURL, else many docs get classified under the "Htt" bucket for https
    612 636            if(defined $srcURL) {
    613                    print STDERR "@@@@ null/empty title to be replaced with ".$basicURL."\n"
    614                        if $self->{'verbosity'} > 3;
    615                    $title_meta = $basicURL;
        637                # Use the web page name without its file extension for the doc title, if a web page name is present,
        638                # else use basicURL for the title instead of srcURL,
        639                # else many docs get classified under the "Htt" bucket for https
        640    
        641                my ($basicURL) = $srcURL =~ m@^https?://(?:www\.)?(.*)$@;
        642                my ($pageName) = $basicURL =~ m@([^/]+)$@;
        643                if (!$pageName) {
        644                    $pageName = $basicURL;
        645                } else {
        646                    # remove any file extension
        647                    $pageName =~ s@\.[^\.]+@@;
        648                    # replace _ and - with spaces
        649                    $pageName =~ s@[_\-]@ @g;
        650                }
        651    
        652                print STDERR "@@@@ null/empty title for $basicURL to be replaced with: $pageName\n"
        653                    if $self->{'verbosity'} > 3;
        654                $title_meta = $pageName;
    616 655            }
    617 656        }
        657    
    618 658        $doc_obj->add_utf8_metadata ($cursection, "Title", $title_meta);
    619 659    