Changeset 34122 for main


Ignore:
Timestamp:
2020-05-26T01:13:33+12:00 (4 years ago)
Author:
ak19
Message:
  1. After some testing of building the complete commoncrawl collection, noticed warnings about windows-1252 set by nutch as charset encoding. Attempting to use latin-1 for windows-1252 encodings also in to_utf8(), to decode text in such cases. 2. And when encoding is utf8 (set by nutch as utf8), uncommenting the immediate return statement in the to_utf8() function to take away if(not utf8) conditions that call the function. 3. Tidying up. 4. Tabbed lines in emacs after earlier occasional work on Windows.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/NutchTextDumpPlugin.pm

    r34121 r34122  
    3535# both sorted by ex.srcURL, and an ex.Title classifier.
    3636# For the ex.srcDomain classifier, set removeprefix to: https?\:\/\/(www\.)?
     37# An alternative is to build that List classifier on ex.basicDomain instead of ex.srcDomain.
    3738# Finally, in the "display" format statement, add the following before the "wrappedSectionText" to
    3839# display the most relevant metadata of each record:
     
    7778  #   </div>
    7879
    79 # TODO: remove illegible values for metadata _rs_ and _csh_ in the example below before
     80# + DONE: remove illegible values for metadata _rs_ and _csh_ in the example below before
    8081# committing, in case their encoding affects the loading/reading in of this perl file.
    8182#
     
    102103    # metadata CharEncodingForConversion :  utf-8
    103104    # metadata OriginalCharEncoding :   utf-8
    104     # metadata _rs_ :     ï¿œ
    105     # metadata _csh_ :     
     105    # metadata _rs_ :
     106    # metadata _csh_ :
    106107    # text:start:
    107108    # Te Kura Kaupapa Māori o Te Whānau Tahi He mihi He mihi Te Kaupapa Ngā Tāngata Te Kākano Te Pihinga Te Tipuranga Te Puāwaitanga Te Tari Te Poari Matua Whakapā mai He mihi He mihi Te Kaupapa Ngā Tāngata Te Kākano Te Pihinga Te Tipuranga Te Puāwaitanga Te Tari Te Poari Matua Whakapā mai TE KURA KAUPAPA MĀORI O TE WHĀNAU TAHI He mihi Kei te mōteatea tonu nei ngā mahara ki te huhua kua mene atu ki te pō, te pōuriuri, te pōtangotango, te pō oti atu rā. Kua rite te wāhanga ki a rātou, hoki mai ki te ao tÅ«roa nei Ko Io Matua Kore te pÅ«taketanga, te pÅ«kaea, te pÅ«tātara ka rangona whānuitia e te ao. Ko tāna ko ngā whetÅ«, te marama, te haeata ki a Tamanui te rā. He atua i whakateretere mai ai ngā waka i tawhiti nui, i tawhiti roa, i tawhiti mai rā anō. Kei nga ihorei, kei ngā wahapÅ«, kei ngā pukumahara, kei ngā kanohi kai mātārae o tō tātou nei kura Aho Matua, Te Kura Kaupapa Māori o Te Whanau Tahi. Anei rā te maioha ki a koutou katoa e pÅ«mau tonu ki ngā wawata me ngā whakakitenga i whakatakotoria e ngā poupou i te wā i a rātou. Ka whakanuia hoki te toru tekau tau o tēnei kura mai i tōna orokohanga timatanga tae noa ki tēnei wā Ka pÅ«mau tōnu mātou ki te whakatauki o te kura e mea ana “Poipoia ō tātou nei pÅ«manawa” Takiritia tonutia te ra ki runga i Te Kura Kaupapa Maori o Te Whanau Tahi . Back to Top " Poipoia ō tātou nei pÅ«manawa -  Making our potential a reality "   ©  Te Kura Kaupapa Māori o Te Whānau Tahi, 2019  Cart ( 0 )
     
    157158# - encoding = utf-8, changed to "utf8" as required by copied to_utf8(str) method. Why does it not convert
    158159# the string parameter but fails in decode() step? Is it because the string is already in UTF8?
     160# - Problem converting text with encoding in full set of nutch dump.txt when there encoding is windows-1252.
     161# - TODOs
     162#
    159163# - Should I add metadata as "ex."+meta or as meta? e.g. ex.srcURL or srcURL?
    160164# - Want to read in keep_urls_file, maintaining a hashmap of its URLs, only on import, isn't that correct?
    161165# Then how can I initialise this only once and only during import? constructor and init() methods are called during buildcol too.
    162166# For now, I've done it in can_proc_this_file() but there must be a more appropriate place and correct way to do this?
    163 # - TODOs
    164167# - why can't I do doc_obj->get_meta_element($section, "ex.srcURL") but have to pass "srcURL" and 1 to ignore
    165168# namespace?
     
    177180# Is this warning still necessary?
    178181
    179 # methods defined in superclasses that have the same signature take
    180 # precedence in the order given in the ISA list. We want MetaPlugins to
    181 # call MetadataRead's can_process_this_file_for_metadata(), rather than
    182 # calling BaseImporter's version of the same method, so list inherited
    183 # superclasses in this order.
     182
    184183sub BEGIN {
    185184    @NutchTextDumpPlugin::ISA = ('SplitTextFile');
     
    216215    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    217216    push(@$pluginlist, $class);
    218 
     217   
    219218    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    220219    push(@{$hashArgOptLists->{"OptList"}},$options);
    221220 
    222221    my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
    223 
     222   
    224223    if ($self->{'info_only'}) {
    225224    # don't worry about the options
    226225    return bless $self, $class;
    227226    }   
    228 
     227   
    229228    $self->{'keep_urls_processed'} = 0;
    230229    $self->{'keep_urls'} = undef;   
    231     $self->{'type'} = ""; # TODO: value can be 'ascii' or other. Used in MARCPlugin.pm. Keep this field here?
    232230   
    233231    #return bless $self, $class;
    234232    $self = bless $self, $class;
    235    
     233   
    236234    # Can only call any methods on $self AFTER the bless operation above
    237235    #$self->setup_keep_urls(); # want to set up the keep_urls hashmap only once, so have to do it here (init is also called by buildcol)
    238    
     236   
    239237    return $self;
    240238}
     
    282280    my $self = shift (@_);
    283281   
    284     my $verbosity = $self->{'verbosity'};
    285     my $outhandle = $self->{'outhandle'};
    286     my $failhandle = $self->{'failhandle'};
    287 
    288     $self->{'keep_urls_processed'} = 1; # flag to track whether this method has been called already during import
    289 
    290     #print $outhandle "@@@@ In NutchTextDumpPlugin::setup_keep_urls()\n";
     282    my $verbosity = $self->{'verbosity'};
     283    my $outhandle = $self->{'outhandle'};
     284    my $failhandle = $self->{'failhandle'};
     285   
     286    $self->{'keep_urls_processed'} = 1; # flag to track whether this method has been called already during import
     287   
     288    #print $outhandle "@@@@ In NutchTextDumpPlugin::setup_keep_urls()\n";
    291289   
    292     if(!$self->{'keep_urls_file'}) {
    293         my $msg = "NutchTextDumpPlugin INFO: No urls file provided.\n" .
    294             "    No records will be filtered.\n";
    295         print $outhandle $msg if ($verbosity > 2);
    296        
    297         return;
    298     }
    299    
     290    if(!$self->{'keep_urls_file'}) {
     291    my $msg = "NutchTextDumpPlugin INFO: No urls file provided.\n" .
     292        "    No records will be filtered.\n";
     293    print $outhandle $msg if ($verbosity > 2);
     294       
     295    return;
     296    }
     297   
    300298    # read in the keep urls files
    301299    my $keep_urls_file = &util::locate_config_file($self->{'keep_urls_file'});
    302300    if (!defined $keep_urls_file)
    303301    {
    304         my $msg = "NutchTextDumpPlugin INFO: Can't locate urls file $keep_urls_file.\n" .
    305             "    No records will be filtered.\n";
    306 
    307         print $outhandle $msg;
    308        
    309         $self->{'keep_urls'} = undef;
    310         # TODO: Not a fatal error if $keep_urls_file can't be found: it just means all records
    311         # in dump.txt will be processed?
     302    my $msg = "NutchTextDumpPlugin INFO: Can't locate urls file $keep_urls_file.\n" .
     303        "    No records will be filtered.\n";
     304   
     305    print $outhandle $msg;
     306   
     307    $self->{'keep_urls'} = undef;
     308    # TODO: Not a fatal error if $keep_urls_file can't be found: it just means all records
     309    # in dump.txt will be processed?
    312310    }
    313311    else { 
    314         #$self->{'keep_urls'} = $self->parse_keep_urls_file($keep_urls_file, $outhandle);
    315         #$self->{'keep_urls'} = {};
    316         $self->parse_keep_urls_file($keep_urls_file, $outhandle, $failhandle);
    317     }
    318 
     312    #$self->{'keep_urls'} = $self->parse_keep_urls_file($keep_urls_file, $outhandle);
     313    #$self->{'keep_urls'} = {};
     314    $self->parse_keep_urls_file($keep_urls_file, $outhandle, $failhandle);
     315    }
     316   
    319317    #if(defined $self->{'keep_urls'}) {
    320318    #   print STDERR "@@@@ keep_urls hash map contains:\n";
    321319    #   map { print STDERR $_."=>".$self->{'keep_urls'}->{$_}."\n"; } keys %{$self->{'keep_urls'}};
    322320    #}
    323 
     321   
    324322}
    325323
     
    334332    my $self = shift(@_);
    335333    my ($filename) = @_;
    336     my $can_process_return_val = $self->SUPER::can_process_this_file(@_);
    337    
    338     # We want to load in the keep_urls_file and create the keep_urls hashmap only once, during import
    339     # Because the keep urls file can be large and it and the hashmap serve no purpose during buildcol.pl.
    340     # Check whether we've already processed the file/built the hashmap, as we don't want to do this
    341     # more than 1 time even within just the import cycle.
    342     if($can_process_return_val && !$self->{'keep_urls_processed'}) { #!defined $self->{'keep_urls'}) {
    343         $self->setup_keep_urls();
    344     }
    345    
    346     return $can_process_return_val;
     334    my $can_process_return_val = $self->SUPER::can_process_this_file(@_);
     335   
     336    # We want to load in the keep_urls_file and create the keep_urls hashmap only once, during import
     337    # Because the keep urls file can be large and it and the hashmap serve no purpose during buildcol.pl.
     338    # Check whether we've already processed the file/built the hashmap, as we don't want to do this
     339    # more than 1 time even within just the import cycle.
     340    if($can_process_return_val && !$self->{'keep_urls_processed'}) { #!defined $self->{'keep_urls'}) {
     341    $self->setup_keep_urls();
     342    }
     343   
     344    return $can_process_return_val;
    347345   
    348346}
     
    350348sub parse_keep_urls_file {
    351349    my $self = shift (@_);
    352     my ($urls_file, $outhandle, $failhandle) = @_;
     350    my ($urls_file, $outhandle, $failhandle) = @_;
     351   
     352    # https://www.caveofprogramming.com/perl-tutorial/perl-hashes-a-guide-to-associative-arrays-in-perl.html
     353    # https://stackoverflow.com/questions/1817394/whats-the-difference-between-a-hash-and-hash-reference-in-perl
     354    $self->{'keep_urls'} = {}; # hash reference init to {}
    353355   
    354     # https://www.caveofprogramming.com/perl-tutorial/perl-hashes-a-guide-to-associative-arrays-in-perl.html
    355     # https://stackoverflow.com/questions/1817394/whats-the-difference-between-a-hash-and-hash-reference-in-perl
    356     #my %urls_map = (); # hash init to ()
    357     $self->{'keep_urls'} = {}; # hash reference init to {}
    358    
    359     # What if it is a very long file of URLs? Need to read a line at a time!
    360     #my $contents = &FileUtils::readUTF8File($urls_file); # could just call $self->read_file() inherited from SplitTextFile's parent ReadTextFile
    361     #my @lines = split(/(?:\r?\n)+/, $$textref);
    362    
    363     # Open the file in UTF-8 mode https://stackoverflow.com/questions/2220717/perl-read-file-with-encoding-method
    364     # and read in line by line into map
    365     my $fh;
    366     if (open($fh,'<:encoding(UTF-8)', $urls_file)) {
    367         while (defined (my $line = <$fh>)) {           
    368             $line = &util::trim($line); #$line =~ s/^\s+|\s+$//g; # trim whitespace
    369             if($line =~ m@^https?://@) { # add only URLs
    370                 #%urls_map{$line} = 1; # add the url to our perl hash
    371                 $self->{'keep_urls'}->{$line} = 1;
    372             }
    373         }
    374         close $fh; 
    375     } else {
    376         my $msg = "NutchTextDumpPlugin ERROR: Unable to open file keep_urls_file: \"" .
    377             $self->{'keep_urls_file'} . "\".\n " .
    378             "    No records will be filtered.\n";
    379         print $outhandle $msg;
    380         print $failhandle $msg;
    381         # Not fatal. TODO: should it be fatal when it can still process all URLs just because
    382         # it can't find the specified keep-urls.txt file?
     356    # What if it is a very long file of URLs? Need to read a line at a time!
     357    #my $contents = &FileUtils::readUTF8File($urls_file); # could just call $self->read_file() inherited from SplitTextFile's parent ReadTextFile
     358    #my @lines = split(/(?:\r?\n)+/, $$textref);
     359   
     360    # Open the file in UTF-8 mode https://stackoverflow.com/questions/2220717/perl-read-file-with-encoding-method
     361    # and read in line by line into map
     362    my $fh;
     363    if (open($fh,'<:encoding(UTF-8)', $urls_file)) {
     364    while (defined (my $line = <$fh>)) {           
     365        $line = &util::trim($line); #$line =~ s/^\s+|\s+$//g; # trim whitespace
     366        if($line =~ m@^https?://@) { # add only URLs       
     367        $self->{'keep_urls'}->{$line} = 1; # add the url to our perl hash
     368        }
    383369    }
    384    
    385     # if keep_urls hash is empty, ensure it is undefined from this point onward
    386     # https://stackoverflow.com/questions/9444915/how-to-check-if-a-hash-is-empty-in-perl
    387     my %urls_map = $self->{'keep_urls'};
    388     if(!keys %urls_map) {
    389         $self->{'keep_urls'} = undef;
    390     }
    391    
    392     #return %urls_map;
    393 }
    394 
     370    close $fh; 
     371    } else {
     372    my $msg = "NutchTextDumpPlugin ERROR: Unable to open file keep_urls_file: \"" .
     373        $self->{'keep_urls_file'} . "\".\n " .
     374        "    No records will be filtered.\n";
     375    print $outhandle $msg;
     376    print $failhandle $msg;
     377    # Not fatal. TODO: should it be fatal when it can still process all URLs just because
     378    # it can't find the specified keep-urls.txt file?
     379    }
     380   
     381    # if keep_urls hash is empty, ensure it is undefined from this point onward
     382    # https://stackoverflow.com/questions/9444915/how-to-check-if-a-hash-is-empty-in-perl
     383    my %urls_map = $self->{'keep_urls'};
     384    if(!keys %urls_map) {
     385    $self->{'keep_urls'} = undef;
     386    }
     387   
     388}
     389
     390# Accept "dump.txt" files (which are in numeric siteID folders),
     391# and txt files with numeric siteID, e.g. "01441.txt"
     392# if I preprocessed dump.txt files by renaming them this way.
    395393sub get_default_process_exp {
    396394    my $self = shift (@_);
    397 
     395   
    398396    return q^(?i)((dump|\d+)\.txt)$^;
    399397}
     
    401399
    402400sub get_default_split_exp {
    403 
     401   
    404402    # prev line is either a new line or start of dump.txt
    405403    # current line should start with url protocol and contain " key: .... http(s)/"
    406404    # \r\n for msdos eol, \n for unix
    407405   
    408     #return q^($|\r?\n)https?://\w+\s+key:\s+\w+https?/^;
    409     #return q^\r?\n(text:end:|metadata _csh_ :)\r?\n\r?\n^;
    410    
    411     #return q^(\r?\n)*https?://\w+\s+key:\s+\w+https?/\s*\r?\n^;
    412    
    413 
    414     #return q^(?:$|\r?\n\r?\n)(https?://.+?\skey:\s+.*?https?/)^;
    415 
    416 
    417     #return q^($|\r?\n\r?\n)https?://^;
    418    
    419     #return q^\r?\n(text:end:)\r?\n\r?\n^;
    420    
    421     # return q^\r?\n\s*\r?\n|\[\w+\]Record type: USmarc^;
    422 
    423    
    424     # split by default throws away delimiter
     406    # The regex return value of this method is passed into a call to perl split.
     407    # Perl's split(), by default throws away delimiter
    425408    # Any capturing group that makes up or is part of the delimiter becomes a separate element returned by split
    426409    # We want to throw away the empty newlines preceding the first line of a record "https? .... key: https?/"
     
    429412    #    https://stackoverflow.com/questions/14907772/split-but-keep-delimiter
    430413    #   - To skip the unwanted empty lines preceding the first line of a record use ?: in front of its capture group
    431     #    to discard that group:
     414    #    to discard that group:
    432415    #    https://stackoverflow.com/questions/3512471/what-is-a-non-capturing-group-in-regular-expressions
    433416    #   - Next use a positive look-ahead (?= in front of capture group, vs ?! for negative look ahead)
    434     #    to match but not capture the first line of a record (so the look-ahead matched is retained as the
    435     #    first line of the next record):
     417    #    to match but not capture the first line of a record (so the look-ahead matched is retained as the
     418    #    first line of the next record):
    436419    #    https://stackoverflow.com/questions/14907772/split-but-keep-delimiter
    437420    #    and http://www.regular-expressions.info/lookaround.html
     
    439422    #    https://stackoverflow.com/questions/11898998/how-can-i-write-a-regex-which-matches-non-greedy
    440423    return q^(?:$|\r?\n\r?\n)(?=https?://.+?\skey:\s+.*?https?/)^;
    441 
    442 }
    443 
    444 # TODO: COPIED METHOD STRAIGHT FROM MarcPlugin.pm - move to a utility perl file?
     424   
     425}
     426
     427# TODO: Copied method from MARCPlugin.pm and uncommented return statement when encoding = utf8
     428# Move to a utility perl file, since code is mostly shared?
    445429# The bulk of this function is based on read_line in multiread.pm
    446430# Unable to use read_line original because it expects to get its input
    447431# from a file.  Here the line to be converted is passed in as a string
     432
     433# TODO:
     434# Is this function even applicable to NutchTextDumpPlugin?
     435# I get errors in this method when encoding is utf-8 in the decode step.
     436# I get warnings/errors somewhere in this file (maybe also at decode) when encoding is windows-1252.
    448437
    449438sub to_utf8
     
    454443    if ($encoding eq "utf8") {
    455444    # nothing needs to be done
    456     #return $line;
    457     } elsif ($encoding eq "iso_8859_1") {
     445    return $line;
     446    } elsif ($encoding eq "iso_8859_1" || $encoding eq "windows-1252") { # TODO: do this also for windows-1252?
    458447    # we'll use ascii2utf8() for this as it's faster than going
    459448    # through convert2unicode()
     
    462451    } else {
    463452
    464     # everything else uses unicode::convert2unicode
    465     $line = &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
     453    # everything else uses unicode::convert2unicode
     454    $line = &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
    466455    }
    467456    # At this point $line is a binary byte string
     
    469458    # Unicode aware pattern matching can be used.
    470459    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
    471 
     460   
    472461    return decode ("utf8", $line);
    473462}
     
    480469    my $self = shift (@_);
    481470    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    482 
     471   
    483472    my $outhandle = $self->{'outhandle'};
    484473    my $filename = &util::filename_cat($base_dir, $file);
    485474
    486475    my $cursection = $doc_obj->get_top_section();
    487    
    488    
     476    
     477   
    489478    #print STDERR "---------------\nDUMP.TXT\n---------\n", $$textref, "\n------------------------\n";
    490479   
    491480   
    492     # (1) parse out the metadata of this record
    493     my $metaname;
    494     my $encoding;
    495     my $title_meta;
    496    
    497     my $line_index = 0;
    498     my $text_start_index = -1;
    499     my @lines = split(/(?:\r?\n)+/, $$textref);
    500    
    501     foreach my $line (@lines) {
    502         # first line is special and contains the URL (no metaname)
    503         # and the inverted URL labelled with metaname "key"
    504         if($line =~ m/^https?/ && $line =~ m/\s+key:\s+/) {
    505             my @vals = split(/key:/, $line);
    506             my $url = $vals[0];
    507             my $key = $vals[1];
    508             # trim whitespace https://perlmaven.com/trim
    509             $url = &util::trim($url); #=~ s/^\s+|\s+$//g;
    510             $key = &util::trim($key); #=~ s/^\s+|\s+$//g;
    511            
    512             # if we have a keep_urls hash, then only process records of whitelisted urls
    513             if(defined $self->{'keep_urls'} && !$self->{'keep_urls'}->{$url}) {
    514                 # URL not whitelisted, so stop processing this record
    515                 print STDERR "@@@@@@ INFO NutchTextDumpPlugin::process(): discarding record for URL not whitelisted: $url\n"
    516                     if $self->{'verbosity'} > 3;
    517                 return 0;
    518             } else {
    519                 print STDERR "@@@@@@ INFO NutchTextDumpPlugin::process(): processing record of whitelisted URL $url...\n"
    520                     if $self->{'verbosity'} > 3;
    521             }
    522             $doc_obj->add_utf8_metadata ($cursection, "ex.srcURL", $url);
    523             $doc_obj->add_utf8_metadata ($cursection, "ex.key", $key);
    524 
    525             # # let's also set the domain from the URL, as that will make a
    526             # # more informative bookshelf label than siteID
    527             # my $domain = $url;
    528             # # remove protocol:// and everything after and including subsequent slash
    529             # $domain =~ s@^https?://([^/]+).*@$1@;
    530             # #$domain =~ s@^https?://@@; # remove protocol
    531             # #$domain =~ s@/.*$@@; # now remove everything after first slash
    532             # my $protocol = $url;# =~ s@(^https?).*$@@;
    533             # $protocol =~ s@(^https?).*$@$1@;
    534             # $domain = $protocol."://".$domain;
    535             # #$domain =~ s@[\.\-]@@g;
    536             # #$domain = "pinky";
    537             # $doc_obj->add_utf8_metadata ($cursection, "ex.srcDomain", $domain);
    538 
    539 
    540             # let's also set the domain from the URL, as that will make a
    541             # more informative bookshelf label than siteID
    542             # For complete domain, keep protocol:// and every non-slash after,
    543             # without requiring presence of subsequent slash
    544             # https://stackoverflow.com/questions/3652527/match-regex-and-assign-results-in-single-line-of-code
    545             # Can clean up protocol and www. in bookshelf's remove_prefix option
    546 
    547             my ($domain, $basicDomain) = $url =~ m@(^https?://(?:www\.)?([^/]+)).*@;
    548 
    549             # For domain, the following removes protocol:// and
    550             # everything after and including subsequent slash, without requiring subsequent slash
    551             #my ($domain, $protocol, $basicdomain) = $url =~ m@((^https?)://([^/]+)).*@; # Works
    552             #my ($protocol, $basicdomain) = $url =~ m@(^https?)://([^/]+).*@; # Should work
    553             #my $domain = $protocol."://".$basicdomain;
    554             $doc_obj->add_utf8_metadata ($cursection, "ex.srcDomain", $domain);
    555             $doc_obj->add_utf8_metadata ($cursection, "ex.basicDomain", $basicDomain);
    556            
     481    # (1) parse out the metadata of this record
     482    my $metaname;
     483    my $encoding;
     484    my $title_meta;
     485   
     486    my $line_index = 0;
     487    my $text_start_index = -1;
     488    my @lines = split(/(?:\r?\n)+/, $$textref);
     489   
     490    foreach my $line (@lines) {
     491    # first line is special and contains the URL (no metaname)
     492    # and the inverted URL labelled with metaname "key"
     493    if($line =~ m/^https?/ && $line =~ m/\s+key:\s+/) {
     494        my @vals = split(/key:/, $line);
     495        # get url and key, and trim whitespace simultaneously
     496        my $url = &util::trim($vals[0]);
     497        my $key = &util::trim($vals[1]);
     498       
     499        # if we have a keep_urls hash, then only process records of whitelisted urls
     500        if(defined $self->{'keep_urls'} && !$self->{'keep_urls'}->{$url}) {
     501        # URL not whitelisted, so stop processing this record
     502        print STDERR "@@@@@@ INFO NutchTextDumpPlugin::process(): discarding record for URL not whitelisted: $url\n"
     503            if $self->{'verbosity'} > 3;
     504        return 0;
     505        } else {
     506        print STDERR "@@@@@@ INFO NutchTextDumpPlugin::process(): processing record of whitelisted URL $url...\n"
     507            if $self->{'verbosity'} > 3;
     508        }
     509        $doc_obj->add_utf8_metadata ($cursection, "ex.srcURL", $url);
     510        $doc_obj->add_utf8_metadata ($cursection, "ex.key", $key);
     511       
     512       
     513        # let's also set the domain from the URL, as that will make a
     514        # more informative bookshelf label than siteID
     515        # For complete domain, keep protocol:// and every non-slash after.
     516        # (This avoids requiring presence of subsequent slash)
     517        # https://stackoverflow.com/questions/3652527/match-regex-and-assign-results-in-single-line-of-code
     518        # Can clean up protocol and www. in List classifier's bookshelf's remove_prefix option
     519        # or can build classifier on basicDomain instead.
     520       
     521        my ($domain, $basicDomain) = $url =~ m@(^https?://(?:www\.)?([^/]+)).*@;       
     522        #my ($domain, $protocol, $basicdomain) = $url =~ m@((^https?)://([^/]+)).*@; # Works
     523        $doc_obj->add_utf8_metadata ($cursection, "ex.srcDomain", $domain);
     524        $doc_obj->add_utf8_metadata ($cursection, "ex.basicDomain", $basicDomain);
     525       
     526    }
     527    # check for full text
     528    elsif ($line =~ m/text:start:/) {
     529        $text_start_index = $line_index;
     530        last; # if we've reached the full text portion, we're past the metadata portion of this record
     531    }
     532    elsif($line =~ m/^[^:]+:.+$/) { # look for meta #elsif($line =~ m/^[^:]+:[^:]+$/) { # won't allow protocol://url in metavalue
     533        my @metakeyvalues = split(/:/, $line); # split on first :
     534       
     535        my $metaname = shift(@metakeyvalues);
     536        my $metavalue = join("", @metakeyvalues);
     537       
     538        # skip "metadata _rs_" and "metadata _csh_" as these contain illegible characters for values
     539        if($metaname !~ m/metadata\s+_(rs|csh)_/) {
     540       
     541        # trim whitespace
     542        $metaname = &util::trim($metaname);
     543        $metavalue = &util::trim($metavalue);
     544       
     545        if($metaname eq "title") { # TODO: what to do about "title: null" cases?
     546            ##print STDERR "@@@@ Found title: $metavalue\n";
     547            #$metaname = "Title"; # will set "title" as "Title" metadata instead
     548            # TODO: treat title metadata specially by using character encoding to store correctly?
     549           
     550            # Won't add Title metadata to docObj until after all meta is processed,
     551            # when we'll know encoding and can process title meta
     552            $title_meta = $metavalue;
     553            $metavalue = ""; # will force ex.Title metadata to be added AFTER for loop
    557554        }
    558         # check for full text
    559         elsif ($line =~ m/text:start:/) {
    560             $text_start_index = $line_index;
    561             last; # if we've reached the full text portion, we're past the metadata portion of this record
     555        elsif($metaname =~ m/CharEncodingForConversion/) { # TODO: or look for "OriginalCharEncoding"?
     556            ##print STDERR "@@@@ Found encoding: $metavalue\n";
     557            $encoding = $metavalue; # TODO: should we use this to interpret the text and title in the correct encoding and convert to utf-8?
     558           
     559            if($encoding eq "utf-8") {
     560            $encoding = "utf8"; # method to_utf8() recognises "utf8" not "utf-8"
     561            } else {
     562            print STDERR "@@@@@@ WARNING NutchTextDumpPlugin::process(): Record's Nutch-assigned CharEncodingForConversion was not utf-8: $encoding\n";
     563            }
     564           
     565           
    562566        }
    563         elsif($line =~ m/^[^:]+:.+$/) { # look for meta #elsif($line =~ m/^[^:]+:[^:]+$/) { # won't allow protocol://url in metavalue
    564             my @metakeyvalues = split(/:/, $line);
    565             #my $metaname = $metakeyvalues[0];
    566             #my $metavalue = $metakeyvalues[1];
    567             my $metaname = shift(@metakeyvalues);
    568             my $metavalue = join("", @metakeyvalues);
    569            
    570             # skip "metadata _rs_" and "metadata _csh_" as these contain illegible characters for values
    571             if($metaname !~ m/metadata\s+_(rs|csh)_/) {
    572            
    573                 # trim whitespace
    574                 $metaname = &util::trim($metaname); #=~ s/^\s+|\s+$//g;
    575                 $metavalue = &util::trim($metavalue); #=~ s/^\s+|\s+$//g;
    576                
    577                 if($metaname eq "title") { # TODO: what to do about "title: null" cases?
    578                     ##print STDERR "@@@@ Found title: $metavalue\n";
    579                     #$metaname = "Title"; # set this as ex.Title metadata
    580                     # TODO: treat title metadata specially by using character encoding to store correctly?
    581                    
    582                     # won't add Title metadata to docObj until after all meta is processed, when we'll know encoding and can process title meta
    583                     $title_meta = $metavalue;
    584                     $metavalue = "";
    585                 }
    586                 elsif($metaname =~ m/CharEncodingForConversion/) { # TODO: or look for "OriginalCharEncoding"?
    587                     ##print STDERR "@@@@ Found encoding: $metavalue\n";
    588                     $encoding = $metavalue; # TODO: should we use this to interpret the text and title in the correct encoding and convert to utf-8?
    589                    
    590                     if($encoding eq "utf-8") {
    591                         $encoding = "utf8"; # method to_utf8() recognises "utf8" not "utf-8"
    592                     } else {
    593                         print STDERR "@@@@@@ WARNING NutchTextDumpPlugin::process(): Record's Nutch-assigned CharEncodingForConversion was not utf-8: $encoding\n";
    594                     }
    595                    
    596                    
    597                 }
    598                
    599                 # move occurrences of "marker " or "metadata " strings at start of metaname to end
    600                 #$metaname =~ s/^(marker|metadata)\s+(.*)$/$2$1/;
    601                 # remove "marker " or "metadata " strings from start of metaname
    602                 $metaname =~ s/^(marker|metadata)\s+//;
    603                 # remove underscores and all remaining spaces in metaname
    604                 $metaname =~ s/[ _]//g;         
    605                
    606                 # add meta to docObject if both metaname and metavalue are non-empty strings
    607                 if($metaname ne "" && $metavalue ne "") { # && $metaname ne "rs" && $metaname ne "csh") {
    608                     $doc_obj->add_utf8_metadata ($cursection, "ex.".$metaname, $metavalue);
    609                     #print STDERR "Added meta |$metaname| = |$metavalue|\n"; #if $metaname =~ m/ProtocolStatus/i;
    610                 }
    611                
    612             }
    613         } elsif ($line !~ m/^\s*$/) { # Not expecting any other type of non-empty line (or even empty lines)
    614             print STDERR "NutchTextDump line not recognised as URL meta, other metadata or text content:\n\t$line\n";
     567       
     568        # move occurrences of "marker " or "metadata " strings at start of metaname to end
     569        #$metaname =~ s/^(marker|metadata)\s+(.*)$/$2$1/;
     570        # remove "marker " or "metadata " strings from start of metaname
     571        $metaname =~ s/^(marker|metadata)\s+//;
     572        # remove underscores and all remaining spaces in metaname
     573        $metaname =~ s/[ _]//g;         
     574       
     575        # add meta to docObject if both metaname and metavalue are non-empty strings
     576        if($metaname ne "" && $metavalue ne "") { # && $metaname ne "rs" && $metaname ne "csh") {
     577            $doc_obj->add_utf8_metadata ($cursection, "ex.".$metaname, $metavalue);
     578            #print STDERR "Added meta |$metaname| = |$metavalue|\n"; #if $metaname =~ m/ProtocolStatus/i;
    615579        }
    616580       
    617         $line_index++;
     581        }
     582    } elsif ($line !~ m/^\s*$/) { # Not expecting any other type of non-empty line (or even empty lines)
     583        print STDERR "NutchTextDump line not recognised as URL meta, other metadata or text content:\n\t$line\n";
    618584    }
    619585   
    620    
     586    $line_index++;
     587    }
     588   
     589   
    621590    # Add fileFormat as the metadata
    622591    $doc_obj->add_metadata($cursection, "FileFormat", "NutchDumpTxt");
    623    
    624     # Correct title metadata using encoding, if we have $encoding at last
    625     # $title_meta = $self->to_utf8($encoding, $title_meta) if $encoding;
    626     # https://stackoverflow.com/questions/12994100/perl-encode-pm-cannot-decode-string-with-wide-character
    627     # Error message: "Perl Encode.pm cannot decode string with wide character"
    628     # "That error message is saying that you have passed in a string that has already been decoded
    629     # (and contains characters above codepoint 255). You can't decode it again."
    630     if($title_meta && $title_meta ne "" && $title_meta ne "null") {
    631         $title_meta = $self->to_utf8($encoding, $title_meta) if ($encoding && $encoding ne "utf8");
    632     } else { # if we have "null" as title metadata, set it to the record URL?
    633         #my $srcURLs = $doc_obj->get_metadata($cursection, "ex.srcURL");
    634         #print STDERR "@@@@ null title to be replaced with ".$srcURLs->[0]."\n";
    635         #$title_meta = $srcURLs->[0] if (scalar @$srcURLs > 0);
    636         my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL", 1); # TODO: why does ex.srcURL not work, nor srcURL without 3rd param
    637         if(defined $srcURL) {
    638             print STDERR "@@@@ null/empty title to be replaced with ".$srcURL."\n"
    639                 if $self->{'verbosity'} > 3;
    640             $title_meta = $srcURL;
    641         }
     592   
     593    # Correct title metadata using encoding, if we have $encoding at last
     594    # $title_meta = $self->to_utf8($encoding, $title_meta) if $encoding;
     595    # https://stackoverflow.com/questions/12994100/perl-encode-pm-cannot-decode-string-with-wide-character
     596    # Error message: "Perl Encode.pm cannot decode string with wide character"
     597    # "That error message is saying that you have passed in a string that has already been decoded
     598    # (and contains characters above codepoint 255). You can't decode it again."
     599    if($title_meta && $title_meta ne "" && $title_meta ne "null") {
     600    $title_meta = $self->to_utf8($encoding, $title_meta) if ($encoding);
     601    } else { # if we have "null" as title metadata, set it to the record URL?
     602    #my $srcURLs = $doc_obj->get_metadata($cursection, "ex.srcURL");
     603    #print STDERR "@@@@ null title to be replaced with ".$srcURLs->[0]."\n";
     604    #$title_meta = $srcURLs->[0] if (scalar @$srcURLs > 0);
     605    my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL", 1); # TODO: why does ex.srcURL not work, nor srcURL without 3rd param
     606    if(defined $srcURL) {
     607        print STDERR "@@@@ null/empty title to be replaced with ".$srcURL."\n"
     608        if $self->{'verbosity'} > 3;
     609        $title_meta = $srcURL;
    642610    }
    643     $doc_obj->add_utf8_metadata ($cursection, "Title", $title_meta);
    644    
    645 
    646 
     611    }
     612    $doc_obj->add_utf8_metadata ($cursection, "Title", $title_meta);
     613   
     614   
     615    # When importOption OIDtype = dirname, the base_OID will be that dirname
     616    # which was crafted to be the siteID. However, because our siteID is all numeric,
     617    # a D gets prepended to create baseOID. Remove the starting 'D' to get actual siteID.
    647618    my $siteID = $self->get_base_OID($doc_obj);
    648619    #print STDERR "BASE OID: " . $self->get_base_OID($doc_obj) . "\n";
    649     # remove the 'D' that was inserted by a superclass in front of the all-numeric siteID to become baseOID:
    650620    $siteID =~ s/^D//;
    651621    $doc_obj->add_utf8_metadata ($cursection, "ex.siteID", $siteID);
    652 
    653    
    654     # (2) parse out text of this record
    655     # if($text_start_index != -1 && pop(@lines) =~ m/text:end:/) { # we only have text content if there were "text:start:" and "text:end:" markers.
    656                                                                 # # TODO: are we guaranteed popped line is text:end: and not empty/newline?
    657         # @lines = splice(@lines,0,$text_start_index+1); # just keep every line AFTER text:start:, have already removed (popped) "text:end:"
    658        
    659         # # glue together remaining lines, if there are any, into textref
    660         # # https://stackoverflow.com/questions/7406807/find-size-of-an-array-in-perl
    661         # if(scalar (@lines) > 0) {
    662             # # TODO: do anything with $encoding to convert line to utf-8?
    663             # foreach my $line (@lines) {               
    664                 # $line = $self->to_utf8($encoding, $line) if $encoding; #if $encoding ne "utf-8";
    665                 # $$textref .= $line."\n";
    666             # }
    667         # }
    668         # $$textref = "<pre>\n".$$textref."</pre>";
    669     # } else {
    670         # print STDERR "WARNING: NutchTextDumpPlugin::process: had found a text start marker but not text end marker.\n");
    671         # $$textref = "<pre></pre>";
    672     # }
     622   
     623   
     624    # (2) parse out text of this record
     625    # if($text_start_index != -1 && pop(@lines) =~ m/text:end:/) { # we only have text content if there were "text:start:" and "text:end:" markers.
     626    #   # TODO: are we guaranteed popped line is text:end: and not empty/newline?
     627    #   @lines = splice(@lines,0,$text_start_index+1); # just keep every line AFTER text:start:, have already removed (popped) "text:end:"
    673628   
    674     my $no_text = 1;
    675     if($text_start_index != -1) { # had found a "text:start:" marker, so we should have text content for this record
    676         if($$textref =~ m/text:start:\r?\n(.*?)\r?\ntext:end:/) {
    677             $$textref = $1;
    678             if($$textref !~ m/^\s*$/) {
    679                 $$textref = $self->to_utf8($encoding, $$textref) if ($encoding && $encoding ne "utf8");
    680                 $$textref = "<pre>\n".$$textref."\n</pre>";
    681                 $no_text = 0;
    682             }
    683         }
     629    #   # glue together remaining lines, if there are any, into textref
     630    #   # https://stackoverflow.com/questions/7406807/find-size-of-an-array-in-perl
     631    #   if(scalar (@lines) > 0) {
     632    #       # TODO: do anything with $encoding to convert line to utf-8?
     633    #       foreach my $line (@lines) {             
     634    #       $line = $self->to_utf8($encoding, $line) if $encoding; #if $encoding ne "utf-8";
     635    #       $$textref .= $line."\n";
     636    #       }
     637    #   }
     638    #   $$textref = "<pre>\n".$$textref."</pre>";
     639    # } else {
     640    #   print STDERR "WARNING: NutchTextDumpPlugin::process: had found a text start marker but not text end marker.\n";
     641    #   $$textref = "<pre></pre>";
     642    # }
     643   
     644    # (2) parse out text of this record
     645    my $no_text = 1;
     646    if($text_start_index != -1) { # had found a "text:start:" marker, so we should have text content for this record
     647    if($$textref =~ m/text:start:\r?\n(.*?)\r?\ntext:end:/) {
     648        $$textref = $1;
     649        if($$textref !~ m/^\s*$/) {
     650        $$textref = $self->to_utf8($encoding, $$textref) if ($encoding);
     651        $$textref = "<pre>\n".$$textref."\n</pre>";
     652        $no_text = 0;
     653        }
    684654    }
    685     if($no_text) {
    686         $$textref = "<pre></pre>";
    687     }
    688 
    689         # Debugging
    690         # To avoid "wide character in print" messages for debugging, set binmode of handle to utf8/encoding
    691     # https://stackoverflow.com/questions/15210532/use-of-use-utf8-gives-me-wide-character-in-print
    692         # if ($self->{'verbosity'} > 3) {
    693     #     if($encoding && $encoding eq "utf8") {
    694     #   binmode STDERR, ':utf8';
    695     #     }
    696 
    697     #     print STDERR "TITLE: $title_meta\n";
    698     #     print STDERR "ENCODING = $encoding\n" if $encoding;
    699     #     #print STDERR "---------------\nTEXT CONTENT\n---------\n", $$textref, "\n------------------------\n";
    700     # }
    701    
    702 
     655    }
     656    if($no_text) {
     657    $$textref = "<pre></pre>";
     658    }
     659   
     660    # Debugging
     661    # To avoid "wide character in print" messages for debugging, set binmode of handle to utf8/encoding
     662    # https://stackoverflow.com/questions/15210532/use-of-use-utf8-gives-me-wide-character-in-print
     663    # if ($self->{'verbosity'} > 3) {
     664    #     if($encoding && $encoding eq "utf8") {
     665    #   binmode STDERR, ':utf8';
     666    #     }
     667   
     668    #     print STDERR "TITLE: $title_meta\n";
     669    #     print STDERR "ENCODING = $encoding\n" if $encoding;
     670    #     #print STDERR "---------------\nTEXT CONTENT\n---------\n", $$textref, "\n------------------------\n";
     671    # }
     672   
     673   
    703674    $doc_obj->add_utf8_text($cursection, $$textref);
    704675   
Note: See TracChangeset for help on using the changeset viewer.