- Timestamp: 2020-05-27T19:10:44+12:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/NutchTextDumpPlugin.pm
r34125 r34126
189  189
190  190
     191  + # To get all the isMRI results, I ran Robo-3T against our mongodb as
     192  + # in the instructions at http://trac.greenstone.org/browser/other-projects/maori-lang-detection/MoreReading/mongodb.txt
     193  + # Then I launched Robo-3T and connected to the mongodb
     194  + #
     195  + # Then in the "ateacrawldata" database, I ran the following queries
     196  + # to get a URL listing of all the Webpages where isMRI = true as determined
     197  + # by apache openNLP.
     198  + #
     199  + #db.getCollection('Webpages').find({isMRI:true}).count();
     200  + #7830
     201  + #
     202  + #db.getCollection('Webpages').find({isMRI:true},{URL: 1, _id: 0});
     203  + #
     204  + #Then I set robo-3T's output display to display 8000 results on a page, then copied the results into this file below.
     205  + #
     206  + # I cleaned out all the JSON from the results using regex in Notepad++.
     207  + # This then becomes our urls.txt file, which I put into the cc nutch crawl
     208  + # GS3 collection's etc folder under the name isMRI_urls.txt,
     209  + # to consider processing only webpages apache Open-NLP detected as isMRI
     210  + # into our collection.
     211  + # Remember to configure the NutchTextDumpPlugin with option "keep_urls_file" = isMRI_urls.txt to make use of this.
     212  +
191  213    sub BEGIN {
192  214    @NutchTextDumpPlugin::ISA = ('SplitTextFile');
  …    …
387  409    }
388  410
389       - # if keep_urls hash is empty, ensure it is undefined from this point onward
     411  + # If keep_urls hash is empty, ensure it is undefined from this point onward
     412  + # Use if(!keys %hash) to SECURELY test for an empty hash
390  413    # https://stackoverflow.com/questions/9444915/how-to-check-if-a-hash-is-empty-in-perl
391       - my %urls_map = $self->{'keep_urls'};
     414  + #
     415  + # But may not do: keys $hashref, only: keys %hash.
     416  + # Unable to work out how to dereference the hashref that is $self->{'keep_urls'},
     417  + # in order for me to then finally get the keys of the hashmap it refers to
     418  + # Googled: perl convert reference to hashmap
     419  + # The way to dereference hashref and get the keys is at https://www.thegeekstuff.com/2010/06/perl-hash-reference/
     420  + # keys % { $hash_ref };
     421  + my $hashmap_ref = $self->{'keep_urls'};
     422  + my %urls_map = %$hashmap_ref;
392  423    if(!keys %urls_map) {
393  424    $self->{'keep_urls'} = undef;
Note: See TracChangeset for help on using the changeset viewer.