source: main/trunk/greenstone2/perllib/plugins/NutchTextDumpPlugin.pm@34124

Last change on this file since 34124 was 34124, checked in by ak19, 4 years ago

Decoding the title and text using the encoding seemed to have turned into a problem, both for windows-1252 and for utf-8

File size: 29.9 KB
###########################################################################
#
# NutchTextDumpPlugin.pm -- plugin for dump.txt files generated by Nutch
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 2002 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# This plugin was originally created to process Nutch dump.txt files produced from recrawling commoncrawl (CC)
# results for pages detected by CC as being in Maori.
# It splits each web site's dump.txt into its individual records: as each record represents a web page,
# this produces one greenstone document per web page.
#
# For a commoncrawl collection of siteID-labelled folders, each containing a dump.txt file:
# - set <importOption name="OIDtype" value="dirname"/>
# - create 2 List browsing classifiers (with bookshelf_type set to always) on ex.siteID and ex.srcDomain,
#   both sorted by ex.srcURL, and an ex.Title classifier.
#   For the ex.srcDomain classifier, set removeprefix to: https?\:\/\/(www\.)?
#   An alternative is to build that List classifier on ex.basicDomain instead of ex.srcDomain.
#   Set this List classifier's "partition_type_within_level" option to "per_letter".
# - add search indexes on text (default), Title, basicDomain, siteID, Identifier, srcURL (not working)
#   (see the commented collectionConfig sketch below)
#
# Finally, in the "display" format statement, add the following before the "wrappedSectionText" to
# display the most relevant metadata of each record:
# <gsf:template name="documentContent">
#   <div id="nutch-dump-txt-record">
#     <h3>Record:</h3>
#     <br/>
#     <dl>
#       <dt>URL:</dt>
#       <dd>
#         <gsf:metadata name="srcURL"/>
#       </dd>
#       <dt>Title:</dt>
#       <dd>
#         <gsf:metadata name="ex.Title"/>
#       </dd>
#       <dt>Identifier:</dt>
#       <dd>
#         <gsf:metadata name="Identifier"/>
#       </dd>
#       <dt>SiteID:</dt>
#       <dd>
#         <gsf:metadata name="siteID"/>
#       </dd>
#       <dt>Status:</dt>
#       <dd>
#         <gsf:metadata name="status"/>
#       </dd>
#       <dt>ProtocolStatus:</dt>
#       <dd>
#         <gsf:metadata name="protocolStatus"/>
#       </dd>
#       <dt>ParseStatus:</dt>
#       <dd>
#         <gsf:metadata name="parseStatus"/>
#       </dd>
#       <dt>CharEncodingForConversion:</dt>
#       <dd>
#         <gsf:metadata name="CharEncodingForConversion"/>
#       </dd>
#       <dt>OriginalCharEncoding:</dt>
#       <dd>
#         <gsf:metadata name="OriginalCharEncoding"/>
#       </dd>
#     </dl>
#   </div>
# </gsf:template>

# + DONE: remove illegible values for metadata _rs_ and _csh_ in the example below before
# committing, in case their encoding affects the loading/reading in of this perl file.
#
# Example record in dump.txt to process:
# https://www.whanau-tahi.school.nz/ key: nz.school.whanau-tahi.www:https/
# OR: http://yutaka.it-n.jp/apa/750010010.html key: jp.it-n.yutaka:http/apa/750010010.html
# baseUrl: null
# status: 2 (status_fetched)
# fetchTime: 1575199241154
# prevFetchTime: 1572607225779
# fetchInterval: 2592000
# retriesSinceFetch: 0
# modifiedTime: 0
# prevModifiedTime: 0
# protocolStatus: SUCCESS, args=[]
# signature: d84c84ccf0c86aa16a19e03cb1fc5827
# parseStatus: success/ok (1/0), args=[]
# title: Te Kura Kaupapa Māori o Te Whānau Tahi
# score: 1.0
# marker _injmrk_ : y
# marker _updmrk_ : 1572607228-9584
# marker dist : 0
# reprUrl: null
# batchId: 1572607228-9584
# metadata CharEncodingForConversion : utf-8
# metadata OriginalCharEncoding : utf-8
# metadata _rs_ :
# metadata _csh_ :
# text:start:
# Te Kura Kaupapa Māori o Te Whānau Tahi He mihi He mihi Te Kaupapa Ngā Tāngata Te Kākano Te Pihinga Te Tipuranga Te Puāwaitanga Te Tari Te Poari Matua Whakapā mai He mihi He mihi Te Kaupapa Ngā Tāngata Te Kākano Te Pihinga Te Tipuranga Te Puāwaitanga Te Tari Te Poari Matua Whakapā mai TE KURA KAUPAPA MĀORI O TE WHĀNAU TAHI He mihi Kei te mōteatea tonu nei ngā mahara ki te huhua kua mene atu ki te pō, te pōuriuri, te pōtangotango, te pō oti atu rā. Kua rite te wāhanga ki a rātou, hoki mai ki te ao tūroa nei Ko Io Matua Kore te pūtaketanga, te pūkaea, te pūtātara ka rangona whānuitia e te ao. Ko tāna ko ngā whetū, te marama, te haeata ki a Tamanui te rā. He atua i whakateretere mai ai ngā waka i tawhiti nui, i tawhiti roa, i tawhiti mai rā anō. Kei nga ihorei, kei ngā wahapū, kei ngā pukumahara, kei ngā kanohi kai mātārae o tō tātou nei kura Aho Matua, Te Kura Kaupapa Māori o Te Whanau Tahi. Anei rā te maioha ki a koutou katoa e pūmau tonu ki ngā wawata me ngā whakakitenga i whakatakotoria e ngā poupou i te wā i a rātou. Ka whakanuia hoki te toru tekau tau o tēnei kura mai i tōna orokohanga timatanga tae noa ki tēnei wā Ka pūmau tōnu mātou ki te whakatauki o te kura e mea ana “Poipoia ō tātou nei pūmanawa” Takiritia tonutia te ra ki runga i Te Kura Kaupapa Maori o Te Whanau Tahi . Back to Top " Poipoia ō tātou nei pūmanawa -  Making our potential a reality "   ©  Te Kura Kaupapa Māori o Te Whānau Tahi, 2019  Cart ( 0 )
# text:end:
#
# https://www.whanau-tahi.school.nz/cart key: nz.school.whanau-tahi.www:https/cart
# baseUrl: null
# status: 2 (status_fetched)
# ...
#
# - Some records may have empty text content between the text:start: and text:end: markers,
#   while other records may be missing these markers along with any text.
# - Metadata is of the form "key : value", but some metadata values contain ":". For example,
#   the "protocolStatus" metadata can contain a URL as its value, whose protocol portion contains ":".
# - metadata _rs_ and _csh_ contain illegible values, so this code discards them when storing metadata.
#
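# Since metadata values may themselves contain ":", the parsing code below splits each
# metadata line on the first colon only. A minimal sketch of that behaviour:
#   my ($name, $value) = split(/:/, "protocolStatus: SUCCESS, args=[]", 2);
#   # $name is "protocolStatus", $value is " SUCCESS, args=[]" (whitespace is then trimmed)
#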
# If you provide a keep_urls_file when configuring NutchTextDumpPlugin and the path given is
# relative, the plugin will look for that file (e.g. a urls.txt) in the collection's etc folder.
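#
# A hedged sketch of configuring this plugin in collectionConfig.xml (assuming the usual
# plugin/option syntax; "urls.txt" is just an illustrative filename):
# <plugin name="NutchTextDumpPlugin">
#   <option name="-keep_urls_file" value="urls.txt"/>
# </plugin>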


package NutchTextDumpPlugin;

use SplitTextFile;

use Encode;
use unicode;
use util;

use strict;
no strict 'refs'; # allow filehandles to be variables and vice versa
# TODO:
# + 1. Split each dump.txt file into its individual records as individual docs
# + 2. Store the meta of each individual record/doc
# ?3. Name each doc siteID.docID, else HASH of internal text. See EmailPlugin?
#    - In SplitTextFile::read(), why is $segment, which also counts discarded docs, used to add the record ID
#      rather than $count, which only counts included docs? I am referring to the code:
#          $self->add_OID($doc_obj, $id, $segment);
#      The way I've solved this is by setting the OIDtype importOption. Not sure if this is what was required.
# + 4. Keep a map of all URLs seen - whitelist URLs.
# + 5. Implement the optional input file of URLs: if an infile is provided, keep only those records
#      whose URLs are in the map. Only these matching records should become docs.
# 6. Rebuild the full collection of all dump.txt files with this collection design.
#
# TIDY UP:
# + Create util::trim()
# + Add to perl's strings.properties: NutchTextDumpPlugin.keep_urls_file
#
# CLEANUP:
# Remove MetadataRead functions and inheritance
#
# QUESTIONS:
# - encoding = utf-8 is changed to "utf8" as required by the copied-in to_utf8(str) method. Why does it not
#   convert the string parameter, but fail in the decode() step instead? Is it because the string is already in UTF8?
# - Problem converting text with its stated encoding across the full set of nutch dump.txt files when the encoding is windows-1252.
# - TODOs
#
# - Should I add metadata as "ex."+meta or as meta? e.g. ex.srcURL or srcURL?
# - Want to read in keep_urls_file, maintaining a hashmap of its URLs, only on import. Isn't that correct?
#   Then how can I initialise this only once, and only during import? The constructor and init() methods are called during buildcol too.
#   For now, I've done it in can_process_this_file(), but there must be a more appropriate place and correct way to do this?
# - Why can't I do doc_obj->get_metadata_element($section, "ex.srcURL"), but have to pass "srcURL" and 1 to ignore
#   namespace?
# - In the collectionConfig file I have to leave out the ex. prefix for all but Title. Why?
# - In GLI, among the browsing classifier sort_leaf options, "ex.srcURL" appears only as "ex.srcurl" (lowercased). Why?
# - On the other hand, in GLI's search indexes, both ex.srcurl and ex.srcURL appear. But only building
#   with an index on ex.srcURL provides a search option in the search box. But then searching on an existing
#   srcURL produces 0 results anyway.
# - Is this all because I am naming my ex metadata wrongly? e.g. ex.srcURL, ex.siteID, ex.srcDomain.
#
# CHECK:
# - title fallback is URL.
# - util::tidy_up_OID() prints a warning. SiteID is the foldername and OIDtype=dirname, so the fully numeric
#   siteID to OID conversion results in a warning message that the siteID is fully numeric and gets 'D' prefixed.
#   Is this warning still necessary?


sub BEGIN {
    @NutchTextDumpPlugin::ISA = ('SplitTextFile');
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
}

my $arguments =
    [ { 'name' => "keep_urls_file",
        'desc' => "{NutchTextDumpPlugin.keep_urls_file}",
        'type' => "string",
        #'deft' => "urls.txt",
        'reqd' => "no" },
      { 'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => &get_default_process_exp() },
      { 'name' => "split_exp",
        'desc' => "{SplitTextFile.split_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => &get_default_split_exp() }
      ];

my $options = { 'name' => "NutchTextDumpPlugin",
                'desc' => "{NutchTextDumpPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'explodes' => "yes",
                'args' => $arguments };

sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about the options
        return bless $self, $class;
    }

    $self->{'keep_urls_processed'} = 0;
    $self->{'keep_urls'} = undef;

    #return bless $self, $class;
    $self = bless $self, $class;

    # Can only call any methods on $self AFTER the bless operation above
    #$self->setup_keep_urls(); # want to set up the keep_urls hashmap only once, so have to do it here (init is also called by buildcol)

    return $self;
}

# sub init {
#     my $self = shift (@_);
#     my ($verbosity, $outhandle, $failhandle) = @_;
#
#     if(!$self->{'keep_urls_file'}) {
#         my $msg = "NutchTextDumpPlugin INFO: No urls file provided.\n" .
#             "    No records will be filtered.\n";
#         print $outhandle $msg if ($verbosity > 2);
#
#         $self->SUPER::init(@_);
#         return;
#     }
#
#     # read in the keep urls files
#     my $keep_urls_file = &util::locate_config_file($self->{'keep_urls_file'});
#     if (!defined $keep_urls_file)
#     {
#         my $msg = "NutchTextDumpPlugin INFO: Can't locate urls file $keep_urls_file.\n" .
#             "    No records will be filtered.\n";
#
#         print $outhandle $msg;
#
#         $self->{'keep_urls'} = undef;
#         # Not an error if there's no $keep_urls_file: it just means all records
#         # in dump.txt will be processed.
#     }
#     else {
#         #$self->{'keep_urls'} = $self->parse_keep_urls_file($keep_urls_file, $outhandle);
#         #$self->{'keep_urls'} = {};
#         $self->parse_keep_urls_file($keep_urls_file, $outhandle, $failhandle);
#     }
#
#     ## if($self->{'keep_urls'} && $verbosity > 2) {
#     ##     print STDERR "@@@@ keep_urls hash map contains:\n";
#     ##     map { print STDERR $_."=>".$self->{'keep_urls'}->{$_}."\n"; } keys %{$self->{'keep_urls'}};
#     ## }
#     $self->SUPER::init(@_);
# }

sub setup_keep_urls {
    my $self = shift (@_);

    my $verbosity = $self->{'verbosity'};
    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->{'keep_urls_processed'} = 1; # flag to track whether this method has been called already during import

    #print $outhandle "@@@@ In NutchTextDumpPlugin::setup_keep_urls()\n";

    if(!$self->{'keep_urls_file'}) {
        my $msg = "NutchTextDumpPlugin INFO: No urls file provided.\n" .
            "    No records will be filtered.\n";
        print $outhandle $msg if ($verbosity > 2);

        return;
    }

    # read in the keep urls file
    my $keep_urls_file = &util::locate_config_file($self->{'keep_urls_file'});
    if (!defined $keep_urls_file)
    {
        # report the configured filename, since $keep_urls_file is undef in this branch
        my $msg = "NutchTextDumpPlugin INFO: Can't locate urls file " . $self->{'keep_urls_file'} . ".\n" .
            "    No records will be filtered.\n";

        print $outhandle $msg;

        $self->{'keep_urls'} = undef;
        # TODO: Not a fatal error if $keep_urls_file can't be found: it just means all records
        # in dump.txt will be processed?
    }
    else {
        #$self->{'keep_urls'} = $self->parse_keep_urls_file($keep_urls_file, $outhandle);
        #$self->{'keep_urls'} = {};
        $self->parse_keep_urls_file($keep_urls_file, $outhandle, $failhandle);
    }

    #if(defined $self->{'keep_urls'}) {
    #    print STDERR "@@@@ keep_urls hash map contains:\n";
    #    map { print STDERR $_."=>".$self->{'keep_urls'}->{$_}."\n"; } keys %{$self->{'keep_urls'}};
    #}

}

# TODO: This is an ugly way to do this and a non-intuitive place to do this. Is there a better way?
# Overriding can_process_this_file() in order to avoid setting up the keep_urls hashmap during
# buildcol.pl. We only want to set up the hash during import.
# During buildcol, this method is called with directories and not files, and so it will return
# false. So when it returns true, it will be import.pl, and we check whether we haven't
# already set up the keep_urls map. If the keep urls file has not yet been processed, then we set up
# the hashmap once.
sub can_process_this_file {
    my $self = shift(@_);
    my ($filename) = @_;
    my $can_process_return_val = $self->SUPER::can_process_this_file(@_);

    # We want to load in the keep_urls_file and create the keep_urls hashmap only once, during import,
    # because the keep urls file can be large, and it and the hashmap serve no purpose during buildcol.pl.
    # Check whether we've already processed the file/built the hashmap, as we don't want to do this
    # more than once, even within just the import cycle.
    if($can_process_return_val && !$self->{'keep_urls_processed'}) { #!defined $self->{'keep_urls'}) {
        $self->setup_keep_urls();
    }

    return $can_process_return_val;

}

sub parse_keep_urls_file {
    my $self = shift (@_);
    my ($urls_file, $outhandle, $failhandle) = @_;

    # https://www.caveofprogramming.com/perl-tutorial/perl-hashes-a-guide-to-associative-arrays-in-perl.html
    # https://stackoverflow.com/questions/1817394/whats-the-difference-between-a-hash-and-hash-reference-in-perl
    $self->{'keep_urls'} = {}; # hash reference init to {}

    # What if it is a very long file of URLs? Need to read a line at a time!
    #my $contents = &FileUtils::readUTF8File($urls_file); # could just call $self->read_file() inherited from SplitTextFile's parent ReadTextFile
    #my @lines = split(/(?:\r?\n)+/, $$textref);

    # Open the file in UTF-8 mode https://stackoverflow.com/questions/2220717/perl-read-file-with-encoding-method
    # and read it in line by line into the map
    my $fh;
    if (open($fh,'<:encoding(UTF-8)', $urls_file)) {
        while (defined (my $line = <$fh>)) {
            $line = &util::trim($line); #$line =~ s/^\s+|\s+$//g; # trim whitespace
            if($line =~ m@^https?://@) { # add only URLs
                $self->{'keep_urls'}->{$line} = 1; # add the url to our perl hash
            }
        }
        close $fh;
    } else {
        my $msg = "NutchTextDumpPlugin ERROR: Unable to open file keep_urls_file: \"" .
            $self->{'keep_urls_file'} . "\".\n " .
            "    No records will be filtered.\n";
        print $outhandle $msg;
        print $failhandle $msg;
        # Not fatal. TODO: should it be fatal when it can still process all URLs just because
        # it can't find the specified keep-urls.txt file?
    }

    # if the keep_urls hash is empty, ensure it is undefined from this point onward
    # https://stackoverflow.com/questions/9444915/how-to-check-if-a-hash-is-empty-in-perl
    # (dereference the hash reference here; assigning the reference itself to a hash was a bug)
    if(!keys %{ $self->{'keep_urls'} }) {
        $self->{'keep_urls'} = undef;
    }

}

# Accept "dump.txt" files (which are in numeric siteID folders),
# and txt files with a numeric siteID, e.g. "01441.txt",
# if I preprocessed dump.txt files by renaming them this way.
sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)((dump|\d+)\.txt)$^;
}


sub get_default_split_exp {

    # The previous line is either a newline or the start of dump.txt;
    # the current line should start with a url protocol and contain " key: .... http(s)/"
    # \r\n for msdos eol, \n for unix

    # The regex return value of this method is passed into a call to perl's split.
    # Perl's split(), by default, throws away the delimiter.
    # Any capturing group that makes up or is part of the delimiter becomes a separate element returned by split.
    # We want to throw away the empty newlines preceding the first line of a record "https? .... key: https?/"
    # but we want to keep that first line as part of the upcoming record.
    # - To keep the first line of a record, though it becomes its own split-element, use capture groups in the split regex:
    #   https://stackoverflow.com/questions/14907772/split-but-keep-delimiter
    # - To skip the unwanted empty lines preceding the first line of a record, use ?: in front of its capture group
    #   to discard that group:
    #   https://stackoverflow.com/questions/3512471/what-is-a-non-capturing-group-in-regular-expressions
    # - Next use a positive look-ahead (?= in front of the capture group, vs ?! for negative look-ahead)
    #   to match but not capture the first line of a record (so the matched look-ahead is retained as the
    #   first line of the next record):
    #   https://stackoverflow.com/questions/14907772/split-but-keep-delimiter
    #   and http://www.regular-expressions.info/lookaround.html
    # - For a non-greedy match, use .*?
    #   https://stackoverflow.com/questions/11898998/how-can-i-write-a-regex-which-matches-non-greedy
    return q^(?:$|\r?\n\r?\n)(?=https?://.+?\skey:\s+.*?https?/)^;

}

# TODO: This method was copied from MARCPlugin.pm, with the return statement uncommented for when encoding = utf8.
# Move it to a utility perl file, since the code is mostly shared?
# The bulk of this function is based on read_line in multiread.pm.
# Unable to use the original read_line because it expects to get its input
# from a file. Here the line to be converted is passed in as a string.

# TODO:
# Is this function even applicable to NutchTextDumpPlugin?
# I get errors in this method when the encoding is utf-8, in the decode step.
# I get warnings/errors somewhere in this file (maybe also at decode) when the encoding is windows-1252.

sub to_utf8
{
    my $self = shift (@_);
    my ($encoding, $line) = @_;

    if ($encoding eq "utf8") {
        # nothing needs to be done
        return $line;
    } elsif ($encoding eq "iso_8859_1" || $encoding eq "windows-1252") { # TODO: do this also for windows-1252?
        # we'll use ascii2utf8() for this as it's faster than going
        # through convert2unicode()
        #return &unicode::ascii2utf8 (\$line);
        $line = &unicode::ascii2utf8 (\$line);
    } else {

        # everything else uses unicode::convert2unicode
        $line = &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
    }
    # At this point $line is a binary byte string
    # => turn it into a Unicode aware string, so full
    # Unicode aware pattern matching can be used.
    # For instance: 's/\x{0101}//g' or '[[:upper:]]'

    return decode ("utf8", $line);
}
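
# A possible guard (an untested sketch, not part of the original method) for the decode()
# failures noted in the TODO above: decoding a string that has already been decoded croaks
# ("cannot decode string with wide characters"), so only decode actual byte strings:
#   return Encode::is_utf8($line) ? $line : decode("utf8", $line);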


# do plugin specific processing of doc_obj
# This gets done for each record found by SplitTextFile in the dump.txt files.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $outhandle = $self->{'outhandle'};
    my $filename = &util::filename_cat($base_dir, $file);

    my $cursection = $doc_obj->get_top_section();

    # https://stackoverflow.com/questions/27801561/turn-off-binmodestdout-utf8-locally
    #binmode STDERR, ':utf8'; ## FOR DEBUGGING! To avoid "wide character in print" messages

    #print STDERR "---------------\nDUMP.TXT\n---------\n", $$textref, "\n------------------------\n";

    # (1) parse out the metadata of this record
    my $metaname;
    my $encoding;
    my $title_meta;

    my $line_index = 0;
    my $text_start_index = -1;
    my @lines = split(/(?:\r?\n)+/, $$textref);

    foreach my $line (@lines) {
        #$line =~ s@\{@\\{@g; # escape open curly braces for newer perl

        # the first line is special and contains the URL (no metaname)
        # and the inverted URL labelled with metaname "key"
        if($line =~ m/^https?/ && $line =~ m/\s+key:\s+/) {
            my @vals = split(/key:/, $line);
            # get url and key, and trim whitespace simultaneously
            my $url = &util::trim($vals[0]);
            my $key = &util::trim($vals[1]);

            # if we have a keep_urls hash, then only process records of whitelisted urls
            if(defined $self->{'keep_urls'} && !$self->{'keep_urls'}->{$url}) {
                # URL not whitelisted, so stop processing this record
                print STDERR "@@@@@@ INFO NutchTextDumpPlugin::process(): discarding record for URL not whitelisted: $url\n"
                    if $self->{'verbosity'} > 3;
                return 0;
            } else {
                print STDERR "@@@@@@ INFO NutchTextDumpPlugin::process(): processing record of whitelisted URL $url...\n"
                    if $self->{'verbosity'} > 3;
            }
            $doc_obj->add_utf8_metadata ($cursection, "ex.srcURL", $url);
            $doc_obj->add_utf8_metadata ($cursection, "ex.key", $key);


            # let's also set the domain from the URL, as that will make a
            # more informative bookshelf label than siteID.
            # For the complete domain, keep protocol:// and every non-slash character after it.
            # (This avoids requiring the presence of a subsequent slash)
            # https://stackoverflow.com/questions/3652527/match-regex-and-assign-results-in-single-line-of-code
            # Can clean up protocol and www. in the List classifier's bookshelf's remove_prefix option,
            # or can build the classifier on basicDomain instead.

            my ($domain, $basicDomain) = $url =~ m@(^https?://(?:www\.)?([^/]+)).*@;
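            # e.g. for https://www.whanau-tahi.school.nz/cart (the example record above),
            # $domain is https://www.whanau-tahi.school.nz and $basicDomain is whanau-tahi.school.nz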
            #my ($domain, $protocol, $basicdomain) = $url =~ m@((^https?)://([^/]+)).*@; # Works
            $doc_obj->add_utf8_metadata ($cursection, "ex.srcDomain", $domain);
            $doc_obj->add_utf8_metadata ($cursection, "ex.basicDomain", $basicDomain);

        }
        # check for full text
        elsif ($line =~ m/text:start:/) {
            $text_start_index = $line_index;
            last; # if we've reached the full text portion, we're past the metadata portion of this record
        }
        elsif($line =~ m/^[^:]+:.+$/) { # look for meta #elsif($line =~ m/^[^:]+:[^:]+$/) { # won't allow protocol://url in metavalue
            # Split on the first ":" only, since metadata values can themselves contain ":"
            # (splitting on every ":" and rejoining with "" used to drop colons from values).
            my ($metaname, $metavalue) = split(/:/, $line, 2);

            # skip "metadata _rs_" and "metadata _csh_" as these contain illegible characters for values
            if($metaname !~ m/metadata\s+_(rs|csh)_/) {

                # trim whitespace
                $metaname = &util::trim($metaname);
                $metavalue = &util::trim($metavalue);

                if($metaname eq "title") { # TODO: what to do about "title: null" cases?
                    ##print STDERR "@@@@ Found title: $metavalue\n";
                    #$metaname = "Title"; # will set "title" as "Title" metadata instead
                    # TODO: treat title metadata specially by using the character encoding to store it correctly?

                    # Won't add Title metadata to docObj until after all meta is processed,
                    # when we'll know the encoding and can process the title meta
                    $title_meta = $metavalue;
                    $metavalue = ""; # will force ex.Title metadata to be added AFTER the for loop
                }
                elsif($metaname =~ m/CharEncodingForConversion/) { # TODO: or look for "OriginalCharEncoding"?
                    ##print STDERR "@@@@ Found encoding: $metavalue\n";
                    $encoding = $metavalue; # TODO: should we use this to interpret the text and title in the correct encoding and convert to utf-8?

                    if($encoding eq "utf-8") {
                        $encoding = "utf8"; # method to_utf8() recognises "utf8" not "utf-8"
                    } else {
                        print STDERR "@@@@@@ WARNING NutchTextDumpPlugin::process(): Record's Nutch-assigned CharEncodingForConversion was not utf-8: $encoding\n";
                    }


                }

                # move occurrences of "marker " or "metadata " strings at the start of metaname to the end
                #$metaname =~ s/^(marker|metadata)\s+(.*)$/$2$1/;
                # remove "marker " or "metadata " strings from the start of metaname
                $metaname =~ s/^(marker|metadata)\s+//;
                # remove underscores and all remaining spaces in metaname
                $metaname =~ s/[ _]//g;

                # add meta to docObject if both metaname and metavalue are non-empty strings
                if($metaname ne "" && $metavalue ne "") { # && $metaname ne "rs" && $metaname ne "csh") {
                    $doc_obj->add_utf8_metadata ($cursection, "ex.".$metaname, $metavalue);
                    #print STDERR "Added meta |$metaname| = |$metavalue|\n"; #if $metaname =~ m/ProtocolStatus/i;
                }

            }
        } elsif ($line !~ m/^\s*$/) { # Not expecting any other type of non-empty line (or even empty lines)
            print STDERR "NutchTextDump line not recognised as URL meta, other metadata or text content:\n\t$line\n";
        }

        $line_index++;
    }


    # Add FileFormat metadata
    $doc_obj->add_metadata($cursection, "FileFormat", "NutchDumpTxt");

    # Correct the title metadata using the encoding, if we have $encoding at last
    # $title_meta = $self->to_utf8($encoding, $title_meta) if $encoding;
    # https://stackoverflow.com/questions/12994100/perl-encode-pm-cannot-decode-string-with-wide-character
    # Error message: "Perl Encode.pm cannot decode string with wide character"
    # "That error message is saying that you have passed in a string that has already been decoded
    # (and contains characters above codepoint 255). You can't decode it again."
    if($title_meta && $title_meta ne "" && $title_meta ne "null") {
        #$title_meta = $self->to_utf8($encoding, $title_meta) if ($encoding);
    } else { # if we have "null" as title metadata, set it to the record URL?
        #my $srcURLs = $doc_obj->get_metadata($cursection, "ex.srcURL");
        #print STDERR "@@@@ null title to be replaced with ".$srcURLs->[0]."\n";
        #$title_meta = $srcURLs->[0] if (scalar @$srcURLs > 0);
        my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL", 1); # TODO: why does ex.srcURL not work, nor srcURL without the 3rd param?
        if(defined $srcURL) { # only match against $srcURL once we know it is defined
            # use basicURL for the title instead of srcURL, else many docs get classified under the "Htt" bucket for https
            my ($basicURL) = $srcURL =~ m@^https?://(?:www\.)?(.*)$@;
            print STDERR "@@@@ null/empty title to be replaced with ".$basicURL."\n"
                if $self->{'verbosity'} > 3;
            $title_meta = $basicURL;
        }
    }
    $doc_obj->add_utf8_metadata ($cursection, "Title", $title_meta);


    # When importOption OIDtype = dirname, the base_OID will be that dirname,
    # which was crafted to be the siteID. However, because our siteID is all numeric,
    # a D gets prepended to create the baseOID. Remove the starting 'D' to get the actual siteID.
    my $siteID = $self->get_base_OID($doc_obj);
    #print STDERR "BASE OID: " . $self->get_base_OID($doc_obj) . "\n";
    $siteID =~ s/^D//;
    $doc_obj->add_utf8_metadata ($cursection, "ex.siteID", $siteID);
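    # e.g. a numeric siteID folder named 01441 yields baseOID D01441, which the
    # substitution above turns back into siteID 01441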


    # (2) parse out the text of this record (an earlier line-based approach, kept for reference)
    # if($text_start_index != -1 && pop(@lines) =~ m/text:end:/) { # we only have text content if there were "text:start:" and "text:end:" markers.
    #     # TODO: are we guaranteed the popped line is text:end: and not empty/newline?
    #     @lines = splice(@lines,0,$text_start_index+1); # just keep every line AFTER text:start:, have already removed (popped) "text:end:"
    #
    #     # glue together the remaining lines, if there are any, into textref
    #     # https://stackoverflow.com/questions/7406807/find-size-of-an-array-in-perl
    #     if(scalar (@lines) > 0) {
    #         # TODO: do anything with $encoding to convert line to utf-8?
    #         foreach my $line (@lines) {
    #             $line = $self->to_utf8($encoding, $line) if $encoding; #if $encoding ne "utf-8";
    #             $$textref .= $line."\n";
    #         }
    #     }
    #     $$textref = "<pre>\n".$$textref."</pre>";
    # } else {
    #     print STDERR "WARNING: NutchTextDumpPlugin::process: had found a text start marker but not a text end marker.\n";
    #     $$textref = "<pre></pre>";
    # }

    # (2) parse out the text of this record
    my $no_text = 1;
    if($text_start_index != -1) { # had found a "text:start:" marker, so we should have text content for this record
        # the /s modifier lets . match newlines too, so multi-line text content is also captured
        if($$textref =~ m/text:start:\r?\n(.*?)\r?\ntext:end:/s) {
            $$textref = $1;
            if($$textref !~ m/^\s*$/) {
                #$$textref = $self->to_utf8($encoding, $$textref) if ($encoding);
                $$textref = "<pre>\n".$$textref."\n</pre>";
                $no_text = 0;
            }
        }
    }
    if($no_text) {
        $$textref = "<pre></pre>";
    }

    # Debugging
    # To avoid "wide character in print" messages when debugging, set binmode of the handle to utf8/encoding
    # https://stackoverflow.com/questions/15210532/use-of-use-utf8-gives-me-wide-character-in-print
    # if ($self->{'verbosity'} > 3) {
    #     if($encoding && $encoding eq "utf8") {
    #         binmode STDERR, ':utf8';
    #     }
    #
    #     print STDERR "TITLE: $title_meta\n";
    #     print STDERR "ENCODING = $encoding\n" if $encoding;
    #     #print STDERR "---------------\nTEXT CONTENT\n---------\n", $$textref, "\n------------------------\n";
    # }


    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}


1;