Changeset 34124
- Timestamp:
- 2020-05-27T18:03:58+12:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/NutchTextDumpPlugin.pm
r34123 r34124   (columns: old line, new line, "+" = added, "-" = removed, blank = unmodified)
 90  90   # Example record in dump.txt to process:
 91  91   # https://www.whanau-tahi.school.nz/ key: nz.school.whanau-tahi.www:https/
     92 + # OR: http://yutaka.it-n.jp/apa/750010010.html key: jp.it-n.yutaka:http/apa/750010010.html
 92  93   # baseUrl: null
 93  94   # status: 2 (status_fetched)
  …   …
609 610   # (and contains characters above codepoint 255). You can't decode it again."
610 611   if($title_meta && $title_meta ne "" && $title_meta ne "null") {
611     - $title_meta = $self->to_utf8($encoding, $title_meta) if ($encoding);
    612 + #$title_meta = $self->to_utf8($encoding, $title_meta) if ($encoding);
612 613   } else { # if we have "null" as title metadata, set it to the record URL?
613 614   #my $srcURLs = $doc_obj->get_metadata($cursection, "ex.srcURL");
  …   …
615 616   #$title_meta = $srcURLs->[0] if (scalar @$srcURLs > 0);
616 617   my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL", 1); # TODO: why does ex.srcURL not work, nor srcURL without 3rd param
    618 + my ($basicURL) = $srcURL =~ m@^https?://(?:www\.)?(.*)$@; # use basicURL for title instead of srcURL, else many docs get classified under "Htt" bucket for https
617 619   if(defined $srcURL) {
618     - print STDERR "@@@@ null/empty title to be replaced with ".$srcURL."\n"
    620 + print STDERR "@@@@ null/empty title to be replaced with ".$basicURL."\n"
619 621   if $self->{'verbosity'} > 3;
620     - $title_meta = $srcURL;
    622 + $title_meta = $basicURL;
621 623   }
622 624   }
  …   …
659 661   $$textref = $1;
660 662   if($$textref !~ m/^\s*$/) {
661     - $$textref = $self->to_utf8($encoding, $$textref) if ($encoding);
    663 + #$$textref = $self->to_utf8($encoding, $$textref) if ($encoding);
662 664   $$textref = "<pre>\n".$$textref."\n</pre>";
663 665   $no_text = 0;
Note:
See TracChangeset
for help on using the changeset viewer.