Changeset 34124


Ignore:
Timestamp:
2020-05-27T18:03:58+12:00 (4 years ago)
Author:
ak19
Message:

Decoding the title and text using the encoding seemed to have turned into a problem, both for windows 1252 but also <td valign=top>

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/NutchTextDumpPlugin.pm

    r34123 r34124  
    90 90 # Example record in dump.txt to process:
    91 91    # https://www.whanau-tahi.school.nz/    key:    nz.school.whanau-tahi.www:https/
     92    # OR: http://yutaka.it-n.jp/apa/750010010.html        key:    jp.it-n.yutaka:http/apa/750010010.html
    92 93    # baseUrl:  null
    93 94    # status:   2 (status_fetched)
     
    609 610    # (and contains characters above codepoint 255). You can't decode it again."
    610 611    if($title_meta && $title_meta ne "" && $title_meta ne "null") {
    611     $title_meta = $self->to_utf8($encoding, $title_meta) if ($encoding);
     612    #$title_meta = $self->to_utf8($encoding, $title_meta) if ($encoding);
    612 613    } else { # if we have "null" as title metadata, set it to the record URL?
    613 614    #my $srcURLs = $doc_obj->get_metadata($cursection, "ex.srcURL");
     
    615 616    #$title_meta = $srcURLs->[0] if (scalar @$srcURLs > 0);
    616 617    my $srcURL = $doc_obj->get_metadata_element($cursection, "srcURL", 1); # TODO: why does ex.srcURL not work, nor srcURL without 3rd param
     618    my ($basicURL) = $srcURL =~ m@^https?://(?:www\.)?(.*)$@; # use basicURL for title instead of srcURL, else many docs get classified under "Htt" bucket for https
    617 619    if(defined $srcURL) {
    618         print STDERR "@@@@ null/empty title to be replaced with ".$srcURL."\n"
     620        print STDERR "@@@@ null/empty title to be replaced with ".$basicURL."\n"
    619 621        if $self->{'verbosity'} > 3;
    620         $title_meta = $srcURL;
     622        $title_meta = $basicURL;
    621 623    }
    622 624    }
     
    659 661        $$textref = $1;
    660 662        if($$textref !~ m/^\s*$/) {
    661         $$textref = $self->to_utf8($encoding, $$textref) if ($encoding);
     663        #$$textref = $self->to_utf8($encoding, $$textref) if ($encoding);
    662 664        $$textref = "<pre>\n".$$textref."\n</pre>";
    663 665        $no_text = 0;
Note: See TracChangeset for help on using the changeset viewer.