Changeset 34123


Ignore:
Timestamp:
2020-05-26T02:18:44+12:00 (4 years ago)
Author:
ak19
Message:

Some more minor changes

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/NutchTextDumpPlugin.pm

    r34122 r34123  
    31 31 #
    32 32 # For a commoncrawl collection of siteID-labelled folders containing dump.txt files each,
    33 # set <importOption name="OIDtype" value="dirname"/>
    34 # And create 2 List browsing classifiers (with bookshelf_type set to always) on ex.siteID and ex.srcDomain
     33# - set <importOption name="OIDtype" value="dirname"/>
     34# - Create 2 List browsing classifiers (with bookshelf_type set to always) on ex.siteID and ex.srcDomain
    35 35 # both sorted by ex.srcURL, and an ex.Title classifier.
    36 36 # For the ex.srcDomain classifier, set removeprefix to: https?\:\/\/(www\.)?
    37 37 # An alternative is to build that List classifier on ex.basicDomain instead of ex.srcDomain.
     38# Set this List classifier's "partition_type_within_level" option to "per_letter".
     39# - Add search indexes on text (default), Title, basicDomain, siteID, Identifier, srcURL (not working)
     40#
    38 41 # Finally, in the "display" format statement, add the following before the "wrappedSectionText" to
    39 42 # display the most relevant metadata of each record:
     
    50 53   #       <dd>
    51 54   #         <gsf:metadata name="ex.Title"/>
     55  #       </dd>
     56  #   <dt>Identifier:</dt>
     57  #       <dd>
     58  #         <gsf:metadata name="Identifier"/>
    52 59   #       </dd>
    53 60   #       <dt>SiteID:</dt>
     
    475 482    my $cursection = $doc_obj->get_top_section();
    476 483   
     484    # https://stackoverflow.com/questions/27801561/turn-off-binmodestdout-utf8-locally
     485    #binmode STDERR, ':utf8'; ## FOR DEBUGGING! To avoid "wide character in print" messages
    477 486   
    478 487    #print STDERR "---------------\nDUMP.TXT\n---------\n", $$textref, "\n------------------------\n";
     
    489 498   
    490 499    foreach my $line (@lines) {
     500    #$line =~ s@\{@\\{@g; # escape open curly braces for newer perl
     501   
    491 502    # first line is special and contains the URL (no metaname)
    492 503    # and the inverted URL labelled with metaname "key"
Note: See TracChangeset for help on using the changeset viewer.