Changeset 34123
- Timestamp: 2020-05-26T02:18:44+12:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/NutchTextDumpPlugin.pm
r34122  r34123   (columns: old line number, new line number, marker, text; "+" = added, "-" = removed)
    31      31     #
    32      32     # For a commoncrawl collection of siteID-labelled folders containing dump.txt files each,
    33           - # set <importOption name="OIDtype" value="dirname"/>
    34           - # And create 2 List browsing classifiers (with bookshelf_type set to always) on ex.siteID and ex.srcDomain
            33   + # - set <importOption name="OIDtype" value="dirname"/>
            34   + # - Create 2 List browsing classifiers (with bookshelf_type set to always) on ex.siteID and ex.srcDomain
    35      35     # both sorted by ex.srcURL, and an ex.Title classifier.
    36      36     # For the ex.srcDomain classifier, set removeprefix to: https?\:\/\/(www\.)?
    37      37     # An alternative is to build that List classifier on ex.basicDomain instead of ex.srcDomain.
            38   + # Set this List classifier's "partition_type_within_level" option to "per_letter".
            39   + # - Add search indexes on text (default), Title, basicDomain, siteID, Identifier, srcURL (not working)
            40   + #
    38      41     # Finally, in the "display" format statement, add the following before the "wrappedSectionText" to
    39      42     # display the most relevant metadata of each record:
     …       …
    50      53     # <dd>
    51      54     # <gsf:metadata name="ex.Title"/>
            55   + # </dd>
            56   + # <dt>Identifier:</dt>
            57   + # <dd>
            58   + # <gsf:metadata name="Identifier"/>
    52      59     # </dd>
    53      60     # <dt>SiteID:</dt>
     …       …
   475     482     my $cursection = $doc_obj->get_top_section();
   476     483
           484   + # https://stackoverflow.com/questions/27801561/turn-off-binmodestdout-utf8-locally
           485   + #binmode STDERR, ':utf8'; ## FOR DEBUGGING! To avoid "wide character in print" messages
   477     486
   478     487     #print STDERR "---------------\nDUMP.TXT\n---------\n", $$textref, "\n------------------------\n";
     …       …
   489     498
   490     499     foreach my $line (@lines) {
           500   + #$line =~ s@\{@\\{@g; # escape open curly braces for newer perl
           501   +
   492     502     # first line is special and contains the URL (no metaname)
   493     503     # and the inverted URL labelled with metaname "key"
Note: See TracChangeset for help on using the changeset viewer.