Ignore:
Timestamp:
2016-12-20T14:15:05+13:00 (7 years ago)
Author:
davidb
Message:

Support for icu-tokenize property added, plus relevant refactoring.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeWordStreamFlatmap.java

    r31242 r31252  
    20 20       protected double            _progress_step;
    21 21
       22 +     boolean _icu_tokenize;
    22 23       boolean _strict_file_io;
    23 24
    24 25       public PerVolumeWordStreamFlatmap(String input_dir, int verbosity,
    25 26                                 DoubleAccumulator progress_accum, double progress_step,
       27 +                             boolean icu_tokenize,
    26 28                                 boolean strict_file_io)
    27 29       {
     
    32 34           _progress_step  = progress_step;
    33 35
       36 +         _icu_tokenize   = icu_tokenize;
    34 37           _strict_file_io = strict_file_io;
    35 38       }
     
    87 90                   if (ef_page != null) {
    88 91
    89    -                     ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountText(volume_id, page_id, ef_page);
       92 +                     ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountText(volume_id, page_id, ef_page, _icu_tokenize);
    90 93                       all_word_list.addAll(page_word_list);
    91 94                   }
Note: See TracChangeset for help on using the changeset viewer.