Ignore:
Timestamp:
2016-12-12T20:18:04+13:00 (7 years ago)
Author:
davidb
Message:

Added use of a whitelist Bloom filter for words going into the Solr index

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

    r31201 r31220  
    3333    protected String _input_dir;
    3434    protected String _json_list_filename;
     35    protected String _whitelist_filename;
    3536    protected String _solr_url;
    3637    protected String _output_dir;
     
    4445        _json_list_filename = (json_list_filename != null) ? json_list_filename : input_dir;
    4546
     47        boolean use_whitelist = Boolean.getBoolean("wcsa-ef-ingest.use-whitelist");
     48        _whitelist_filename = (use_whitelist) ?  System.getProperty("wcsa-ef-ingest.whitelist-filename") : null;
     49       
    4650        _solr_url   = solr_url;
    4751        _output_dir = output_dir;
     
    117121        System.err.println();
    118122       
    119         PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);
     123        PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_whitelist_filename,
     124                                                       _solr_url,_output_dir,_verbosity, progress_accum,per_vol);
    120125
    121126        json_list_data.foreach(per_vol_json);
     
    160165               
    161166        PerPageJSONFlatmap paged_solr_json_flatmap
    162             = new PerPageJSONFlatmap(_input_dir,_solr_url,_output_dir,_verbosity,
     167            = new PerPageJSONFlatmap(_input_dir,_whitelist_filename,
     168                                     _solr_url,_output_dir,_verbosity,
    163169                                     per_vol_progress_accum,per_vol,
    164170                                     strict_file_io);
     
    309315        ProcessForSolrIngest prep_for_ingest
    310316            = new ProcessForSolrIngest(input_dir,json_list_filename,solr_url,output_dir,verbosity);
    311        
    312        
    313         boolean use_whitelist = Boolean.getBoolean("wcsa-ef-ingest.use-whitelist");
    314                
    315         if (use_whitelist) {
    316             String whitelist_filename = System.getProperty("wcsa-ef-ingest.whitelist-filename");
    317 
    318             WhitelistBloomFilter whitelist_bloomfilter = new WhitelistBloomFilter(whitelist_filename,true);
    319             whitelist_bloomfilter.contains("foo");
    320         }
    321        
    322        
     317           
    323318        String process_ef_json_mode = System.getProperty("wcsa-ef-ingest.process-ef-json-mode","per-page");
    324319        if (process_ef_json_mode.equals("per-volume")) {
Note: See TracChangeset for help on using the changeset viewer.