Ignore:
Timestamp:
2016-12-12T20:18:04+13:00 (7 years ago)
Author:
davidb
Message:

Use of whitelist Bloom filter added to words going into Solr index

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONFlatmap.java

    r31096 r31220  
    25 25
    26 26    protected String _input_dir;
       27
       28    protected WhitelistBloomFilter _whitelist_bloomfilter;
       29
    27 30    protected String _solr_url;
    28 31    protected String _output_dir;
     
    34 37    boolean _strict_file_io;
    35 38
    36       public PerPageJSONFlatmap(String input_dir, String solr_url, String output_dir, int verbosity,
       39    public PerPageJSONFlatmap(String input_dir, String whitelist_filename,
       40                              String solr_url, String output_dir, int verbosity,
    37 41                              DoubleAccumulator progress_accum, double progress_step,
    38 42                              boolean strict_file_io)
    39 43    {
    40 44        _input_dir  = input_dir;
       45
       46        if (whitelist_filename != null) {
       47            _whitelist_bloomfilter = new WhitelistBloomFilter(whitelist_filename,true);
       48        }
       49
    41 50        _solr_url   = solr_url;
    42 51        _output_dir = output_dir;
     
    51 60
    52 61    public Iterator<JSONObject> call(String json_file_in) throws IOException
    53       //public void call(String json_file_in)
    54 62    {
    55               //ClusterFileIO.memory_usage("Before BZIP2 JSON file read");
    56           String full_json_file_in = _input_dir + "/" + json_file_in;
       63        String full_json_file_in = _input_dir + "/" + json_file_in;
    57 64        JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
    58           //ClusterFileIO.memory_usage("After BZIP2 JSON file read");
    59
       65
    60 66        ArrayList<JSONObject> json_pages = new ArrayList<JSONObject>();
    61 67
Note: See TracChangeset for help on using the changeset viewer.