Ignore:
Timestamp:
2016-12-27T18:51:42+13:00 (7 years ago)
Author:
davidb
Message:

Rekindling of the per-volume approach. Also some tweaking of the verbosity of debug printing in the per-page approach.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java

    r31252 r31266  
    22
    33import java.io.IOException;
     4import java.util.ArrayList;
     5import java.util.Iterator;
    46
     7import org.apache.spark.api.java.function.FlatMapFunction;
    58import org.apache.spark.api.java.function.VoidFunction;
    69import org.apache.spark.util.DoubleAccumulator;
     
    1821
    1922
    20 public class PerVolumeJSON implements VoidFunction<String>
     23//public class PerVolumeJSON implements VoidFunction<String>
     24public class PerVolumeJSON implements FlatMapFunction<String,String>
    2125{
    2226    private static final long serialVersionUID = 1L;
     
    5862    }
    5963   
    60     //public Iterator<String> call(String json_file_in)
    61     public void call(String json_file_in) throws IOException
     64    //public void call(String json_file_in) throws IOException
     65    public Iterator<String> call(String json_file_in) throws IOException
     66   
    6267    {
    6368        if ((_whitelist_filename != null) && (_whitelist_bloomfilter == null)) {
     
    6570        }
    6671
     72        ArrayList<String> ids = null;
     73       
    6774        String full_json_file_in = _input_dir + "/" + json_file_in;
    6875        JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
     
    94101            }
    95102
    96             //ArrayList<String> ids = new ArrayList<String>(ef_num_pages);
     103            ids = new ArrayList<String>(ef_num_pages);
    97104            for (int i = 0; i < ef_page_count; i++) {
    98105                String formatted_i = String.format("page-%06d", i);
     
    104111
    105112                String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
    106                 //ids.add(output_json_bz2); // ****
     113                ids.add(page_id);
    107114
    108115                if (i==0) {
     
    164171        _progress_accum.add(_progress_step);
    165172       
    166         //return ids.iterator();
     173        return ids.iterator();
    167174    }
    168175}
Note: See TracChangeset for help on using the changeset viewer.