Context Navigation

← Previous Change
Next Change →

trunk

Timestamp:

2016-12-27T18:51:42+13:00 (7 years ago)

Author:

davidb

Message:

Rekindling of the per-volume approach. Also some tweaking of the verbosity levels used for debug printing in the per-page code.

Location:

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures

Files:

3 edited

PerPageJSONFlatmap.java (modified) (3 diffs)
PerVolumeJSON.java (modified) (7 diffs)
ProcessForSolrIngest.java (modified) (3 diffs)

Legend:

: Unmodified
: Added
: Removed

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONFlatmap.java

-              r31252
+              r31266
                 ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
+            }
             if (_verbosity >= 2) {
+            if (_verbosity >= 3) {
                 System.out.print("  Pages: ");
+            }
 …
                 String page_id = volume_id + "." + formatted_i;
                 if (_verbosity >= 2) {
+                if (_verbosity >= 3) {
                     if (i>0) {
                         System.out.print(", ");
 …
                 if (i==(ef_page_count-1)) {
                     if (_verbosity >= 2) {
+                    if (_verbosity >= 3) {
                         System.out.println();
+                    }
+                    System.out.println("Sample output JSON page file: " + output_json_bz2);
+                    if (_verbosity >= 2) {
+                        System.out.println("Sample output JSON page file: " + output_json_bz2);
+                    }
+                }

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java

-              r31252
+              r31266
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.VoidFunction;
 import org.apache.spark.util.DoubleAccumulator;
 …
+public class PerVolumeJSON implements VoidFunction<String>
+//public class PerVolumeJSON implements VoidFunction<String>
+public class PerVolumeJSON implements FlatMapFunction<String,String>
+{
     private static final long serialVersionUID = 1L;
 …
+    }
+    //public Iterator<String> call(String json_file_in)
+    public void call(String json_file_in) throws IOException
+    //public void call(String json_file_in) throws IOException
+    public Iterator<String> call(String json_file_in) throws IOException
+    {
         if ((_whitelist_filename != null) && (_whitelist_bloomfilter == null)) {
 …
+        }
+        ArrayList<String> ids = null;
         String full_json_file_in = _input_dir + "/" + json_file_in;
         JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
 …
+            }
             //ArrayList<String> ids = new ArrayList<String>(ef_num_pages);
+            ids = new ArrayList<String>(ef_num_pages);
             for (int i = 0; i < ef_page_count; i++) {
                 String formatted_i = String.format("page-%06d", i);
 …
                 String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
                 //ids.add(output_json_bz2); // ****
+                ids.add(page_id);
                 if (i==0) {
 …
         _progress_accum.add(_progress_step);
         //return ids.iterator();
+        return ids.iterator();
+    }
+}

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

-              r31252
+              r31266
     //   http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/
+    protected static final int DEFAULT_NUM_CORES = 6;
+    protected static final int DEFAULT_NUM_PARTITIONS = 3*DEFAULT_NUM_CORES;
+    //protected static final int DEFAULT_NUM_CORES = 6;
+    //protected static final int DEFAULT_NUM_PARTITIONS = 3*DEFAULT_NUM_CORES;
+    protected static final int DEFAULT_FILES_PER_PARTITION = 3000;
     protected String _input_dir;
 …
         SparkConf conf = new SparkConf().setAppName(spark_app_name);
         JavaSparkContext jsc = new JavaSparkContext(conf);
+        //int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
+        int files_per_partition = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_FILES_PER_PARTITION);
+        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename).cache();
+        long num_volumes = json_list_data.count();
+        double per_vol = 100.0/(double)num_volumes;
+        int num_partitions = (int)(num_volumes/files_per_partition)+1;
+        JavaRDD<String> json_list_data_rp = json_list_data.repartition(num_partitions);
+        DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("Progress Percent");
+        System.err.println();
+        System.err.println();
+        System.err.println();
+        System.err.println("****##### _input_dir =  " + _input_dir);
+        System.err.println();
+        System.err.println();
+        System.err.println();
+        boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
+        boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");
+        PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_whitelist_filename,
+                                                       _solr_url,_output_dir,_verbosity, progress_accum,per_vol,
+                                                       icu_tokenize,strict_file_io);
+        //json_list_data_rp.foreach(per_vol_json);
+        JavaRDD<String> per_page_ids = json_list_data_rp.flatMap(per_vol_json);
+        long num_page_ids = per_page_ids.count();
+        long num_ids = num_volumes;
+        System.out.println("");
+        System.out.println("############");
+        System.out.println("# Number of page ids: " + num_page_ids);
+        System.out.println("############");
+        System.out.println("");
+        jsc.close();
+    }
+    public void execPerPage()
+    {
+        String spark_app_name = generateSparkAppName("Per Page");
+        SparkConf conf = new SparkConf().setAppName(spark_app_name);
+        JavaSparkContext jsc = new JavaSparkContext(conf);
+        /*
         if (_verbosity >= 2) {
             System.out.println("Default Minimum Partions: " + jsc.defaultMinPartitions());
             System.out.println("Default Parallelism: " + jsc.defaultParallelism());
+        }
+        int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
+        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache();
+            */
+        //int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
+        int files_per_partition = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_FILES_PER_PARTITION);
+        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename).cache();
         long num_volumes = json_list_data.count();
         double per_vol = 100.0/(double)num_volumes;
+        //JavaRDD<String> json_list_data_rp = json_list_data.repartition((int)(num_volumes/100));
+        DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("Progress Percent");
+        System.err.println();
+        System.err.println();
+        System.err.println();
+        System.err.println("****##### _input_dir =  " + _input_dir);
+        System.err.println();
+        System.err.println();
+        System.err.println();
+        boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
+        boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");
+        PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_whitelist_filename,
+                                                       _solr_url,_output_dir,_verbosity, progress_accum,per_vol,
+                                                       icu_tokenize,strict_file_io);
+        json_list_data.foreach(per_vol_json);
+        long num_ids = num_volumes;
+        System.out.println("");
+        System.out.println("############");
+        System.out.println("# Number of volume ids: " + num_ids);
+        System.out.println("############");
+        System.out.println("");
+        jsc.close();
+    }
+    public void execPerPage()
+    {
+        String spark_app_name = generateSparkAppName("Per Page");
+        SparkConf conf = new SparkConf().setAppName(spark_app_name);
+        JavaSparkContext jsc = new JavaSparkContext(conf);
+        if (_verbosity >= 2) {
+            System.out.println("Default Minimum Partions: " + jsc.defaultMinPartitions());
+            System.out.println("Default Parallelism: " + jsc.defaultParallelism());
+        }
+        int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
+        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache();
+        long num_volumes = json_list_data.count();
+        double per_vol = 100.0/(double)num_volumes;
+        //JavaRDD<String> json_list_data_rp = json_list_data.repartition((int)(num_volumes/100));
+        int num_partitions = (int)(num_volumes/files_per_partition)+1;
+        JavaRDD<String> json_list_data_rp = json_list_data.repartition(num_partitions);
         DoubleAccumulator per_vol_progress_accum = jsc.sc().doubleAccumulator("Per Volume Progress Percent");
 …
                                      per_vol_progress_accum,per_vol,
                                      icu_tokenize,strict_file_io);
+        JavaRDD<JSONObject> per_page_jsonobjects = json_list_data.flatMap(paged_solr_json_flatmap).cache();
+        //JavaRDD<JSONObject> per_page_jsonobjects = json_list_data_rp.flatMap(paged_solr_json_flatmap).cache();
+        JavaRDD<JSONObject> per_page_jsonobjects = json_list_data_rp.flatMap(paged_solr_json_flatmap);
         //long num_page_ids = per_page_jsonobjects.count(); // trigger lazy eval of: flatmap:per-vol

Note: See TracChangeset for help on using the changeset viewer.