Context Navigation

← Previous Change
Next Change →

wcsa

Timestamp:

2017-01-31T00:06:39+13:00 (7 years ago)

Author:

davidb

Message:

Reworked to use sequenceFiles

Location:

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures

Files:

2 edited

PerVolumeJSON.java (modified) (9 diffs)
ProcessForSolrIngest.java (modified) (7 diffs)

Legend:

(leading space): Unmodified
+: Added
-: Removed

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java

-              r31278
+              r31372
 import java.util.Iterator;
+import org.apache.hadoop.io.Text;
 import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.VoidFunction;
 import org.apache.spark.util.DoubleAccumulator;
 …
 //public class PerVolumeJSON implements VoidFunction<String>
 public class PerVolumeJSON implements FlatMapFunction<String,String>
+public class PerVolumeJSON implements Function<Text,Integer>
+{
     private static final long serialVersionUID = 1L;
 …
     protected WhitelistBloomFilter _whitelist_bloomfilter;
+    protected DoubleAccumulator _progress_accum;
+    protected double            _progress_step;
      boolean _icu_tokenize;
 …
     public PerVolumeJSON(String input_dir, String whitelist_filename,
                          String solr_url, String output_dir, int verbosity,
-                         DoubleAccumulator progress_accum, double progress_step,
                          boolean icu_tokenize, boolean strict_file_io)
+    {
 …
         _verbosity  = verbosity;
-        _progress_accum = progress_accum;
-        _progress_step  = progress_step;
         _icu_tokenize   = icu_tokenize;
         _strict_file_io = strict_file_io;
 …
+    }
     //public void call(String json_file_in) throws IOException
     public Iterator<String> call(String json_file_in) throws IOException
+    public Integer call(Text json_text) throws IOException // Spark map function: turns one volume's JSON text into per-page Solr add documents
+    { // returns the length of the volume's "pages" array, or 0 if a non-strict failure happens before that length is read
         if ((_whitelist_filename != null) && (_whitelist_bloomfilter == null)) { // entered only while a whitelist filename is set and no bloom filter exists yet; body not shown in this view
 …
+        }
+        int ef_num_pages = 0; // stays 0 if an exception is swallowed before the pages array is read
+        try {
+            JSONObject extracted_feature_record  = new JSONObject(json_text.toString()); // parse the whole record text as one JSON object
+            if (extracted_feature_record != null) { // NOTE(review): a freshly constructed object is never null, so this test is always true
+                String volume_id = extracted_feature_record.getString("id");
+                //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
+                //String title= ef_metadata.getString("title");
+                JSONObject ef_features = extracted_feature_record.getJSONObject("features");
+                int ef_page_count = ef_features.getInt("pageCount"); // page count as declared in the record itself
+                if (_verbosity >= 1) {
+                    System.out.println("Processing: " + volume_id);
+                    System.out.println("  pageCount = " + ef_page_count);
+                }
+                JSONArray ef_pages = ef_features.getJSONArray("pages");
+                ef_num_pages = ef_pages.length(); // this array length, not pageCount, is the value returned to the caller
+                for (int i = 0; i < ef_page_count; i++) { // NOTE(review): bounded by the declared pageCount but indexes ef_pages; if the two disagree, pages are skipped or the lookup fails -- confirm they always match
+                    String formatted_i = String.format("page-%06d", i); // zero-based page index, zero-padded to six digits
+                    String page_id = volume_id + "." + formatted_i;
+                    if (_verbosity >= 2) {
+                        System.out.println("  Page: " + page_id);
+                    }
+                    JSONObject ef_page = ef_pages.getJSONObject(i);
+                    if (ef_page != null) { // NOTE(review): if getJSONObject throws rather than returning null for a missing entry, the else branch below is unreachable -- confirm against the JSON library in use
+                        // Convert to Solr add form
+                        JSONObject solr_add_doc_json
+                        = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _icu_tokenize);
+                        if ((_verbosity >=2) && (i==20)) { // print a single sample document (index 20) per volume instead of every page
+                            System.out.println("==================");
+                            System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
+                            System.out.println("==================");
+                        }
+                        if (_solr_url != null) { // posting is skipped entirely when no Solr URL was supplied
+                            if ((_verbosity >=2) && (i==20)) {
+                                System.out.println("==================");
+                                System.out.println("Posting to: " + _solr_url);
+                                System.out.println("==================");
+                            }
+                            SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json); // one post per page document
+                        }
+                    }
+                    else {
+                        System.err.println("Skipping: " + page_id);
+                    }
+                }
+            }
+        }
+        catch (Exception e) {
+            if (_strict_file_io) { // strict mode: propagate the failure to the caller
+                throw e;
+            }
+            else { // lenient mode: report the failure and fall through to return whatever count was recorded
+                e.printStackTrace();
+            }
+        }
+        return ef_num_pages;
+    }
+        /*
+    //public void call(String json_file_in) throws IOException
+    public Integer call(String json_file_in) throws IOException
+    {
+        if ((_whitelist_filename != null) && (_whitelist_bloomfilter == null)) {
+            _whitelist_bloomfilter = new WhitelistBloomFilter(_whitelist_filename,true);
+        }
+        int ef_num_pages = 0;
         ArrayList<String> ids = new ArrayList<String>(); // want it to be non-null so can return valid iterator
 …
             JSONArray ef_pages = ef_features.getJSONArray("pages");
             int ef_num_pages = ef_pages.length();
+            ef_num_pages = ef_pages.length();
             // Make directory for page-level JSON output
 …
+        }
+        //ids.add(volume_id);
+        _progress_accum.add(_progress_step);
+        return ids.iterator();
+        return ef_num_pages;
+    }
+    */
+}

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

-              r31277
+              r31372
 import org.apache.commons.cli.*;
+import org.apache.hadoop.io.Text;
 import org.apache.spark.api.java.*;
 import org.apache.spark.util.DoubleAccumulator;
 …
+    }
+    public void execPerVolumeSequenceFile()
+    {
+        String spark_app_name = generateSparkAppName("Per Volume");
+        SparkConf conf = new SparkConf().setAppName(spark_app_name);
+        JavaSparkContext jsc = new JavaSparkContext(conf);
+        jsc.hadoopConfiguration().set("io.compression.codec.bzip2.library", "java-builtin");
+        //String packed_sequence_path = "hdfs:///user/capitanu/data/packed-ef";
+        String packed_sequence_path = _json_list_filename;
+        JavaPairRDD<Text, Text> input_pair_rdd = jsc.sequenceFile(packed_sequence_path, Text.class, Text.class);
+        JavaRDD<Text> json_text_rdd = input_pair_rdd.map(item -> item._2);
+        boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
+        boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");
+        PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_whitelist_filename,
+                                                       _solr_url,_output_dir,_verbosity,
+                                                       icu_tokenize,strict_file_io);
+        JavaRDD<Integer> per_volume_page_count = json_text_rdd.map(per_vol_json);
+        Integer num_page_ids = per_volume_page_count.reduce((a, b) -> a + b);
+        System.out.println("");
+        System.out.println("############");
+        System.out.println("# Number of page ids: " + num_page_ids);
+        System.out.println("############");
+        System.out.println("");
+        jsc.close();
+    }
+    /*
     public void execPerVolume()
+    {
 …
         jsc.close();
+    }
+    */
+    /*
     public void execPerPage()
+    {
 …
         JavaSparkContext jsc = new JavaSparkContext(conf);
+        /*
+        if (_verbosity >= 2) {
+            System.out.println("Default Minimum Partions: " + jsc.defaultMinPartitions());
+            System.out.println("Default Parallelism: " + jsc.defaultParallelism());
+        }
+            */
         //int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
 …
         JavaRDD<String> per_page_ids = per_page_jsonobjects.map(paged_json_id_map);
+/*
+        System.out.println("");
+        System.out.println("############");
+        System.out.println("# Progress Accumulator: " + progress_accum.value());
+        System.out.println("############");
+        System.out.println("");
+        long num_page_ids = per_page_ids.count(); // trigger lazy eval of: flatmap:per-vol -> map:per-page
+        System.out.println("");
+        System.out.println("############");
+        System.out.println("# Number of page ids: " + num_page_ids);
+        System.out.println("############");
+        System.out.println("");
+        //if (_output_dir != null) {
+            //String rdd_save_file = "rdd-solr-json-page-files";
+            //json_ids.saveAsTextFile(rdd_save_file);
+            //System.out.println("############");
+            //System.out.println("# Saved RDD of Solr JSON page files, top-level, as:");
+            //System.out.println("#  " + rdd_save_file);
+            //System.out.println("############");
+            //System.out.println("");
+        //}
+        jsc.close();
+    }
 */
-        long num_page_ids = per_page_ids.count(); // trigger lazy eval of: flatmap:per-vol -> map:per-page
-        System.out.println("");
-        System.out.println("############");
-        System.out.println("# Number of page ids: " + num_page_ids);
-        System.out.println("############");
-        System.out.println("");
-        /*
-        if (_output_dir != null) {
-            String rdd_save_file = "rdd-solr-json-page-files";
-            json_ids.saveAsTextFile(rdd_save_file);
-            System.out.println("############");
-            System.out.println("# Saved RDD of Solr JSON page files, top-level, as:");
-            System.out.println("#  " + rdd_save_file);
-            System.out.println("############");
-            System.out.println("");
+        }
-        */
-        jsc.close();
+    }
 …
             = new ProcessForSolrIngest(input_dir,json_list_filename,solr_url,output_dir,verbosity);
+        prep_for_ingest.execPerVolumeSequenceFile();
+        /*
         String process_ef_json_mode = System.getProperty("wcsa-ef-ingest.process-ef-json-mode","per-page");
         if (process_ef_json_mode.equals("per-volume")) {
 …
         else {
             prep_for_ingest.execPerPage();
+        }
+        }*/
+    }
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 31372 for other-projects/hathitrust/wcsa

Legend:

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

Download in other formats: