Context Navigation

← Previous Change
Next Change →

Changeset 31045 for other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest

Timestamp:

2016-11-02T21:34:47+13:00 (7 years ago)

Author:

davidb

Message:

More careful treatment of what to do when a JSON file isn't there

Location:

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures

Files:

4 edited

JSONClusterFileIO.java (modified) (2 diffs)
PerPageJSONFlatmap.java (modified) (3 diffs)
PerPageJSONMap.java (modified) (2 diffs)
ProcessForSolrIngest.java (modified) (4 diffs)

Legend:

Unmodified
Added
Removed

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/JSONClusterFileIO.java

-              r30996
+              r31045
     protected static JSONObject readJSONFile(String filename)
+    {
         StringBuilder sb = new StringBuilder();
+        JSONObject json_obj = null;
         try {
+            StringBuilder sb = new StringBuilder();
             String str;
             BufferedReader br = ClusterFileIO.getBufferedReaderForCompressedFile(filename);
 …
             br.close();
+            json_obj = new JSONObject(sb.toString());
+        }
         catch (Exception e) {
             e.printStackTrace();
+        }
-        JSONObject json_obj = new JSONObject(sb.toString());
         return json_obj;

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONFlatmap.java

-              r31030
+              r31045
 package org.hathitrust.extractedfeatures;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
 …
     protected double            _progress_step;
+    boolean _strict_file_io;
     public PerPageJSONFlatmap(String input_dir, String solr_url, String output_dir, int verbosity,
+                     DoubleAccumulator progress_accum, double progress_step)
+                              DoubleAccumulator progress_accum, double progress_step,
+                              boolean strict_file_io)
+    {
         _input_dir  = input_dir;
 …
         _progress_accum = progress_accum;
         _progress_step  = progress_step;
+        _strict_file_io = strict_file_io;
+    }
     public Iterator<JSONObject> call(String json_file_in)
+    public Iterator<JSONObject> call(String json_file_in) throws IOException
     //public void call(String json_file_in)
+    {
+        JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(_input_dir + "/" + json_file_in);
+        String full_json_file_in = _input_dir + "/" + json_file_in;
+        JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
         String volume_id = extracted_feature_record.getString("id");
+        ArrayList<JSONObject> json_pages = new ArrayList<JSONObject>();
+        //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
+        //String title= ef_metadata.getString("title");
+        JSONObject ef_features = extracted_feature_record.getJSONObject("features");
+        int ef_page_count = ef_features.getInt("pageCount");
+        if (_verbosity >= 1) {
+            System.out.println("Processing: " + json_file_in);
+            System.out.println("  pageCount = " + ef_page_count);
+        if (extracted_feature_record != null) {
+            String volume_id = extracted_feature_record.getString("id");
+            //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
+            //String title= ef_metadata.getString("title");
+            JSONObject ef_features = extracted_feature_record.getJSONObject("features");
+            int ef_page_count = ef_features.getInt("pageCount");
+            if (_verbosity >= 1) {
+                System.out.println("Processing: " + json_file_in);
+                System.out.println("  pageCount = " + ef_page_count);
+            }
+            JSONArray ef_pages = ef_features.getJSONArray("pages");
+            int ef_num_pages = ef_pages.length();
+            if (ef_num_pages != ef_page_count) {
+                System.err.println("Warning: number of page elements in JSON (" + ef_num_pages + ")"
+                        +" does not match 'pageCount' metadata (" + ef_page_count + ")");
+            }
+            // Make directory for page-level JSON output
+            String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2");
+            String page_json_dir = json_dir + "/pages";
+            ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
+            if (_verbosity >= 2) {
+                System.out.print("  Pages: ");
+            }
+            for (int i = 0; i < ef_page_count; i++) {
+                String formatted_i = String.format("page-%06d", i);
+                String page_id = volume_id + "." + formatted_i;
+                if (_verbosity >= 2) {
+                    if (i>0) {
+                        System.out.print(", ");
+                    }
+                    System.out.print(page_id);
+                }
+                String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
+                if (i==(ef_page_count-1)) {
+                    if (_verbosity >= 2) {
+                        System.out.println();
+                    }
+                    System.out.println("Sample output JSON page file: " + output_json_bz2);
+                }
+                JSONObject ef_page = ef_pages.getJSONObject(i);
+                if (ef_page != null) {
+                    // Convert to Solr add form
+                    JSONObject solr_add_doc_json = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page);
+                    solr_add_doc_json.put("filename_json_bz2", output_json_bz2);
+                    json_pages.add(solr_add_doc_json);
+                }
+                else {
+                    System.err.println("Skipping: " + page_id);
+                }
+            }
+        }
+        JSONArray ef_pages = ef_features.getJSONArray("pages");
+        int ef_num_pages = ef_pages.length();
+        // Make directory for page-level JSON output
+        String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2");
+        String page_json_dir = json_dir + "/pages";
+        ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
+        if (_verbosity >= 2) {
+              System.out.print("  Pages: ");
+            }
+        ArrayList<JSONObject> json_pages = new ArrayList<JSONObject>(ef_num_pages);
+        for (int i = 0; i < ef_page_count; i++) {
+            String formatted_i = String.format("page-%06d", i);
+            String page_id = volume_id + "." + formatted_i;
+            if (_verbosity >= 2) {
+              if (i>0) {
+                  System.out.print(", ");
+              }
+              System.out.print(page_id);
+            }
+            String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
+            //ids.add(output_json_bz2);
+            if (i==(ef_page_count-1)) {
+                if (_verbosity >= 2) {
+                    System.out.println();
+                }
+                System.out.println("Sample output JSON page file: " + output_json_bz2);
+            }
+            JSONObject ef_page = ef_pages.getJSONObject(i);
+            if (ef_page != null) {
+                // Convert to Solr add form
+                JSONObject solr_add_doc_json = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page);
+                solr_add_doc_json.put("filename_json_bz2", output_json_bz2);
+                json_pages.add(solr_add_doc_json);
+        else {
+            // File did not exist, or could not be parsed
+            String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
+            if (_strict_file_io) {
+                throw new IOException(mess);
+            }
             else {
+                System.err.println("Skipping: " + page_id);
+                System.err.println("Warning: " + mess);
+                System.out.println("Warning: " + mess);
+            }
+        }
+        //ids.add(volume_id);
         _progress_accum.add(_progress_step);
-        //return ids.iterator();
         return json_pages.iterator();
+    }

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONMap.java

-              r31028
+              r31045
     protected long            _progress_step;
     public PerPageJSONMap(String input_dir, ArrayList<String> solr_endpoints, String output_dir, int verbosity,
                           LongAccumulator progress_accum, long progress_step)
 …
         _progress_accum = progress_accum;
         _progress_step  = progress_step;
+    }

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

-              r31041
+              r31045
+    }
-    public void execPerVolume()
+    {
-        String spark_app_name = generateSparkAppName("Per Volume");
-        SparkConf conf = new SparkConf().setAppName(spark_app_name);
-        JavaSparkContext jsc = new JavaSparkContext(conf);
-        if (_verbosity >= 2) {
-            System.out.println("Default Minimum Partions: " + jsc.defaultMinPartitions());
-            System.out.println("Default Parallelism: " + jsc.defaultParallelism());
+        }
-        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache();
-        long num_volumes = json_list_data.count();
-        double per_vol = 100.0/(double)num_volumes;
-        DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("Progress Percent");
-        System.err.println();
-        System.err.println();
-        System.err.println();
-        System.err.println("****##### _input_dir =  " + _input_dir);
-        System.err.println();
-        System.err.println();
-        System.err.println();
-        PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);
-        json_list_data.foreach(per_vol_json);
-        long num_ids = num_volumes;
-        System.out.println("");
-        System.out.println("############");
-        System.out.println("# Number of volume ids: " + num_ids);
-        System.out.println("############");
-        System.out.println("");
-        jsc.close();
+    }
     public ArrayList<String> extrapolateSolrEndpoints()
+    {
 …
+    }
+    public void execPerVolume()  // Per-volume Spark job: hands every entry of the JSON list file to a PerVolumeJSON via foreach, then reports the volume count.
+    {
+        String spark_app_name = generateSparkAppName("Per Volume");  // helper defined elsewhere in this class (not shown in this view)
+        SparkConf conf = new SparkConf().setAppName(spark_app_name);
+        JavaSparkContext jsc = new JavaSparkContext(conf);
+        if (_verbosity >= 2) {  // diagnostics are printed only at verbosity 2 and above
+            System.out.println("Default Minimum Partions: " + jsc.defaultMinPartitions());
+            System.out.println("Default Parallelism: " + jsc.defaultParallelism());
+        }
+        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache();  // cached because it is consumed twice below: count() and foreach()
+        long num_volumes = json_list_data.count();
+        double per_vol = 100.0/(double)num_volumes;  // percentage share of one volume; NOTE(review): evaluates to Infinity when the list is empty (num_volumes == 0)
+        DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("Progress Percent");
+        System.err.println();
+        System.err.println();
+        System.err.println();
+        System.err.println("****##### _input_dir =  " + _input_dir);  // debug banner on stderr, padded by three blank lines on either side
+        System.err.println();
+        System.err.println();
+        System.err.println();
+        PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);  // presumably adds per_vol to progress_accum once per volume -- PerVolumeJSON is not shown here, confirm
+        json_list_data.foreach(per_vol_json);
+        long num_ids = num_volumes;  // no separate id count is computed; the volume count is reported unchanged
+        System.out.println("");
+        System.out.println("############");
+        System.out.println("# Number of volume ids: " + num_ids);
+        System.out.println("############");
+        System.out.println("");
+        jsc.close();  // closes the Spark context created at the top of this method
+    }
     public void execPerPage()
+    {
 …
         DoubleAccumulator per_vol_progress_accum = jsc.sc().doubleAccumulator("Per Volume Progress Percent");
+        //String strict_file_io_str = System.getProperty("wcsa-ef-ingest.strict-file-io","true");
+        boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");
         PerPageJSONFlatmap paged_solr_json_flatmap
+            = new PerPageJSONFlatmap(_input_dir,_solr_url,_output_dir,_verbosity, per_vol_progress_accum,per_vol);
+            = new PerPageJSONFlatmap(_input_dir,_solr_url,_output_dir,_verbosity,
+                                     per_vol_progress_accum,per_vol,
+                                     strict_file_io);
         JavaRDD<JSONObject> per_page_jsonobjects = json_list_data.flatMap(paged_solr_json_flatmap).cache();
 …
         ArrayList<String> solr_endpoints = extrapolateSolrEndpoints();
         PerPageJSONMap paged_json_id_map
+            = new PerPageJSONMap(_input_dir,solr_endpoints,_output_dir,_verbosity, per_page_progress_accum,1);
+            = new PerPageJSONMap(_input_dir,solr_endpoints,_output_dir,_verbosity,
+                                 per_page_progress_accum,1);
         JavaRDD<String> per_page_ids = per_page_jsonobjects.map(paged_json_id_map);

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: