Context Navigation

← Previous Changeset
Next Changeset →

Changeset 30985

Timestamp:

2016-10-29T16:17:22+13:00 (8 years ago)

Author:

davidb

Message:

Changed to run the main processing method as a Spark action (foreach) rather than a transformation (flatMap). Done to help the accumulator's add() calls work as intended.

Location:

other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust

Files:

: 2 edited

PagedJSON.java (modified) (6 diffs)
PrepareForIngest.java (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java

-              r30984
+              r30985
 import org.apache.commons.compress.compressors.CompressorException;
 import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.api.java.function.VoidFunction;
 import org.apache.spark.util.DoubleAccumulator;
 import org.json.JSONArray;
 …
-class PagedJSON implements FlatMapFunction<String, String>
+//class PagedJSON implements FlatMapFunction<String, String>
+class PagedJSON implements VoidFunction<String>
+{
     private static final long serialVersionUID = 1L;
 …
             String decodedString;
             while ((decodedString = in.readLine()) != null) {
-                //System.out.println(decodedString);
                 sb.append(decodedString);
+            }
 …
+    }
-    public Iterator<String> call(String json_file_in)
+    //public Iterator<String> call(String json_file_in)
+    public void call(String json_file_in)
+    {
         JSONObject extracted_feature_record = readJSONFile(json_file_in);
 …
                     System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
                     System.out.println("==================");
-                    //System.out.println("Sample text [page 20]: " + solr_add_doc_json.getString("_text_"));
+                }
-                // create JSON obj of just the page (for now), and write it out
-                // write out the JSONOBject as a bz2 compressed file
-                /*
-                try {
-                    BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(_output_dir + "/" + output_json_bz2);
-                    bw.write(ef_page.toString());
-                    bw.close();
-                } catch (IOException e) {
-                    e.printStackTrace();
-                } catch (CompressorException e) {
-                    e.printStackTrace();
+                }
-                */
                 if (_solr_url != null) {
 …
+        }
-        /*
-        for (int i = 0; i < ef_num_pages; i++)
+        {
-            //String post_id = ef_pages.getJSONObject(i).getString("post_id");
-            //......
+        }
-        */
-        //String pageName = json_obj.getJSONObject("pageInfo").getString("pageName");
-/*
-        JSONArray arr = obj.getJSONArray("posts");
-        for (int i = 0; i < arr.length(); i++)
+        {
-            String post_id = arr.getJSONObject(i).getString("post_id");
-            ......
+        }
-*/
         ids.add(volume_id);
         _progress_accum.add(_progress_step);
-        return ids.iterator();
+        //return ids.iterator();
+    }
+}

other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PrepareForIngest.java

-              r30984
+              r30985
         DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("ProgressPercent");
-        //sc.parallelize(Arrays.asList(1, 2, 3, 4)).foreach(x -> accum.add(x));
-        // ...
-        // 10/09/29 18:41:08 INFO SparkContext: Tasks finished in 0.317106 s
-        //accum.value();
         PagedJSON paged_json = new PagedJSON(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);
-        JavaRDD<String> json_ids = json_list_data.flatMap(paged_json).cache();
+        //JavaRDD<String> json_ids = json_list_data.flatMap(paged_json).cache();
-        long num_ids = json_ids.count();
+        json_list_data.foreach(paged_json);
+        //long num_ids = json_ids.count();
+        long num_ids = num_volumes;
         System.out.println("");
         System.out.println("############");
 …
         System.out.println("");
+        /*
         if (_output_dir != null) {
             String rdd_save_file = "rdd-solr-json-page-files";
 …
             System.out.println("");
+        }
+        */
         jsc.close();
 …
         Options options = new Options();
-        //.withType(Integer.class)
-/*
-        options.addOption(OptionBuilder.withLongOpt("verbosity")
-                .withDescription("Set to control the level of debugging output [0=none, 1=some, 2=lots]")
-                .hasArg()
-                .withArgName("v")
-                .isRequired(false)
-                .create());
-*/
-        //Option num_cores_opt = new Option("n", "num-cores", true, "Number of cores to use");
-        //num_cores_opt.setRequired(false);
-        //options.addOption(num_cores_opt);
         Option verbosity_opt = new Option("v", "verbosity", true,
 …
             print_usage(formatter,options);
             System.exit(1);
-            //return;  // prevents 'cmd may not be assigned' compiler error in Eclipse
+        }
-        //value = ((Integer)cmdLine.getParsedOptionValue("num-cores")).intValue();
-        //value = ((Integer)cmdLine.getOptionValue("num-cores","2")).intValue();
-        //cmd.hasOption("json-filelist")
         String verbosity_str = cmd.getOptionValue("verbosity","0");
 …
         String input_dir  = filtered_args[0];
         String json_list_filename = filtered_args[1];
+        //String output_dir = filtered_args[2];
         PrepareForIngest prep_for_ingest
             = new PrepareForIngest(input_dir,json_list_filename,solr_url,output_dir,verbosity);

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 30985

Legend:

other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java

other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PrepareForIngest.java

Download in other formats: