Context Navigation

← Previous Change
Next Change →

Changeset 30945 for other-projects/hathitrust

Timestamp:

2016-10-26T15:37:24+13:00 (7 years ago)

Author:

davidb

Message:

Getting closer to writing out JSON files

Location:

other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust

Files:

3 edited

ClusterFileIO.java (modified) (7 diffs)
PagedJSON.java (modified) (5 diffs)
PrepareForIngest.java (modified) (7 diffs)

Legend:

Unmodified
Added
Removed

other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/ClusterFileIO.java

-              r30941
+              r30945
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.spark.api.java.JavaSparkContext;
 public class ClusterFileIO {
 …
     */
     public static FileSystem getFileSystemInstance(String input_dir)
+    protected static FileSystem getFileSystemInstance(String input_dir)
+    {
         if (__fs == null) {
 …
+    }
     protected static boolean exists(String file)
+    public static boolean exists(String file)
+    {
         FileSystem fs = getFileSystemInstance(file);
 …
         return exists;
+    }
+    protected static BufferedInputStream getBufferedInputStream(String fileIn)
+    public static String removeSuffix(String file,String suffix)
+    {
+        return file.substring(0,file.length() - suffix.length());
+    }
+    public static boolean createDirectoryAll(String dir)
+    {
+        FileSystem fs = getFileSystemInstance(dir);
+        boolean created_dir = false;
+        if (!exists(dir)) {
+            try {
+                URI uri = new URI(dir);
+                Path path = new Path(uri);
+                fs.mkdirs(path);
+                created_dir = true;
+            } catch (URISyntaxException e) {
+                e.printStackTrace();
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
+        }
+        return created_dir;
+    }
+    public static BufferedInputStream getBufferedInputStream(String fileIn)
             throws IOException
+    {
 …
+    }
     protected static BufferedOutputStream getBufferedOutputStream(String fileOut)
+    public static BufferedOutputStream getBufferedOutputStream(String fileOut)
             throws IOException
+    {
 …
+    }
     protected static BufferedReader getBufferedReaderForCompressedFile(String fileIn)
+    public static BufferedReader getBufferedReaderForCompressedFile(String fileIn)
             throws IOException, CompressorException
+    {
 …
+    }
     protected static BufferedWriter getBufferedWriterForCompressedFile(String fileOut)
+    public static BufferedWriter getBufferedWriterForCompressedFile(String fileOut)
             throws IOException, CompressorException
+    {

other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java

-              r30942
+              r30945
     protected String _input_dir;
+    protected String _output_dir;
     protected int    _verbosity;
     public PagedJSON(String input_dir, int verbosity)
+    public PagedJSON(String input_dir, String output_dir, int verbosity)
+    {
+        _input_dir = input_dir;
+        _verbosity = verbosity;
+        _input_dir  = input_dir;
+        _output_dir = output_dir;
+        _verbosity  = verbosity;
+    }
 …
             while ((str = br.readLine()) != null) {
                 sb.append(str);
-                //System.out.println(str);
+            }
             br.close();
-            //System.err.println("*****" + sb.toString());
-            /*
-            List<String> lines = Files.readAllLines(path,StandardCharsets.UTF_8);
-            for (String line : lines) {
-                sb.append(line);
+            }
-            */
+        }
         catch (Exception e) {
 …
         JSONObject json_obj = new JSONObject(sb.toString());
         return json_obj;
-        //return sb.toString();
+    }
 …
         String id = extracted_feature_record.getString("id");
         JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
+        //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
         JSONObject ef_features = extracted_feature_record.getJSONObject("features");
 …
         int ef_num_pages = ef_pages.length();
+        // Make directory for page-level JSON output
+        String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2");
+        String page_json_dir = json_dir + "/pages";
+        //ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
+        System.out.println("mkdir: " + _output_dir + "/" + page_json_dir);
         ArrayList<String> ids = new ArrayList<String>(ef_num_pages);
         for (int i = 0; i < ef_page_count; i++) {
+            ids.add(id + "." + i);
+            String formatted_i = String.format("page-%06d", i);
+            String page_id = id + "." + formatted_i;
+            if (_verbosity >= 2) {
+              System.out.println("  Page: " + page_id);
+            }
+            // create JSON obj of just the page (for now)
+            // write it out
+            ids.add(page_json_dir +"/" + page_id + ".json.bz2");
+            if (i==0) {
+                System.out.println("Sample output JSON page file: " + page_json_dir +"/" + page_id + ".json.bz2");
+            }
+        }

other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PrepareForIngest.java

-              r30944
+              r30945
 package org.hathitrust;
-import java.io.IOException;
 import java.io.Serializable;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.List;
 import org.apache.commons.cli.*;
 import org.apache.spark.api.java.*;
 import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.function.Function;
 public class PrepareForIngest implements Serializable
 …
     private static final long serialVersionUID = 1L;
+    public static final int NUM_PARTITIONS = 6; // default would appear to be 2
     protected String _input_dir;
     protected String _json_list_filename;
 …
         SparkConf conf = new SparkConf().setAppName(spark_app_name);
         JavaSparkContext jsc = new JavaSparkContext(conf);
-        //ClusterFileIO.init(_input_dir);
+        // Check output directory exists, and create it if not
+        if (_verbosity >= 1) {
+        if (_verbosity >= 2) {
             System.out.println("Default Minimum Partions: " + jsc.defaultMinPartitions());
             System.out.println("Default Parallelism: " + jsc.defaultParallelism());
+        }
         JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,6).cache();
+        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache();
+        JavaRDD<String> json_ids = json_list_data.flatMap(new PagedJSON(_input_dir,_verbosity));
+        PagedJSON paged_json = new PagedJSON(_input_dir,_output_dir,_verbosity);
+        JavaRDD<String> json_ids = json_list_data.flatMap(paged_json).cache();
+        json_ids.saveAsTextFile("foo");
-        //long numAs = json_list_data.filter(new ContainsA()).count();
-        /*
-        long numBs = json_list_data.filter(new Function<String, Boolean>() {
-            public Boolean call(String s) { return s.contains("b"); }
-        }).count();
-        System.out.println("#### Lines with a: " + numAs + ", lines with b: " + numBs);
-         */
         long num_ids = json_ids.count();
         System.out.println("");
 …
         //.withType(Integer.class)
+/*
         options.addOption(OptionBuilder.withLongOpt("verbosity")
                 .withDescription("Set to control the level of debugging output [0=none, 1=some, 2=lots]")
 …
                 .isRequired(false)
                 .create());
+*/
         //Option num_cores_opt = new Option("n", "num-cores", true, "Number of cores to use");
         //num_cores_opt.setRequired(false);
         //options.addOption(num_cores_opt);
+        Option verbosity_opt = new Option("v", "verbosity", true,
+                "Set to control the level of debugging output [0=none, 1=some, 2=lots]");
+        verbosity_opt.setRequired(false);
+        options.addOption(verbosity_opt);
         //CommandLineParser parser = new DefaultParser(); // 1.3 and above
+        CommandLineParser parser = new GnuParser();
+        // need to work with CLI v1.2 as this is the JAR that is bundled with Hadoop/Spark
+        CommandLineParser parser = new GnuParser();
         HelpFormatter formatter = new HelpFormatter();
         CommandLine cmd;
 …
         //cmd.hasOption("json-filelist")
         String verbosity_str = cmd.getOptionValue("verbosity","0");
         int verbosity = Integer.parseInt(verbosity_str);
-        //System.out.println(inputFilePath);
-        //System.out.println(outputFilePath);
         String[] filtered_args = cmd.getArgs();
 …
         String output_dir = filtered_args[2];
-        //String json_list_filename = cmd.getArgs()[0]; // args[0];
-        //String json_list_filename = args[0];
-        //int num_cores = 2;
         PrepareForIngest prep_for_ingest = new PrepareForIngest(input_dir,json_list_filename,output_dir,verbosity);
         prep_for_ingest.exec();

Note: See TracChangeset for help on using the changeset viewer.