Context Navigation

← Previous Change
Next Change →

Changeset 31252 for other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java

Timestamp:

2016-12-20T14:15:05+13:00 (7 years ago)

Author:

davidb

Message:

Support for icu-tokenize property added, plus relevant refactoring.

Location:

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures

Files:

6 edited

PerPageJSONFlatmap.java (modified) (4 diffs)
PerVolumeJSON.java (modified) (4 diffs)
PerVolumeWordStreamFlatmap.java (modified) (3 diffs)
ProcessForSolrIngest.java (modified) (2 diffs)
ProcessForWhitelist.java (modified) (1 diff)
SolrDocJSON.java (modified) (7 diffs)

Legend:

(no marker): Unmodified
+: Added
-: Removed

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONFlatmap.java

-              r31226
+              r31252
     protected double            _progress_step;
+    boolean _icu_tokenize;
     boolean _strict_file_io;
 …
                               String solr_url, String output_dir, int verbosity,
                               DoubleAccumulator progress_accum, double progress_step,
                               boolean strict_file_io)
+                              boolean icu_tokenize, boolean strict_file_io)
+    {
         _input_dir  = input_dir;
 …
         _progress_step  = progress_step;
+        _icu_tokenize   = icu_tokenize;
         _strict_file_io = strict_file_io;
 …
                     // Convert to Solr add form
                     JSONObject solr_add_doc_json
                         = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter);
+                        = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter,_icu_tokenize);
                     solr_add_doc_json.put("filename_json_bz2", output_json_bz2);

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java

-              r31226
+              r31252
 package org.hathitrust.extractedfeatures;
+import java.io.IOException;
 import org.apache.spark.api.java.function.VoidFunction;
 …
     protected double            _progress_step;
+     boolean _icu_tokenize;
+     boolean _strict_file_io;
     public PerVolumeJSON(String input_dir, String whitelist_filename,
                          String solr_url, String output_dir, int verbosity,
+                         DoubleAccumulator progress_accum, double progress_step)
+                         DoubleAccumulator progress_accum, double progress_step,
+                         boolean icu_tokenize, boolean strict_file_io)
+    {
         _input_dir  = input_dir;
 …
         _progress_step  = progress_step;
+        _icu_tokenize   = icu_tokenize;
+        _strict_file_io = strict_file_io;
         _whitelist_bloomfilter = null;
+    }
     //public Iterator<String> call(String json_file_in)
     public void call(String json_file_in)
+    public void call(String json_file_in) throws IOException
+    {
         if ((_whitelist_filename != null) && (_whitelist_bloomfilter == null)) {
 …
+        }
+        JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(_input_dir + "/" + json_file_in);
+        String full_json_file_in = _input_dir + "/" + json_file_in;
+        JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
+        String volume_id = extracted_feature_record.getString("id");
+        //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
+        //String title= ef_metadata.getString("title");
+        JSONObject ef_features = extracted_feature_record.getJSONObject("features");
+        int ef_page_count = ef_features.getInt("pageCount");
+        if (_verbosity >= 1) {
+            System.out.println("Processing: " + json_file_in);
+            System.out.println("  pageCount = " + ef_page_count);
+        }
+        JSONArray ef_pages = ef_features.getJSONArray("pages");
+        int ef_num_pages = ef_pages.length();
+        // Make directory for page-level JSON output
+        String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2");
+        String page_json_dir = json_dir + "/pages";
+        if (_output_dir != null) {
+            ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
+        }
+        //ArrayList<String> ids = new ArrayList<String>(ef_num_pages);
+        for (int i = 0; i < ef_page_count; i++) {
+            String formatted_i = String.format("page-%06d", i);
+            String page_id = volume_id + "." + formatted_i;
+            if (_verbosity >= 2) {
+              System.out.println("  Page: " + page_id);
+        if (extracted_feature_record != null) {
+            String volume_id = extracted_feature_record.getString("id");
+            //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
+            //String title= ef_metadata.getString("title");
+            JSONObject ef_features = extracted_feature_record.getJSONObject("features");
+            int ef_page_count = ef_features.getInt("pageCount");
+            if (_verbosity >= 1) {
+                System.out.println("Processing: " + json_file_in);
+                System.out.println("  pageCount = " + ef_page_count);
+            }
+            String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
+            //ids.add(output_json_bz2); // ****
+            if (i==0) {
+                System.out.println("Sample output JSON page file: " + output_json_bz2);
+            JSONArray ef_pages = ef_features.getJSONArray("pages");
+            int ef_num_pages = ef_pages.length();
+            // Make directory for page-level JSON output
+            String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2");
+            String page_json_dir = json_dir + "/pages";
+            if (_output_dir != null) {
+                ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
+            }
-            JSONObject ef_page = ef_pages.getJSONObject(i);
             if (ef_page != null) {
                 // Convert to Solr add form
                 JSONObject solr_add_doc_json
                     = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter);
+            //ArrayList<String> ids = new ArrayList<String>(ef_num_pages);
+            for (int i = 0; i < ef_page_count; i++) {
+                String formatted_i = String.format("page-%06d", i);
+                String page_id = volume_id + "." + formatted_i;
+                if ((_verbosity >=2) && (i==20)) {
+                    System.out.println("==================");
+                    System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
+                    System.out.println("==================");
+                if (_verbosity >= 2) {
+                    System.out.println("  Page: " + page_id);
+                }
+                if (_solr_url != null) {
+                String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
+                //ids.add(output_json_bz2); // ****
+                if (i==0) {
+                    System.out.println("Sample output JSON page file: " + output_json_bz2);
+                }
+                JSONObject ef_page = ef_pages.getJSONObject(i);
+                if (ef_page != null) {
+                    // Convert to Solr add form
+                    JSONObject solr_add_doc_json
+                    = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _icu_tokenize);
                     if ((_verbosity >=2) && (i==20)) {
                         System.out.println("==================");
                         System.out.println("Posting to: " + _solr_url);
+                        System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
                         System.out.println("==================");
+                    }
+                    SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json);
+                    if (_solr_url != null) {
+                        if ((_verbosity >=2) && (i==20)) {
+                            System.out.println("==================");
+                            System.out.println("Posting to: " + _solr_url);
+                            System.out.println("==================");
+                        }
+                        SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json);
+                    }
+                    if (_output_dir != null) {
+                        if ((_verbosity >=2) && (i==20)) {
+                            System.out.println("==================");
+                            System.out.println("Saving to: " + _output_dir);
+                            System.out.println("==================");
+                        }
+                        SolrDocJSON.saveSolrDoc(solr_add_doc_json, _output_dir + "/" + output_json_bz2);
+                    }
+                }
+                else {
+                    System.err.println("Skipping: " + page_id);
+                }
+                if (_output_dir != null) {
+                    if ((_verbosity >=2) && (i==20)) {
+                        System.out.println("==================");
+                        System.out.println("Saving to: " + _output_dir);
+                        System.out.println("==================");
+                    }
+                    SolrDocJSON.saveSolrDoc(solr_add_doc_json, _output_dir + "/" + output_json_bz2);
+                }
+            }
+        }
+        else {
+            // File did not exist, or could not be parsed
+            String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
+            if (_strict_file_io) {
+                throw new IOException(mess);
+            }
             else {
+                System.err.println("Skipping: " + page_id);
+                System.err.println("Warning: " + mess);
+                System.out.println("Warning: " + mess);
+            }
+        }
         //ids.add(volume_id);

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeWordStreamFlatmap.java

-              r31242
+              r31252
     protected double            _progress_step;
+    boolean _icu_tokenize;
     boolean _strict_file_io;
     public PerVolumeWordStreamFlatmap(String input_dir, int verbosity,
                               DoubleAccumulator progress_accum, double progress_step,
+                              boolean icu_tokenize,
                               boolean strict_file_io)
+    {
 …
         _progress_step  = progress_step;
+        _icu_tokenize   = icu_tokenize;
         _strict_file_io = strict_file_io;
+    }
 …
                 if (ef_page != null) {
                     ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountText(volume_id, page_id, ef_page);
+                    ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountText(volume_id, page_id, ef_page, _icu_tokenize);
                     all_word_list.addAll(page_word_list);
+                }

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

-              r31220
+              r31252
         System.err.println();
+        boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
+        boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");
         PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_whitelist_filename,
+                                                       _solr_url,_output_dir,_verbosity, progress_accum,per_vol);
+                                                       _solr_url,_output_dir,_verbosity, progress_accum,per_vol,
+                                                       icu_tokenize,strict_file_io);
         json_list_data.foreach(per_vol_json);
 …
         DoubleAccumulator per_vol_progress_accum = jsc.sc().doubleAccumulator("Per Volume Progress Percent");
         //String strict_file_io_str = System.getProperty("wcsa-ef-ingest.strict-file-io","true");
+        boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
         boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");
         PerPageJSONFlatmap paged_solr_json_flatmap
             = new PerPageJSONFlatmap(_input_dir,_whitelist_filename,
                                      _solr_url,_output_dir,_verbosity,
                                      per_vol_progress_accum,per_vol,
                                      strict_file_io);
+                                     icu_tokenize,strict_file_io);
         JavaRDD<JSONObject> per_page_jsonobjects = json_list_data.flatMap(paged_solr_json_flatmap).cache();

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForWhitelist.java

-              r31251
+              r31252
         boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");
+        boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
         PerVolumeWordStreamFlatmap paged_solr_wordfreq_flatmap
             = new PerVolumeWordStreamFlatmap(_input_dir,_verbosity,
                                      per_vol_progress_accum,per_vol,
+                                     icu_tokenize,
                                      strict_file_io);
         JavaRDD<String> words = json_list_data.flatMap(paged_solr_wordfreq_flatmap);

other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/SolrDocJSON.java

-              r31245
+              r31252
 public class SolrDocJSON {
     protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id)
+    {
         boolean solr_icu_tokenize = true;
+    protected static ArrayList<String> getTokenPosCountWords(JSONObject ef_token_pos_count, String page_id,
+                                                             boolean icu_tokenize)
+    {
         ArrayList<String> words = new ArrayList<String>();
 …
                 String token = token_iter.next();
                 if (solr_icu_tokenize == true) {
+                if (icu_tokenize == true) {
                     Reader reader = new StringReader(token);
 …
     protected static String generateSolrText(JSONObject ef_token_pos_count, String page_id,
                                             WhitelistBloomFilter whitelist_bloomfilter)
+    {
         ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id);
+                                            WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
+    {
+        ArrayList<String> tokens = getTokenPosCountWords(ef_token_pos_count, page_id,icu_tokenize);
         StringBuilder sb = new StringBuilder();
 …
     protected static JSONObject generateSolrDocJSON(String volume_id, String page_id, JSONObject ef_page,
                                                     WhitelistBloomFilter whitelist_bloomfilter)
+                                                    WhitelistBloomFilter whitelist_bloomfilter, boolean icu_tokenize)
+    {
         JSONObject solr_update_json = null;
 …
                     JSONObject solr_add_json = new JSONObject();
                     String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter);
+                    String text = generateSolrText(ef_token_pos_count,page_id,whitelist_bloomfilter,icu_tokenize);
                     JSONObject solr_doc_json = new JSONObject();
 …
+    }
+    protected static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page)
+    protected static ArrayList<String> generateTokenPosCountText(String volume_id, String page_id, JSONObject ef_page,
+                                                                 boolean icu_tokenize)
+    {
         ArrayList<String> word_list = null;
 …
             if (ef_body != null) {
                 JSONObject ef_token_pos_count = ef_body.getJSONObject("tokenPosCount");
                 word_list = getTokenPosCountWords(ef_token_pos_count,page_id);
+                word_list = getTokenPosCountWords(ef_token_pos_count,page_id,icu_tokenize);
+            }
             else {

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: