Changeset 30951


Ignore:
Timestamp:
2016-10-26T17:54:44+13:00 (5 years ago)
Author:
davidb
Message:

Save a JSONObject as a file in the output directory

Location:
other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PagedJSON.java

    r30949 r30951  
    22
    33import java.io.BufferedReader;
     4import java.io.BufferedWriter;
     5import java.io.IOException;
    46import java.util.ArrayList;
    57import java.util.Iterator;
    68
     9import org.apache.commons.compress.compressors.CompressorException;
    710import org.apache.spark.api.java.function.FlatMapFunction;
    811import org.json.JSONArray;
     
    8588        String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2");
    8689        String page_json_dir = json_dir + "/pages";
    87         //ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
    88         System.out.println("mkdir: " + _output_dir + "/" + page_json_dir);
     90        ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
     91        //System.out.println("mkdir: " + _output_dir + "/" + page_json_dir);
    8992       
    9093        ArrayList<String> ids = new ArrayList<String>(ef_num_pages);
     
    97100            }
    98101           
    99             // create JSON obj of just the page (for now)
    100             // write it out
     102           
    101103           
    102104            String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
    103            
    104105            ids.add(output_json_bz2);
    105106           
     
    107108                System.out.println("Sample output JSON page file: " + output_json_bz2);
    108109            }
     110           
     111            // create JSON obj of just the page (for now), and write it out
     112            JSONObject ef_page = ef_pages.getJSONObject(i);
     113            try {
     114                BufferedWriter bw = ClusterFileIO.getBufferedWriterForCompressedFile(output_json_bz2);
     115                bw.write(ef_page.toString());
     116                bw.close();
     117            } catch (IOException e) {
     118                e.printStackTrace();
     119            } catch (CompressorException e) {
     120                e.printStackTrace();
     121            }
     122                       
     123           
    109124        }
    110125       
  • other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PrepareForIngest.java

    r30949 r30951  
    4545        JavaRDD<String> json_ids = json_list_data.flatMap(paged_json).cache();
    4646
    47         json_ids.saveAsTextFile("rdd-solr-json-page-files");
    48 
    4947        long num_ids = json_ids.count();
    5048        System.out.println("");
     
    5452        System.out.println("");
    5553
     54        System.out.println("############");
     55        String rdd_save_file = "rdd-solr-json-page-files";
     56        json_ids.saveAsTextFile(rdd_save_file);
     57        System.out.println("# Saved RDD of Solr JSON page files, top-level, as:");
     58        System.out.println("#  " + rdd_save_file);
     59        System.out.println("############");
     60        System.out.println("");
     61       
    5662        jsc.close();
    5763    }
Note: See TracChangeset for help on using the changeset viewer.