Timestamp: 2016-12-20T16:44:40+13:00
Author: davidb
Message: Check the output directory earlier, so that the large-scale processing is not run when saving the final output would fail anyway.

File: 1 edited

Legend: unmodified lines carry both the old and new line numbers; added lines carry only the new (right-hand) number; removed lines carry only the old (left-hand) number.
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForWhitelist.java

    r31255 r31256
    60  60         JavaSparkContext jsc = new JavaSparkContext(conf);
    61  61
        62         String filename_root = _json_list_filename.replaceAll(".*/","").replaceAll("\\..*$","");
        63         String output_directory = "whitelist-" + filename_root + "-out";
        64         if (ClusterFileIO.exists(output_directory))
        65         {
        66             System.err.println("Error: " + output_directory + " already exists.  Spark unable to write output data");
        67             jsc.close();
        68             System.exit(1);
        69         }
        70
    62  71         int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
    63  72         JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache();
     
    127 136        count_sorted.setName("descending-word-frequency");
    128 137
    129            String filename_root = _json_list_filename.replaceAll(".*/","").replaceAll("\\..*$","");
    130            String output_directory = "whitelist-" + filename_root + "-out";
        138
    131 139
    132 140        //sorted_swaped_back_pair.saveAsTextFile(output_directory);
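
The added guard calls ClusterFileIO.exists() on the output directory before any RDD work is scheduled. Hadoop's FileOutputFormat, which backs saveAsTextFile(), only reports an already-existing output directory when the save action finally runs, so without the early check the whole whitelist computation would complete before the job fails. Below is a minimal sketch of what such an exists() helper could look like, assuming it wraps Hadoop's FileSystem API; the class name ClusterFileIOSketch and the error handling are illustrative and not taken from the repository.

    // Minimal sketch only: assumes the exists() helper is a thin wrapper
    // around Hadoop's FileSystem API. Class name and error handling are
    // illustrative, not copied from the repository.
    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ClusterFileIOSketch
    {
        // Returns true if 'path' already exists on the configured
        // (HDFS or local) filesystem.
        public static boolean exists(String path)
        {
            try {
                FileSystem fs = FileSystem.get(new Configuration());
                return fs.exists(new Path(path));
            }
            catch (IOException e) {
                throw new RuntimeException("Failed to check existence of: " + path, e);
            }
        }
    }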