Changeset 31256 for other-projects

Show
Ignore:
Timestamp:
20.12.2016 16:44:40 (3 years ago)
Author:
davidb
Message:

Check for the output directory earlier, so that large-scale processing is not carried out when saving the final output would fail anyway

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForWhitelist.java

    r31255 r31256  
    6060        JavaSparkContext jsc = new JavaSparkContext(conf); 
    6161 
     62        String filename_root = _json_list_filename.replaceAll(".*/","").replaceAll("\\..*$",""); 
     63        String output_directory = "whitelist-" + filename_root + "-out"; 
     64        if (ClusterFileIO.exists(output_dir)) 
     65        { 
     66            System.err.println("Error: " + output_directory + " already exists.  Spark unable to write output data"); 
     67            jsc.close(); 
     68            System.exit(1); 
     69        } 
     70         
    6271        int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS); 
    6372        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache(); 
     
    127136        count_sorted.setName("descending-word-frequency"); 
    128137         
    129         String filename_root = _json_list_filename.replaceAll(".*/","").replaceAll("\\..*$",""); 
    130         String output_directory = "whitelist-" + filename_root + "-out"; 
     138         
    131139         
    132140        //sorted_swaped_back_pair.saveAsTextFile(output_directory);