Changeset 31222 for other-projects

Show
Ignore:
Timestamp:
12.12.2016 23:22:33 (3 years ago)
Author:
davidb
Message:

Changed to using ClusterFileIO supporting methods

Location:
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ClusterFileIO.java

    r31088 r31222  
    4545 
    4646     
    47     protected static FileSystem getFileSystemInstance(String input_dir) 
     47    public static FileSystem getFileSystemInstance(String input_dir) 
    4848    { 
    49         FileSystem fs = null; 
     49        FileSystem fs = null; 
    5050 
    51         try { 
    52         Configuration conf = new Configuration(); 
    53         URI uri = new URI(input_dir); 
    54         fs = FileSystem.newInstance(uri,conf); 
    55         }  
    56         catch (URISyntaxException e) { 
    57         e.printStackTrace();     
    58         }  
    59         catch (IOException e) { 
    60         e.printStackTrace(); 
    61         } 
     51        try { 
     52            Configuration conf = new Configuration(); 
     53            URI uri = new URI(input_dir); 
     54            fs = FileSystem.newInstance(uri,conf); 
     55        }  
     56        catch (URISyntaxException e) { 
     57            e.printStackTrace();     
     58        }  
     59        catch (IOException e) { 
     60            e.printStackTrace(); 
     61        } 
    6262 
    6363        return fs; 
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/WhitelistBloomFilter.java

    r31215 r31222  
    99import java.io.IOException; 
    1010import java.io.InputStream; 
    11 import java.io.Serializable; 
    12 import java.nio.charset.Charset; 
     11import java.net.URI; 
    1312import java.nio.charset.StandardCharsets; 
    1413import java.nio.file.Files; 
     
    1615import java.util.stream.Stream; 
    1716 
    18 import javax.annotation.Nullable; 
     17import org.apache.hadoop.fs.FSDataInputStream; 
     18import org.apache.hadoop.fs.FSDataOutputStream; 
     19import org.apache.hadoop.fs.FileSystem; 
     20import org.apache.hadoop.fs.Path; 
    1921 
    20 import com.google.common.base.Preconditions; 
    2122import com.google.common.hash.BloomFilter; 
    2223import com.google.common.hash.Funnel; 
    2324import com.google.common.hash.Funnels; 
    24 import com.google.common.hash.PrimitiveSink; 
    2525 
    2626public class WhitelistBloomFilter { 
     
    3030    protected static final double FALSE_POSITIVE_PERCENTAGE = 0.01; 
    3131     
    32      
    3332    // http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java 
    34     public static int countLines(String filename) throws IOException  
     33    protected static int countLines(String filename) throws IOException  
    3534    { 
    3635        InputStream is = new BufferedInputStream(new FileInputStream(filename)); 
     
    5958        System.out.println("Constructing: WhitelistBloomFilter"); 
    6059 
    61         File ser_dictionary_file = new File(dictionary_filename + SERIALIZED_SUFFIX); 
    6260         
    63         if (ser_dictionary_file.exists()) { 
     61        String ser_dictionary_filename = dictionary_filename + SERIALIZED_SUFFIX; 
     62         
     63         
     64        //File ser_dictionary_file = new File(dictionary_filename + SERIALIZED_SUFFIX); 
     65         
     66        //if (ser_dictionary_file.exists()) { 
     67        if (ClusterFileIO.exists(ser_dictionary_filename)) { 
    6468            System.out.println("Loading Serialized Bloom filter ..."); 
    65             _bloomFilter = serializeIn(ser_dictionary_file); 
     69            _bloomFilter = serializeIn(ser_dictionary_filename); 
    6670            System.out.println("... done"); 
    6771        } 
     
    103107            System.out.println("Serializing Bloom filter ..."); 
    104108 
    105             File ser_dictionary = new File(filename + SERIALIZED_SUFFIX); 
    106             serializeOut(ser_dictionary); 
     109            String ser_filename = filename + SERIALIZED_SUFFIX; 
     110             
     111            //File ser_dictionary = new File(ser_filename); 
     112            serializeOut(ser_filename); 
    107113 
    108114            System.out.println("... done"); 
     
    116122    } 
    117123 
    118     protected void serializeOut(File ser_file) 
     124    //protected void serializeOut(File ser_file) 
     125    protected void serializeOut(String ser_filename) 
    119126    { 
     127        //FileSystem fs = ClusterFileIO.getFileSystemInstance(ser_filename); 
     128         
    120129        try { 
    121             FileOutputStream fos = new FileOutputStream(ser_file); 
     130            //URI ser_uri = URI.create (ser_filename); 
     131            //Path ser_path = new Path(ser_uri); 
     132             
     133            BufferedOutputStream bos = ClusterFileIO.getBufferedOutputStream(ser_filename); 
     134             
     135            //FileOutputStream fos = new FileOutputStream(ser_file); 
    122136 
    123             BufferedOutputStream bfos = new BufferedOutputStream(fos); 
     137            //BufferedOutputStream bfos = new BufferedOutputStream(fos); 
    124138 
    125             _bloomFilter.writeTo(bfos); 
     139            _bloomFilter.writeTo(bos); 
    126140 
    127             bfos.close(); 
     141            bos.close(); 
    128142        } 
    129143        catch (FileNotFoundException e) { 
    130             System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath()); 
     144            System.err.println("Unable to open Bloom file:" + ser_filename); 
     145            //System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath()); 
     146             
    131147            e.printStackTrace(); 
    132148        } catch (IOException e) { 
    133             System.err.println("Error reading in Bloom file:" + ser_file.getAbsolutePath()); 
     149            System.err.println("Error reading in Bloom file:" + ser_filename); 
     150            //System.err.println("Error reading in Bloom file:" + ser_file.getAbsolutePath()); 
    134151            e.printStackTrace(); 
    135152        } 
    136153    } 
    137154 
    138     protected static BloomFilter<CharSequence> serializeIn(File ser_file) 
     155    //protected static BloomFilter<CharSequence> serializeIn(File ser_file) 
     156    protected static BloomFilter<CharSequence> serializeIn(String ser_filename) 
    139157    { 
    140158        BloomFilter<CharSequence> bloomFilter = null; 
    141159     
     160        //FileSystem fs = ClusterFileIO.getFileSystemInstance(ser_filename); 
     161         
    142162        try { 
    143             FileInputStream fis = new FileInputStream(ser_file); 
    144             BufferedInputStream bfis = new BufferedInputStream(fis); 
     163            //URI ser_uri = URI.create (ser_filename); 
     164            //Path ser_path = new Path(ser_uri); 
     165 
     166            //FSDataInputStream fsdis = fs.open(ser_path); 
     167            //BufferedInputStream bis = new BufferedInputStream(fsdis); 
     168             
     169            BufferedInputStream bis = ClusterFileIO.getBufferedInputStream(ser_filename); 
     170             
     171            //FileInputStream fis = new FileInputStream(ser_file); 
     172            //BufferedInputStream bfis = new BufferedInputStream(fis); 
    145173 
    146174            Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8); 
    147             bloomFilter = BloomFilter.readFrom(bfis,string_funnel); 
     175            bloomFilter = BloomFilter.readFrom(bis,string_funnel); 
    148176 
    149             bfis.close(); 
     177            bis.close(); 
    150178        } 
    151179        catch (FileNotFoundException e) { 
    152             System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath()); 
     180            //System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath()); 
     181            //System.err.println("Unable to open Bloom file:" + ser_path.getName()); 
     182            System.err.println("Unable to open Bloom file:" + ser_filename); 
     183             
    153184            e.printStackTrace(); 
    154185        } catch (IOException e) { 
    155             System.err.println("Error writing out Bloom file:" + ser_file.getAbsolutePath()); 
     186            //System.err.println("Error writing out Bloom file:" + ser_file.getAbsolutePath()); 
     187            //System.err.println("Error writing out Bloom file:" + ser_path.getName()); 
     188            System.err.println("Error writing out Bloom file:" + ser_filename); 
    156189            e.printStackTrace(); 
    157190        }