Changeset 31222 for other-projects/hathitrust
- Timestamp:
- 2016-12-12T23:22:33+13:00 (7 years ago)
- Location:
- other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ClusterFileIO.java
r31088 r31222 45 45 46 46 47 protected static FileSystem getFileSystemInstance(String input_dir) 47 public static FileSystem getFileSystemInstance(String input_dir) 48 48 { 49 49 FileSystem fs = null; 50 50 51 52 Configuration conf = new Configuration(); 53 URI uri = new URI(input_dir); 54 fs = FileSystem.newInstance(uri,conf); 55 56 57 e.printStackTrace(); 58 59 60 e.printStackTrace(); 61 51 try { 52 Configuration conf = new Configuration(); 53 URI uri = new URI(input_dir); 54 fs = FileSystem.newInstance(uri,conf); 55 } 56 catch (URISyntaxException e) { 57 e.printStackTrace(); 58 } 59 catch (IOException e) { 60 e.printStackTrace(); 61 } 62 62 63 63 return fs; -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/WhitelistBloomFilter.java
r31215 r31222 9 9 import java.io.IOException; 10 10 import java.io.InputStream; 11 import java.io.Serializable; 12 import java.nio.charset.Charset; 11 import java.net.URI; 13 12 import java.nio.charset.StandardCharsets; 14 13 import java.nio.file.Files; … … 16 15 import java.util.stream.Stream; 17 16 18 import javax.annotation.Nullable; 17 import org.apache.hadoop.fs.FSDataInputStream; 18 import org.apache.hadoop.fs.FSDataOutputStream; 19 import org.apache.hadoop.fs.FileSystem; 20 import org.apache.hadoop.fs.Path; 19 21 20 import com.google.common.base.Preconditions; 21 22 import com.google.common.hash.BloomFilter; 22 23 import com.google.common.hash.Funnel; 23 24 import com.google.common.hash.Funnels; 24 import com.google.common.hash.PrimitiveSink; 25 25 26 26 public class WhitelistBloomFilter { … … 30 30 protected static final double FALSE_POSITIVE_PERCENTAGE = 0.01; 31 31 32 33 32 // http://stackoverflow.com/questions/453018/number-of-lines-in-a-file-in-java 34 public static int countLines(String filename) throws IOException 33 protected static int countLines(String filename) throws IOException 35 34 { 36 35 InputStream is = new BufferedInputStream(new FileInputStream(filename)); … … 59 58 System.out.println("Constructing: WhitelistBloomFilter"); 60 59 61 File ser_dictionary_file = new File(dictionary_filename + SERIALIZED_SUFFIX); 62 60 63 if (ser_dictionary_file.exists()) { 61 String ser_dictionary_filename = dictionary_filename + SERIALIZED_SUFFIX; 62 63 64 //File ser_dictionary_file = new File(dictionary_filename + SERIALIZED_SUFFIX); 65 66 //if (ser_dictionary_file.exists()) { 67 if (ClusterFileIO.exists(ser_dictionary_filename)) { 64 68 System.out.println("Loading Serialized Bloom filter ..."); 65 _bloomFilter = serializeIn(ser_dictionary_file); 69 _bloomFilter = serializeIn(ser_dictionary_filename); 66 70 System.out.println("... 
done"); 67 71 } … … 103 107 System.out.println("Serializing Bloom filter ..."); 104 108 105 File ser_dictionary = new File(filename + SERIALIZED_SUFFIX); 106 serializeOut(ser_dictionary); 109 String ser_filename = filename + SERIALIZED_SUFFIX; 110 111 //File ser_dictionary = new File(ser_filename); 112 serializeOut(ser_filename); 107 113 108 114 System.out.println("... done"); … … 116 122 } 117 123 118 protected void serializeOut(File ser_file) 124 //protected void serializeOut(File ser_file) 125 protected void serializeOut(String ser_filename) 119 126 { 127 //FileSystem fs = ClusterFileIO.getFileSystemInstance(ser_filename); 128 120 129 try { 121 FileOutputStream fos = new FileOutputStream(ser_file); 130 //URI ser_uri = URI.create (ser_filename); 131 //Path ser_path = new Path(ser_uri); 132 133 BufferedOutputStream bos = ClusterFileIO.getBufferedOutputStream(ser_filename); 134 135 //FileOutputStream fos = new FileOutputStream(ser_file); 122 136 123 BufferedOutputStream bfos = new BufferedOutputStream(fos); 137 //BufferedOutputStream bfos = new BufferedOutputStream(fos); 124 138 125 _bloomFilter.writeTo(bfos); 139 _bloomFilter.writeTo(bos); 126 140 127 bfos.close(); 141 bos.close(); 128 142 } 129 143 catch (FileNotFoundException e) { 130 System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath()); 144 System.err.println("Unable to open Bloom file:" + ser_filename); 145 //System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath()); 146 131 147 e.printStackTrace(); 132 148 } catch (IOException e) { 133 System.err.println("Error reading in Bloom file:" + ser_file.getAbsolutePath()); 149 System.err.println("Error reading in Bloom file:" + ser_filename); 150 //System.err.println("Error reading in Bloom file:" + ser_file.getAbsolutePath()); 134 151 e.printStackTrace(); 135 152 } 136 153 } 137 154 138 protected static BloomFilter<CharSequence> serializeIn(File ser_file) 155 //protected static BloomFilter<CharSequence> serializeIn(File 
ser_file) 156 protected static BloomFilter<CharSequence> serializeIn(String ser_filename) 139 157 { 140 158 BloomFilter<CharSequence> bloomFilter = null; 141 159 160 //FileSystem fs = ClusterFileIO.getFileSystemInstance(ser_filename); 161 142 162 try { 143 FileInputStream fis = new FileInputStream(ser_file); 144 BufferedInputStream bfis = new BufferedInputStream(fis); 163 //URI ser_uri = URI.create (ser_filename); 164 //Path ser_path = new Path(ser_uri); 165 166 //FSDataInputStream fsdis = fs.open(ser_path); 167 //BufferedInputStream bis = new BufferedInputStream(fsdis); 168 169 BufferedInputStream bis = ClusterFileIO.getBufferedInputStream(ser_filename); 170 171 //FileInputStream fis = new FileInputStream(ser_file); 172 //BufferedInputStream bfis = new BufferedInputStream(fis); 145 173 146 174 Funnel<CharSequence> string_funnel = Funnels.stringFunnel(StandardCharsets.UTF_8); 147 bloomFilter = BloomFilter.readFrom(bfis,string_funnel); 175 bloomFilter = BloomFilter.readFrom(bis,string_funnel); 148 176 149 bfis.close(); 177 bis.close(); 150 178 } 151 179 catch (FileNotFoundException e) { 152 System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath()); 180 //System.err.println("Unable to open Bloom file:" + ser_file.getAbsolutePath()); 181 //System.err.println("Unable to open Bloom file:" + ser_path.getName()); 182 System.err.println("Unable to open Bloom file:" + ser_filename); 183 153 184 e.printStackTrace(); 154 185 } catch (IOException e) { 155 System.err.println("Error writing out Bloom file:" + ser_file.getAbsolutePath()); 186 //System.err.println("Error writing out Bloom file:" + ser_file.getAbsolutePath()); 187 //System.err.println("Error writing out Bloom file:" + ser_path.getName()); 188 System.err.println("Error writing out Bloom file:" + ser_filename); 156 189 e.printStackTrace(); 157 190 }
Note:
See TracChangeset
for help on using the changeset viewer.