1 | import java.io.*;
|
---|
2 | import com.google.common.hash.*;
|
---|
3 | import com.google.common.io.Files;
|
---|
4 |
|
---|
5 | public class split {
|
---|
6 |
|
---|
7 | public static void main(String[] args) throws Exception
|
---|
8 | {
|
---|
9 | long StartTime = System.currentTimeMillis();
|
---|
10 |
|
---|
11 | //Default values for arguments
|
---|
12 | int NRecords = 250;
|
---|
13 | String OutputPath = "./out/hathi_out";
|
---|
14 | String InputPath = "./hathi_tab.txt";
|
---|
15 |
|
---|
16 | //Incorrect number of arguments supplied
|
---|
17 | if(args.length!=2)
|
---|
18 | {
|
---|
19 | System.err.println("USAGE: ./RUN_HATHI [records_per_file] [output_path]");
|
---|
20 | return;
|
---|
21 | }
|
---|
22 |
|
---|
23 | try{
|
---|
24 | NRecords = Integer.parseInt(args[0]);
|
---|
25 | OutputPath = args[1];
|
---|
26 | }
|
---|
27 | catch(Exception e)
|
---|
28 | {
|
---|
29 | System.err.println("Invalid Arguments");
|
---|
30 | System.err.println("USAGE: ./RUN [records_per_file] [output_path]");
|
---|
31 | return;
|
---|
32 | }
|
---|
33 |
|
---|
34 | String headerpath = "hathiTab_Split" + File.separator + "etc" + File.separator + "header.txt";
|
---|
35 | BufferedReader reader;
|
---|
36 |
|
---|
37 | try{
|
---|
38 | reader = new BufferedReader(new FileReader(headerpath));
|
---|
39 | }
|
---|
40 | catch(Exception e){
|
---|
41 | System.err.println(headerpath + " does not exist");
|
---|
42 | return;
|
---|
43 | }
|
---|
44 |
|
---|
45 | String header = reader.readLine();
|
---|
46 |
|
---|
47 | reader.close();
|
---|
48 | try{
|
---|
49 | reader = new BufferedReader(new FileReader(InputPath));
|
---|
50 | }
|
---|
51 | catch(Exception e){
|
---|
52 | System.err.println(InputPath + " does not exist");
|
---|
53 | return;
|
---|
54 | }
|
---|
55 |
|
---|
56 | String TempFilename = OutputPath + "/temp.txt";
|
---|
57 | File f;
|
---|
58 | int RecordCount = 0;
|
---|
59 |
|
---|
60 | String record = "";
|
---|
61 | while(record!=null)
|
---|
62 | {
|
---|
63 | f = new File(TempFilename);
|
---|
64 | FileWriter fwriter = new FileWriter(f);
|
---|
65 | BufferedWriter writer = new BufferedWriter(fwriter);
|
---|
66 |
|
---|
67 | //Write header to top of file
|
---|
68 | writer.write(header);
|
---|
69 | writer.newLine();
|
---|
70 |
|
---|
71 | //Write segment of records to file
|
---|
72 | for(int i=0; (i<NRecords && (record=reader.readLine())!=null); i++)
|
---|
73 | {
|
---|
74 | writer.write(record);
|
---|
75 | writer.newLine();
|
---|
76 | RecordCount++;
|
---|
77 | }
|
---|
78 | writer.close();
|
---|
79 | fwriter.close();
|
---|
80 |
|
---|
81 | //Print update
|
---|
82 | if(RecordCount%(NRecords * 50)==0)
|
---|
83 | System.err.print("\rProcessed " + RecordCount + " records");
|
---|
84 |
|
---|
85 | //Calculate MD5
|
---|
86 | HashCode hc = Files.hash(f, Hashing.md5());
|
---|
87 | String s = hc.toString();
|
---|
88 |
|
---|
89 | //Ensure target folder exists, then rename file to hash string
|
---|
90 | File target = new File(OutputPath+"/"+s.substring(0, 2)+"/"+s.substring(2)+".tab");
|
---|
91 | File parent = target.getParentFile();
|
---|
92 | if(!parent.exists() && !parent.mkdirs()){
|
---|
93 | throw new IllegalStateException("Couldn't create dir " + parent);
|
---|
94 | }
|
---|
95 | f.renameTo(target);
|
---|
96 | }
|
---|
97 | reader.close();
|
---|
98 | System.err.print("\rProcessed " + RecordCount + " records");
|
---|
99 | System.err.println("\nTime taken: " + (System.currentTimeMillis()-StartTime) + "ms");
|
---|
100 | }
|
---|
101 | }
|
---|