source: main/trunk/model-sites-dev/von-sparql/collect/nz-natlib-cat/pre-import/hathiTab_Split/src/split.java@ 28791

Last change on this file since 28791 was 28791, checked in by ak19, 10 years ago

Java program to split up the GB HathiTrust metadata file into smaller chunks that are more easily imported into Greenstone

File size: 2.5 KB
Line 
1import java.io.*;
2import com.google.common.hash.*;
3import com.google.common.io.Files;
4
5public class split {
6
7 public static void main(String[] args) throws Exception
8 {
9 long StartTime = System.currentTimeMillis();
10
11 //Default values for arguments
12 int NRecords = 250;
13 String OutputPath = "./out/hathi_out";
14 String InputPath = "./hathi_tab.txt";
15
16 //Incorrect number of arguments supplied
17 if(args.length!=2)
18 {
19 System.err.println("USAGE: ./RUN_HATHI [records_per_file] [output_path]");
20 return;
21 }
22
23 try{
24 NRecords = Integer.parseInt(args[0]);
25 OutputPath = args[1];
26 }
27 catch(Exception e)
28 {
29 System.err.println("Invalid Arguments");
30 System.err.println("USAGE: ./RUN [records_per_file] [output_path]");
31 return;
32 }
33
34 String headerpath = "hathiTab_Split" + File.separator + "etc" + File.separator + "header.txt";
35 BufferedReader reader;
36
37 try{
38 reader = new BufferedReader(new FileReader(headerpath));
39 }
40 catch(Exception e){
41 System.err.println(headerpath + " does not exist");
42 return;
43 }
44
45 String header = reader.readLine();
46
47 reader.close();
48 try{
49 reader = new BufferedReader(new FileReader(InputPath));
50 }
51 catch(Exception e){
52 System.err.println(InputPath + " does not exist");
53 return;
54 }
55
56 String TempFilename = OutputPath + "/temp.txt";
57 File f;
58 int RecordCount = 0;
59
60 String record = "";
61 while(record!=null)
62 {
63 f = new File(TempFilename);
64 FileWriter fwriter = new FileWriter(f);
65 BufferedWriter writer = new BufferedWriter(fwriter);
66
67 //Write header to top of file
68 writer.write(header);
69 writer.newLine();
70
71 //Write segment of records to file
72 for(int i=0; (i<NRecords && (record=reader.readLine())!=null); i++)
73 {
74 writer.write(record);
75 writer.newLine();
76 RecordCount++;
77 }
78 writer.close();
79 fwriter.close();
80
81 //Print update
82 if(RecordCount%(NRecords * 50)==0)
83 System.err.print("\rProcessed " + RecordCount + " records");
84
85 //Calculate MD5
86 HashCode hc = Files.hash(f, Hashing.md5());
87 String s = hc.toString();
88
89 //Ensure target folder exists, then rename file to hash string
90 File target = new File(OutputPath+"/"+s.substring(0, 2)+"/"+s.substring(2)+".tab");
91 File parent = target.getParentFile();
92 if(!parent.exists() && !parent.mkdirs()){
93 throw new IllegalStateException("Couldn't create dir " + parent);
94 }
95 f.renameTo(target);
96 }
97 reader.close();
98 System.err.print("\rProcessed " + RecordCount + " records");
99 System.err.println("\nTime taken: " + (System.currentTimeMillis()-StartTime) + "ms");
100 }
101}
Note: See TracBrowser for help on using the repository browser.