1 | import java.io.*;
|
---|
2 | import com.google.common.hash.*;
|
---|
3 | import com.google.common.io.Files;
|
---|
4 | import org.marc4j.*;
|
---|
5 | import org.marc4j.marc.Record;
|
---|
6 |
|
---|
7 | public class split {
|
---|
8 |
|
---|
9 | public static void main(String[] args) throws Exception
|
---|
10 | {
|
---|
11 | long StartTime = System.currentTimeMillis();
|
---|
12 |
|
---|
13 | //Default values for arguments
|
---|
14 | int NRecords = 250;
|
---|
15 | String OutputPath = "./out/nz_out";
|
---|
16 | String InputPath = "./NZDataFull.xml";
|
---|
17 | //String InputPath = "./NZ-small-macron-test.xml";
|
---|
18 |
|
---|
19 | //Incorrect number of arguments supplied
|
---|
20 | if(args.length!=2)
|
---|
21 | {
|
---|
22 | System.err.println("USAGE: ./RUN [records_per_file] [output_path]");
|
---|
23 | return;
|
---|
24 | }
|
---|
25 |
|
---|
26 | try{
|
---|
27 | NRecords = Integer.parseInt(args[0]);
|
---|
28 | OutputPath = args[1];
|
---|
29 | }
|
---|
30 | catch(Exception e)
|
---|
31 | {
|
---|
32 | System.err.println("Invalid Arguments");
|
---|
33 | System.err.println("USAGE: ./RUN [records_per_file] [output_path]");
|
---|
34 | return;
|
---|
35 | }
|
---|
36 |
|
---|
37 | InputStream in;
|
---|
38 | try{
|
---|
39 | in = new FileInputStream(InputPath);
|
---|
40 | }
|
---|
41 | catch(Exception e){
|
---|
42 | System.err.println("./NZDataFull.xml does not exist");
|
---|
43 | return;
|
---|
44 | }
|
---|
45 |
|
---|
46 | MarcXmlReader reader = new MarcXmlReader(in);
|
---|
47 | String TempFilename = OutputPath + "/temp.xml";
|
---|
48 | File f;
|
---|
49 | int RecordCount = 0;
|
---|
50 | int ParseErrorCount = 0;
|
---|
51 |
|
---|
52 | while(reader.hasNext())
|
---|
53 | {
|
---|
54 | f = new File(TempFilename);
|
---|
55 | MarcWriter writer = new MarcXmlWriter(new FileOutputStream(f),"UTF8",true);
|
---|
56 | Record record;
|
---|
57 |
|
---|
58 | //Write segment of records to file
|
---|
59 | for(int i=0; (i<NRecords && reader.hasNext()); i++)
|
---|
60 | {
|
---|
61 | try {
|
---|
62 | record = reader.next();
|
---|
63 | }
|
---|
64 | catch (Exception e) {
|
---|
65 | System.err.println("Warning: Failed to parse record number: " + RecordCount);
|
---|
66 | ParseErrorCount++;
|
---|
67 | continue;
|
---|
68 | }
|
---|
69 | writer.write(record);
|
---|
70 | RecordCount++;
|
---|
71 | }
|
---|
72 | writer.close();
|
---|
73 |
|
---|
74 | //Print update
|
---|
75 | if(RecordCount%(NRecords * 50)==0)
|
---|
76 | System.err.print("\rProcessed " + RecordCount + " records");
|
---|
77 |
|
---|
78 | //Calculate MD5
|
---|
79 | HashCode hc = Files.hash(f, Hashing.md5());
|
---|
80 | String s = hc.toString();
|
---|
81 |
|
---|
82 | //Ensure target folder exists, then rename file to hash string
|
---|
83 | File target = new File(OutputPath+"/"+s.substring(0, 2)+"/"+s.substring(2)+".xml");
|
---|
84 | File parent = target.getParentFile();
|
---|
85 | if(!parent.exists() && !parent.mkdirs()){
|
---|
86 | throw new IllegalStateException("Couldn't create dir " + parent);
|
---|
87 | }
|
---|
88 | f.renameTo(target);
|
---|
89 | }
|
---|
90 | System.err.print("\rProcessed " + RecordCount + " records");
|
---|
91 | System.err.println("\nTime taken: " + (System.currentTimeMillis()-StartTime) + "ms");
|
---|
92 | }
|
---|
93 | }
|
---|