1 | import java.io.*;
|
---|
2 | import com.google.common.hash.*;
|
---|
3 | import com.google.common.io.Files;
|
---|
4 | import org.marc4j.*;
|
---|
5 | import org.marc4j.marc.Record;
|
---|
6 |
|
---|
7 | public class split {
|
---|
8 |
|
---|
9 | public static void main(String[] args) throws Exception
|
---|
10 | {
|
---|
11 | long StartTime = System.currentTimeMillis();
|
---|
12 |
|
---|
13 | //Default values for arguments
|
---|
14 | int NRecords = 250;
|
---|
15 | String OutputPath = "./out/";
|
---|
16 | String InputPath = "./NZDataFull.xml";
|
---|
17 |
|
---|
18 | //Incorrect number of arguments supplied
|
---|
19 | if(args.length!=4)
|
---|
20 | {
|
---|
21 | System.err.println("USAGE: java split [-n records_per_file] [-o output_path]");
|
---|
22 | return;
|
---|
23 | }
|
---|
24 | //read arguments
|
---|
25 | for(int i=0; i < args.length; i+=2)
|
---|
26 | {
|
---|
27 | if(args[i].equals("-n"))
|
---|
28 | NRecords = Integer.parseInt(args[i+1]);
|
---|
29 | else if(args[i].equals("-o"))
|
---|
30 | OutputPath = args[i+1];
|
---|
31 | }
|
---|
32 |
|
---|
33 | InputStream in;
|
---|
34 | try{
|
---|
35 | in = new FileInputStream(InputPath);
|
---|
36 | }
|
---|
37 | catch(Exception e){
|
---|
38 | System.err.println("./NZDataFull.xml does not exist");
|
---|
39 | return;
|
---|
40 | }
|
---|
41 |
|
---|
42 | MarcXmlReader reader = new MarcXmlReader(in);
|
---|
43 | String TempFilename = OutputPath + "/temp.xml";
|
---|
44 | File f;
|
---|
45 | int RecordCount = 0;
|
---|
46 |
|
---|
47 | while(reader.hasNext())
|
---|
48 | {
|
---|
49 | f = new File(TempFilename);
|
---|
50 | MarcWriter writer = new MarcXmlWriter(new FileOutputStream(f),true);
|
---|
51 | Record record;
|
---|
52 |
|
---|
53 | //Write segment of records to file
|
---|
54 | for(int i=0; (i<NRecords && reader.hasNext()); i++)
|
---|
55 | {
|
---|
56 | record = reader.next();
|
---|
57 | writer.write(record);
|
---|
58 | RecordCount++;
|
---|
59 | }
|
---|
60 | writer.close();
|
---|
61 |
|
---|
62 | //Print update
|
---|
63 | if(RecordCount%(NRecords * 50)==0)
|
---|
64 | System.err.print("\rProcessed " + RecordCount + " records");
|
---|
65 |
|
---|
66 | //Calculate MD5
|
---|
67 | HashCode hc = Files.hash(f, Hashing.md5());
|
---|
68 | String s = hc.toString();
|
---|
69 |
|
---|
70 | //Ensure target folder exists, then rename file to hash string
|
---|
71 | File target = new File(OutputPath+"/"+s.substring(0, 2)+"/"+s.substring(2)+".xml");
|
---|
72 | File parent = target.getParentFile();
|
---|
73 | if(!parent.exists() && !parent.mkdirs()){
|
---|
74 | throw new IllegalStateException("Couldn't create dir " + parent);
|
---|
75 | }
|
---|
76 | f.renameTo(target);
|
---|
77 | }
|
---|
78 | System.err.print("\rProcessed " + RecordCount + " records");
|
---|
79 | System.err.println("\nTime taken: " + (System.currentTimeMillis()-StartTime) + "ms");
|
---|
80 | }
|
---|
81 | }
|
---|