import java.io.*;
import com.google.common.hash.*;
import com.google.common.io.Files;
import org.marc4j.*;
import org.marc4j.marc.Record;

7 | public class split {
|
---|
8 |
|
---|
9 | public static void main(String[] args) throws Exception
|
---|
10 | {
|
---|
11 | long StartTime = System.currentTimeMillis();
|
---|
12 | //Default values for arguments
|
---|
13 | int NRecords = 250;
|
---|
14 | String OutputPath = "./out/";
|
---|
15 |
|
---|
16 | //Incorrect number of arguments supplied
|
---|
17 | if(args.length!=5)
|
---|
18 | {
|
---|
19 | System.err.println("USAGE: java split [-n records_per_file] [-o output_path] input.xml");
|
---|
20 | System.err.println("DEFAULT: [-n 200] [-o ./out/]");
|
---|
21 | return;
|
---|
22 | }
|
---|
23 | //read arguments
|
---|
24 | for(int i=0; i < args.length; i+=2)
|
---|
25 | {
|
---|
26 | if(args[i].equals("-n"))
|
---|
27 | NRecords = Integer.parseInt(args[i+1]);
|
---|
28 | else if(args[i].equals("-o"))
|
---|
29 | OutputPath = args[i+1];
|
---|
30 | }
|
---|
31 |
|
---|
32 | InputStream in;
|
---|
33 | try{
|
---|
34 | in = new FileInputStream(args[args.length-1]);
|
---|
35 | }
|
---|
36 | catch(Exception e){
|
---|
37 | System.err.println("Input file doesn't exist");
|
---|
38 | return;
|
---|
39 | }
|
---|
40 |
|
---|
41 | MarcXmlReader reader = new MarcXmlReader(in);
|
---|
42 | String TempFilename = OutputPath + "/temp.xml";
|
---|
43 | File f;
|
---|
44 | int RecordCount = 0;
|
---|
45 |
|
---|
46 | while(reader.hasNext())
|
---|
47 | {
|
---|
48 | f = new File(TempFilename);
|
---|
49 | MarcWriter writer = new MarcXmlWriter(new FileOutputStream(f),true);
|
---|
50 |
|
---|
51 | Record record;
|
---|
52 | //Write segment of records to file
|
---|
53 | for(int i=0; (i<NRecords && reader.hasNext()); i++)
|
---|
54 | {
|
---|
55 | record = reader.next();
|
---|
56 | writer.write(record);
|
---|
57 | RecordCount++;
|
---|
58 | }
|
---|
59 | writer.close();
|
---|
60 |
|
---|
61 | //Print update
|
---|
62 | if(RecordCount%50000==0)
|
---|
63 | System.err.println("Processed " + RecordCount + " records");
|
---|
64 |
|
---|
65 | //Calculate MD5
|
---|
66 | HashCode hc = Files.hash(f, Hashing.md5());
|
---|
67 | String s = hc.toString();
|
---|
68 |
|
---|
69 | //Ensure target folder exists, then rename file to hash string
|
---|
70 | File target = new File(OutputPath+"/"+s.substring(0, 2)+"/"+s.substring(2)+".xml");
|
---|
71 | File parent = target.getParentFile();
|
---|
72 | if(!parent.exists() && !parent.mkdirs()){
|
---|
73 | throw new IllegalStateException("Couldn't create dir " + parent);
|
---|
74 | }
|
---|
75 | f.renameTo(target);
|
---|
76 | }
|
---|
77 | System.err.println("Processed " + RecordCount + " records");
|
---|
78 | System.err.println("Time taken: " + (System.currentTimeMillis()-StartTime) + "ms");
|
---|
79 | }
|
---|
80 | }
|
---|