source: main/trunk/model-sites-dev/von-sparql/collect/nz-natlib-cat/pre-import/marcXML_Split/src/split.java@ 28794

Last change on this file since 28794 was 28794, checked in by ak19, 10 years ago

Encoding issue now straightened out

File size: 2.4 KB
Line 
1import java.io.*;
2import com.google.common.hash.*;
3import com.google.common.io.Files;
4import org.marc4j.*;
5import org.marc4j.marc.Record;
6
7public class split {
8
9 public static void main(String[] args) throws Exception
10 {
11 long StartTime = System.currentTimeMillis();
12
13 //Default values for arguments
14 int NRecords = 250;
15 String OutputPath = "./out/nz_out";
16 String InputPath = "./NZDataFull.xml";
17 //String InputPath = "./NZ-small-macron-test.xml";
18
19 //Incorrect number of arguments supplied
20 if(args.length!=2)
21 {
22 System.err.println("USAGE: ./RUN [records_per_file] [output_path]");
23 return;
24 }
25
26 try{
27 NRecords = Integer.parseInt(args[0]);
28 OutputPath = args[1];
29 }
30 catch(Exception e)
31 {
32 System.err.println("Invalid Arguments");
33 System.err.println("USAGE: ./RUN [records_per_file] [output_path]");
34 return;
35 }
36
37 InputStream in;
38 try{
39 in = new FileInputStream(InputPath);
40 }
41 catch(Exception e){
42 System.err.println("./NZDataFull.xml does not exist");
43 return;
44 }
45
46 MarcXmlReader reader = new MarcXmlReader(in);
47 String TempFilename = OutputPath + "/temp.xml";
48 File f;
49 int RecordCount = 0;
50 int ParseErrorCount = 0;
51
52 while(reader.hasNext())
53 {
54 f = new File(TempFilename);
55 MarcWriter writer = new MarcXmlWriter(new FileOutputStream(f),"UTF8",true);
56 Record record;
57
58 //Write segment of records to file
59 for(int i=0; (i<NRecords && reader.hasNext()); i++)
60 {
61 try {
62 record = reader.next();
63 }
64 catch (Exception e) {
65 System.err.println("Warning: Failed to parse record number: " + RecordCount);
66 ParseErrorCount++;
67 continue;
68 }
69 writer.write(record);
70 RecordCount++;
71 }
72 writer.close();
73
74 //Print update
75 if(RecordCount%(NRecords * 50)==0)
76 System.err.print("\rProcessed " + RecordCount + " records");
77
78 //Calculate MD5
79 HashCode hc = Files.hash(f, Hashing.md5());
80 String s = hc.toString();
81
82 //Ensure target folder exists, then rename file to hash string
83 File target = new File(OutputPath+"/"+s.substring(0, 2)+"/"+s.substring(2)+".xml");
84 File parent = target.getParentFile();
85 if(!parent.exists() && !parent.mkdirs()){
86 throw new IllegalStateException("Couldn't create dir " + parent);
87 }
88 f.renameTo(target);
89 }
90 System.err.print("\rProcessed " + RecordCount + " records");
91 System.err.println("\nTime taken: " + (System.currentTimeMillis()-StartTime) + "ms");
92 }
93}
Note: See TracBrowser for help on using the repository browser.