1 | package org.archive.extract;
|
---|
2 |
|
---|
3 | import java.io.ByteArrayInputStream;
|
---|
4 | import java.io.ByteArrayOutputStream;
|
---|
5 | import java.io.File;
|
---|
6 | import java.io.IOException;
|
---|
7 | import java.io.OutputStream;
|
---|
8 | import java.io.OutputStreamWriter;
|
---|
9 | import java.nio.charset.Charset;
|
---|
10 | import java.text.ParseException;
|
---|
11 | import java.net.UnknownHostException;
|
---|
12 | import java.util.Date;
|
---|
13 |
|
---|
14 | import org.archive.format.gzip.GZIPMemberWriter;
|
---|
15 | import org.archive.format.gzip.GZIPMemberWriterCommittedOutputStream;
|
---|
16 | import org.archive.format.http.HttpHeaders;
|
---|
17 | import org.archive.format.json.JSONUtils;
|
---|
18 | import org.archive.format.warc.WARCRecordWriter;
|
---|
19 | import org.archive.resource.MetaData;
|
---|
20 | import org.archive.resource.Resource;
|
---|
21 | import org.archive.util.IAUtils;
|
---|
22 | import org.archive.util.DateUtils;
|
---|
23 | import org.archive.util.StreamCopy;
|
---|
24 | import org.archive.util.io.CommitedOutputStream;
|
---|
25 | import com.github.openjson.JSONException;
|
---|
26 |
|
---|
27 | import java.net.InetAddress;
|
---|
28 | import java.text.DateFormat;
|
---|
29 | import java.text.SimpleDateFormat;
|
---|
30 |
|
---|
31 | import java.util.logging.Logger;
|
---|
32 |
|
---|
33 | public class WATExtractorOutput implements ExtractorOutput {
|
---|
34 | WARCRecordWriter recW;
|
---|
35 | private boolean wroteFirst;
|
---|
36 | private GZIPMemberWriter gzW;
|
---|
37 | private static int DEFAULT_BUFFER_RAM = 1024 * 1024;
|
---|
38 | private int bufferRAM = DEFAULT_BUFFER_RAM;
|
---|
39 | private final static Charset UTF8 = Charset.forName("UTF-8");
|
---|
40 | private String outputFile;
|
---|
41 |
|
---|
42 | private static final Logger LOG = Logger.getLogger(WATExtractorOutput.class.getName());
|
---|
43 |
|
---|
44 | public WATExtractorOutput(OutputStream out) {
|
---|
45 | this(out, null);
|
---|
46 | }
|
---|
47 |
|
---|
48 | public WATExtractorOutput(OutputStream out, String outputFile) {
|
---|
49 | gzW = new GZIPMemberWriter(out);
|
---|
50 | recW = new WARCRecordWriter();
|
---|
51 | wroteFirst = false;
|
---|
52 | this.outputFile = outputFile;
|
---|
53 | }
|
---|
54 |
|
---|
55 | private CommitedOutputStream getOutput() {
|
---|
56 | return new GZIPMemberWriterCommittedOutputStream(gzW,bufferRAM);
|
---|
57 | }
|
---|
58 |
|
---|
59 | public void output(Resource resource) throws IOException {
|
---|
60 | StreamCopy.readToEOF(resource.getInputStream());
|
---|
61 | MetaData top = resource.getMetaData().getTopMetaData();
|
---|
62 | CommitedOutputStream cos;
|
---|
63 | if(!wroteFirst) {
|
---|
64 | cos = getOutput();
|
---|
65 | writeWARCInfo(cos,top);
|
---|
66 | cos.commit();
|
---|
67 | wroteFirst = true;
|
---|
68 | }
|
---|
69 | String envelopeFormat = JSONUtils.extractSingle(top, "Envelope.Format");
|
---|
70 | if(envelopeFormat == null) {
|
---|
71 | // hrm...
|
---|
72 | throw new IOException("Missing Envelope.Format");
|
---|
73 | }
|
---|
74 |
|
---|
75 | // remove the text extracts if it exists
|
---|
76 | JSONUtils.removeObject(top, "Envelope.Payload-Metadata.HTTP-Response-Metadata.HTML-Metadata", "Text");
|
---|
77 |
|
---|
78 | cos = getOutput();
|
---|
79 | if(envelopeFormat.startsWith("ARC")) {
|
---|
80 | writeARC(cos,top);
|
---|
81 | } else if(envelopeFormat.startsWith("WARC")) {
|
---|
82 | writeWARC(cos,top);
|
---|
83 | } else {
|
---|
84 | // hrm...
|
---|
85 | throw new IOException("Unknown Envelope.Format");
|
---|
86 | }
|
---|
87 | cos.commit();
|
---|
88 | }
|
---|
89 |
|
---|
90 | private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException {
|
---|
91 | // filename is given in the command line
|
---|
92 | String filename = outputFile;
|
---|
93 | if (filename == null || filename.length() == 0) {
|
---|
94 | // if no filename by command line, we construct a default filename base on container filename
|
---|
95 | filename = JSONUtils.extractSingle(md, "Container.Filename");
|
---|
96 | if (filename == null) {
|
---|
97 | throw new IOException("No Container.Filename...");
|
---|
98 | }
|
---|
99 | if (filename.endsWith(".warc") || filename.endsWith(".warc.gz")) {
|
---|
100 | filename = filename.replaceFirst("\\.warc$", ".warc.wat.gz");
|
---|
101 | filename = filename.replaceFirst("\\.warc\\.gz$", ".warc.wat.gz");
|
---|
102 | } else if (filename.endsWith(".arc") || filename.endsWith(".arc.gz")) {
|
---|
103 | filename = filename.replaceFirst("\\.arc$", ".arc.wat.gz");
|
---|
104 | filename = filename.replaceFirst("\\.arc\\.gz$", ".arc.wat.gz");
|
---|
105 | }
|
---|
106 | }
|
---|
107 | // removing path from filename
|
---|
108 | File tmpFile = new File(filename);
|
---|
109 | filename = tmpFile.getName();
|
---|
110 | HttpHeaders headers = new HttpHeaders();
|
---|
111 | headers.add("Software-Info", IAUtils.COMMONS_VERSION);
|
---|
112 | headers.addDateHeader("Extracted-Date", new Date());
|
---|
113 |
|
---|
114 | // add ip, hostname
|
---|
115 | try {
|
---|
116 | InetAddress host = InetAddress.getLocalHost();
|
---|
117 | headers.add("ip", host.getHostAddress());
|
---|
118 | headers.add("hostname", host.getCanonicalHostName());
|
---|
119 | } catch (UnknownHostException e) {
|
---|
120 | LOG.warning("unable to obtain local crawl engine host :\n"+e.getMessage());
|
---|
121 | }
|
---|
122 |
|
---|
123 | headers.add("format", IAUtils.WARC_FORMAT);
|
---|
124 | headers.add("conformsTo", IAUtils.WARC_FORMAT_CONFORMS_TO);
|
---|
125 | // optional arguments
|
---|
126 | if(IAUtils.OPERATOR != null && IAUtils.OPERATOR.length() > 0) {
|
---|
127 | headers.add("operator", IAUtils.OPERATOR);
|
---|
128 | }
|
---|
129 | if(IAUtils.PUBLISHER != null && IAUtils.PUBLISHER.length() > 0) {
|
---|
130 | headers.add("publisher", IAUtils.PUBLISHER);
|
---|
131 | }
|
---|
132 | if(IAUtils.WAT_WARCINFO_DESCRIPTION != null && IAUtils.WAT_WARCINFO_DESCRIPTION.length() > 0) {
|
---|
133 | headers.add("description", IAUtils.WAT_WARCINFO_DESCRIPTION);
|
---|
134 | }
|
---|
135 |
|
---|
136 | ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
---|
137 | headers.write(baos);
|
---|
138 | recW.writeWARCInfoRecord(recOut,filename,baos.toByteArray());
|
---|
139 | }
|
---|
140 |
|
---|
141 | private String extractOrIO(MetaData md, String path) throws IOException {
|
---|
142 | String value = JSONUtils.extractSingle(md, path);
|
---|
143 | if(value == null) {
|
---|
144 | throw new IOException("No "+path+" found.");
|
---|
145 | }
|
---|
146 | return value;
|
---|
147 | }
|
---|
148 |
|
---|
149 | private void writeARC(OutputStream recOut, MetaData md) throws IOException {
|
---|
150 | String targetURI = extractOrIO(md, "Envelope.ARC-Header-Metadata.Target-URI");
|
---|
151 | String capDateString = extractOrIO(md, "Envelope.ARC-Header-Metadata.Date");
|
---|
152 | String filename = extractOrIO(md, "Container.Filename");
|
---|
153 | String offset = extractOrIO(md, "Container.Offset");
|
---|
154 | String recId = String.format("<urn:arc:%s:%s>",filename,offset);
|
---|
155 | writeWARCMDRecord(recOut,md,targetURI,capDateString,recId);
|
---|
156 | }
|
---|
157 |
|
---|
158 | private void writeWARC(OutputStream recOut, MetaData md) throws IOException {
|
---|
159 | String warcType = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Type");
|
---|
160 | String targetURI;
|
---|
161 | if(warcType.equals("warcinfo")) {
|
---|
162 | targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Warcinfo-ID");
|
---|
163 | } else {
|
---|
164 | targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI");
|
---|
165 | }
|
---|
166 | // handle date of generation in WARC format
|
---|
167 | DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss");
|
---|
168 | String capDateString = dateFormat.format(new Date());
|
---|
169 | String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID");
|
---|
170 | writeWARCMDRecord(recOut,md,targetURI,capDateString,recId);
|
---|
171 | }
|
---|
172 |
|
---|
173 | private void writeWARCMDRecord(OutputStream recOut, MetaData md,
|
---|
174 | String targetURI, String capDateString, String recId)
|
---|
175 | throws IOException {
|
---|
176 |
|
---|
177 | ByteArrayOutputStream bos = new ByteArrayOutputStream();
|
---|
178 |
|
---|
179 | OutputStreamWriter osw = new OutputStreamWriter(bos, UTF8);
|
---|
180 | String contents = md.toString();
|
---|
181 | osw.write(contents, 0, contents.length());
|
---|
182 | osw.flush();
|
---|
183 | // ByteArrayInputStream bais = new ByteArrayInputStream(md.toString().getBytes("UTF-8"));
|
---|
184 | Date capDate;
|
---|
185 | try {
|
---|
186 | capDate = DateUtils.getSecondsSinceEpoch(capDateString);
|
---|
187 |
|
---|
188 | } catch (ParseException e) {
|
---|
189 | e.printStackTrace();
|
---|
190 | // TODO... not the write thing...
|
---|
191 | capDate = new Date();
|
---|
192 | }
|
---|
193 |
|
---|
194 | recW.writeJSONMetadataRecord(recOut, bos.toByteArray(),
|
---|
195 | targetURI, capDate, recId);
|
---|
196 | }
|
---|
197 |
|
---|
198 | private static String transformWARCDate(final String input) {
|
---|
199 |
|
---|
200 | StringBuilder output = new StringBuilder(14);
|
---|
201 |
|
---|
202 | output.append(input.substring(0,4));
|
---|
203 | output.append(input.substring(5,7));
|
---|
204 | output.append(input.substring(8,10));
|
---|
205 | output.append(input.substring(11,13));
|
---|
206 | output.append(input.substring(14,16));
|
---|
207 | output.append(input.substring(17,19));
|
---|
208 |
|
---|
209 | return output.toString();
|
---|
210 | }
|
---|
211 | }
|
---|