source: other-projects/maori-lang-detection/hdfs-cc-work/patches/WATExtractorOutput.java@ 33814

Last change on this file since 33814 was 33541, checked in by ak19, 5 years ago
  1. hdfs-cc-work/GS_README.txt now contains the complete instructions to use Autistici crawl to download a website (as WARC file) as well as now also the instructions to convert those WARCs to WET. 2. Moved the first part out of MoreReading/crawling-Nutch.txt. 3. Adding patched WARC-to-WET files for the gitprojects ia-web-commons and ia-hadoop-tools to successfully do the WARC-to-WET processing on WARC files generated by Autistici crawl. (Worked on Dr Bainbridge's home page site as a test. Not tried any other site yet, as I wanted to get the workflow from crawl to WET working.)
File size: 7.3 KB
Line 
1package org.archive.extract;
2
3import java.io.ByteArrayInputStream;
4import java.io.ByteArrayOutputStream;
5import java.io.File;
6import java.io.IOException;
7import java.io.OutputStream;
8import java.io.OutputStreamWriter;
9import java.nio.charset.Charset;
10import java.text.ParseException;
11import java.net.UnknownHostException;
12import java.util.Date;
13
14import org.archive.format.gzip.GZIPMemberWriter;
15import org.archive.format.gzip.GZIPMemberWriterCommittedOutputStream;
16import org.archive.format.http.HttpHeaders;
17import org.archive.format.json.JSONUtils;
18import org.archive.format.warc.WARCRecordWriter;
19import org.archive.resource.MetaData;
20import org.archive.resource.Resource;
21import org.archive.util.IAUtils;
22import org.archive.util.DateUtils;
23import org.archive.util.StreamCopy;
24import org.archive.util.io.CommitedOutputStream;
25import com.github.openjson.JSONException;
26
27import java.net.InetAddress;
28import java.text.DateFormat;
29import java.text.SimpleDateFormat;
30
31import java.util.logging.Logger;
32
33public class WATExtractorOutput implements ExtractorOutput {
34 WARCRecordWriter recW;
35 private boolean wroteFirst;
36 private GZIPMemberWriter gzW;
37 private static int DEFAULT_BUFFER_RAM = 1024 * 1024;
38 private int bufferRAM = DEFAULT_BUFFER_RAM;
39 private final static Charset UTF8 = Charset.forName("UTF-8");
40 private String outputFile;
41
42 private static final Logger LOG = Logger.getLogger(WATExtractorOutput.class.getName());
43
44 public WATExtractorOutput(OutputStream out) {
45 this(out, null);
46 }
47
48 public WATExtractorOutput(OutputStream out, String outputFile) {
49 gzW = new GZIPMemberWriter(out);
50 recW = new WARCRecordWriter();
51 wroteFirst = false;
52 this.outputFile = outputFile;
53 }
54
55 private CommitedOutputStream getOutput() {
56 return new GZIPMemberWriterCommittedOutputStream(gzW,bufferRAM);
57 }
58
59 public void output(Resource resource) throws IOException {
60 StreamCopy.readToEOF(resource.getInputStream());
61 MetaData top = resource.getMetaData().getTopMetaData();
62 CommitedOutputStream cos;
63 if(!wroteFirst) {
64 cos = getOutput();
65 writeWARCInfo(cos,top);
66 cos.commit();
67 wroteFirst = true;
68 }
69 String envelopeFormat = JSONUtils.extractSingle(top, "Envelope.Format");
70 if(envelopeFormat == null) {
71 // hrm...
72 throw new IOException("Missing Envelope.Format");
73 }
74
75 // remove the text extracts if it exists
76 JSONUtils.removeObject(top, "Envelope.Payload-Metadata.HTTP-Response-Metadata.HTML-Metadata", "Text");
77
78 cos = getOutput();
79 if(envelopeFormat.startsWith("ARC")) {
80 writeARC(cos,top);
81 } else if(envelopeFormat.startsWith("WARC")) {
82 writeWARC(cos,top);
83 } else {
84 // hrm...
85 throw new IOException("Unknown Envelope.Format");
86 }
87 cos.commit();
88 }
89
90 private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException {
91 // filename is given in the command line
92 String filename = outputFile;
93 if (filename == null || filename.length() == 0) {
94 // if no filename by command line, we construct a default filename base on container filename
95 filename = JSONUtils.extractSingle(md, "Container.Filename");
96 if (filename == null) {
97 throw new IOException("No Container.Filename...");
98 }
99 if (filename.endsWith(".warc") || filename.endsWith(".warc.gz")) {
100 filename = filename.replaceFirst("\\.warc$", ".warc.wat.gz");
101 filename = filename.replaceFirst("\\.warc\\.gz$", ".warc.wat.gz");
102 } else if (filename.endsWith(".arc") || filename.endsWith(".arc.gz")) {
103 filename = filename.replaceFirst("\\.arc$", ".arc.wat.gz");
104 filename = filename.replaceFirst("\\.arc\\.gz$", ".arc.wat.gz");
105 }
106 }
107 // removing path from filename
108 File tmpFile = new File(filename);
109 filename = tmpFile.getName();
110 HttpHeaders headers = new HttpHeaders();
111 headers.add("Software-Info", IAUtils.COMMONS_VERSION);
112 headers.addDateHeader("Extracted-Date", new Date());
113
114 // add ip, hostname
115 try {
116 InetAddress host = InetAddress.getLocalHost();
117 headers.add("ip", host.getHostAddress());
118 headers.add("hostname", host.getCanonicalHostName());
119 } catch (UnknownHostException e) {
120 LOG.warning("unable to obtain local crawl engine host :\n"+e.getMessage());
121 }
122
123 headers.add("format", IAUtils.WARC_FORMAT);
124 headers.add("conformsTo", IAUtils.WARC_FORMAT_CONFORMS_TO);
125 // optional arguments
126 if(IAUtils.OPERATOR != null && IAUtils.OPERATOR.length() > 0) {
127 headers.add("operator", IAUtils.OPERATOR);
128 }
129 if(IAUtils.PUBLISHER != null && IAUtils.PUBLISHER.length() > 0) {
130 headers.add("publisher", IAUtils.PUBLISHER);
131 }
132 if(IAUtils.WAT_WARCINFO_DESCRIPTION != null && IAUtils.WAT_WARCINFO_DESCRIPTION.length() > 0) {
133 headers.add("description", IAUtils.WAT_WARCINFO_DESCRIPTION);
134 }
135
136 ByteArrayOutputStream baos = new ByteArrayOutputStream();
137 headers.write(baos);
138 recW.writeWARCInfoRecord(recOut,filename,baos.toByteArray());
139 }
140
141 private String extractOrIO(MetaData md, String path) throws IOException {
142 String value = JSONUtils.extractSingle(md, path);
143 if(value == null) {
144 throw new IOException("No "+path+" found.");
145 }
146 return value;
147 }
148
149 private void writeARC(OutputStream recOut, MetaData md) throws IOException {
150 String targetURI = extractOrIO(md, "Envelope.ARC-Header-Metadata.Target-URI");
151 String capDateString = extractOrIO(md, "Envelope.ARC-Header-Metadata.Date");
152 String filename = extractOrIO(md, "Container.Filename");
153 String offset = extractOrIO(md, "Container.Offset");
154 String recId = String.format("<urn:arc:%s:%s>",filename,offset);
155 writeWARCMDRecord(recOut,md,targetURI,capDateString,recId);
156 }
157
158 private void writeWARC(OutputStream recOut, MetaData md) throws IOException {
159 String warcType = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Type");
160 String targetURI;
161 if(warcType.equals("warcinfo")) {
162 targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Warcinfo-ID");
163 } else {
164 targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI");
165 }
166 // handle date of generation in WARC format
167 DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss");
168 String capDateString = dateFormat.format(new Date());
169 String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID");
170 writeWARCMDRecord(recOut,md,targetURI,capDateString,recId);
171 }
172
173 private void writeWARCMDRecord(OutputStream recOut, MetaData md,
174 String targetURI, String capDateString, String recId)
175 throws IOException {
176
177 ByteArrayOutputStream bos = new ByteArrayOutputStream();
178
179 OutputStreamWriter osw = new OutputStreamWriter(bos, UTF8);
180 String contents = md.toString();
181 osw.write(contents, 0, contents.length());
182 osw.flush();
183// ByteArrayInputStream bais = new ByteArrayInputStream(md.toString().getBytes("UTF-8"));
184 Date capDate;
185 try {
186 capDate = DateUtils.getSecondsSinceEpoch(capDateString);
187
188 } catch (ParseException e) {
189 e.printStackTrace();
190 // TODO... not the write thing...
191 capDate = new Date();
192 }
193
194 recW.writeJSONMetadataRecord(recOut, bos.toByteArray(),
195 targetURI, capDate, recId);
196 }
197
198 private static String transformWARCDate(final String input) {
199
200 StringBuilder output = new StringBuilder(14);
201
202 output.append(input.substring(0,4));
203 output.append(input.substring(5,7));
204 output.append(input.substring(8,10));
205 output.append(input.substring(11,13));
206 output.append(input.substring(14,16));
207 output.append(input.substring(17,19));
208
209 return output.toString();
210 }
211}
Note: See TracBrowser for help on using the repository browser.