1 | /** jmt12 **/
|
---|
2 | package org.nzdl.gsdl;
|
---|
3 |
|
---|
4 | import org.nzdl.gsdl.GSGroupingComparator;
|
---|
5 | import org.nzdl.gsdl.GSInfoDB;
|
---|
6 | import org.nzdl.gsdl.GSPartitioner;
|
---|
7 |
|
---|
8 | import java.io.*;
|
---|
9 | import java.lang.Iterable;
|
---|
10 | import java.lang.ProcessBuilder;
|
---|
11 | import java.lang.ProcessBuilder.*;
|
---|
12 | import java.lang.Thread;
|
---|
13 | import java.net.InetAddress;
|
---|
14 | import java.nio.channels.FileChannel;
|
---|
15 | import java.util.Collection;
|
---|
16 | import java.util.Iterator;
|
---|
17 | import java.util.List;
|
---|
18 | import java.util.Map;
|
---|
19 | import java.util.StringTokenizer;
|
---|
20 | import java.util.regex.Matcher;
|
---|
21 | import java.util.regex.Pattern;
|
---|
22 |
|
---|
23 | import org.apache.hadoop.fs.Path;
|
---|
24 | import org.apache.hadoop.conf.*;
|
---|
25 | import org.apache.hadoop.io.*;
|
---|
26 | import org.apache.hadoop.mapred.ClusterStatus;
|
---|
27 | import org.apache.hadoop.mapred.JobClient;
|
---|
28 | import org.apache.hadoop.mapred.JobConf;
|
---|
29 | import org.apache.hadoop.mapreduce.*;
|
---|
30 | import org.apache.hadoop.mapreduce.Mapper.Context;
|
---|
31 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
|
---|
32 | import org.apache.hadoop.mapreduce.lib.input.FileSplit;
|
---|
33 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
|
---|
34 | import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
|
---|
35 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
|
---|
36 | import org.apache.hadoop.util.*;
|
---|
37 |
|
---|
/** @class HadoopGreenstoneIngest2
*/
|
---|
40 | public class HadoopGreenstoneIngest2
|
---|
41 | {
|
---|
42 |
|
---|
43 | /** @class GSFileRecordReader
|
---|
44 | */
|
---|
45 | public static class GSFileRecordReader
|
---|
46 | extends RecordReader<Text, Text>
|
---|
47 | {
|
---|
48 | /** Uncompressed file name */
|
---|
49 | private Text current_key;
|
---|
50 |
|
---|
51 | private Text current_value = new Text("");
|
---|
52 |
|
---|
53 | /** Used to indicate progress */
|
---|
54 | private boolean is_finished = false;
|
---|
55 |
|
---|
56 | /**
|
---|
57 | */
|
---|
58 | @Override
|
---|
59 | public void initialize( InputSplit inputSplit, TaskAttemptContext taskAttemptContext )
|
---|
60 | throws IOException, InterruptedException
|
---|
61 | {
|
---|
62 | FileSplit split = (FileSplit) inputSplit;
|
---|
63 | current_key = new Text(split.getPath().toString());
|
---|
64 | }
|
---|
65 | /** initialize() **/
|
---|
66 |
|
---|
67 | /**
|
---|
68 | * We only ever have a single key/value
|
---|
69 | */
|
---|
70 | @Override
|
---|
71 | public boolean nextKeyValue()
|
---|
72 | throws IOException, InterruptedException
|
---|
73 | {
|
---|
74 | if (!is_finished)
|
---|
75 | {
|
---|
76 | is_finished = true;
|
---|
77 | return true;
|
---|
78 | }
|
---|
79 | return false;
|
---|
80 | }
|
---|
81 | /** nextKeyValue() **/
|
---|
82 |
|
---|
83 | /** @function getProgress
|
---|
84 | * Rather than calculating progress, we just keep it simple
|
---|
85 | */
|
---|
86 | @Override
|
---|
87 | public float getProgress()
|
---|
88 | throws IOException, InterruptedException
|
---|
89 | {
|
---|
90 | return is_finished ? 1 : 0;
|
---|
91 | }
|
---|
92 | /** getProgress() **/
|
---|
93 |
|
---|
94 | /**
|
---|
95 | * Returns the current key (name of the zipped file)
|
---|
96 | */
|
---|
97 | @Override
|
---|
98 | public Text getCurrentKey()
|
---|
99 | throws IOException, InterruptedException
|
---|
100 | {
|
---|
101 | return current_key;
|
---|
102 | }
|
---|
103 | /** getCurrentKey() **/
|
---|
104 |
|
---|
105 | /**
|
---|
106 | * Returns the current value (contents of the zipped file)
|
---|
107 | */
|
---|
108 | @Override
|
---|
109 | public Text getCurrentValue()
|
---|
110 | throws IOException, InterruptedException
|
---|
111 | {
|
---|
112 | return current_value;
|
---|
113 | }
|
---|
114 | /** getCurrentValue() **/
|
---|
115 |
|
---|
116 | /**
|
---|
117 | * Close quietly, ignoring any exceptions
|
---|
118 | */
|
---|
119 | @Override
|
---|
120 | public void close()
|
---|
121 | throws IOException
|
---|
122 | {
|
---|
123 | // nothing to do
|
---|
124 | }
|
---|
125 | /** close() **/
|
---|
126 |
|
---|
127 | }
|
---|
128 | /** GSFileRecordReader **/
|
---|
129 |
|
---|
130 | /** @class GSFileInputFormat
|
---|
131 | */
|
---|
132 | public static class GSFileInputFormat
|
---|
133 | extends FileInputFormat<Text, Text>
|
---|
134 | {
|
---|
135 |
|
---|
136 | private String[] getActiveServersList(JobContext context)
|
---|
137 | {
|
---|
138 | String [] servers = null;
|
---|
139 | try
|
---|
140 | {
|
---|
141 | JobClient jc = new JobClient((JobConf)context.getConfiguration());
|
---|
142 | ClusterStatus status = jc.getClusterStatus(true);
|
---|
143 | Collection<String> atc = status.getActiveTrackerNames();
|
---|
144 | servers = new String[atc.size()];
|
---|
145 | int s = 0;
|
---|
146 | for (String serverInfo : atc)
|
---|
147 | {
|
---|
148 | StringTokenizer st = new StringTokenizer(serverInfo, ":");
|
---|
149 | String trackerName = st.nextToken();
|
---|
150 | StringTokenizer st1 = new StringTokenizer(trackerName, "_");
|
---|
151 | st1.nextToken();
|
---|
152 | servers[s++] = st1.nextToken();
|
---|
153 | }
|
---|
154 | }
|
---|
155 | catch (IOException e)
|
---|
156 | {
|
---|
157 | e.printStackTrace();
|
---|
158 | }
|
---|
159 | System.err.print("Servers: ");
|
---|
160 | String sep = "";
|
---|
161 | for (Object obj : servers)
|
---|
162 | {
|
---|
163 | System.err.print(sep + obj.toString());
|
---|
164 | sep = ", ";
|
---|
165 | }
|
---|
166 | System.err.println("");
|
---|
167 | return servers;
|
---|
168 | }
|
---|
169 | /** getActiveServersList() **/
|
---|
170 |
|
---|
171 | /**
|
---|
172 | */
|
---|
173 | public List<InputSplit> getSplits(JobContext job)
|
---|
174 | throws IOException
|
---|
175 | {
|
---|
176 | System.err.println("GSFileInputFormat::getSplits()");
|
---|
177 | // get splits
|
---|
178 | List<InputSplit> original_splits = super.getSplits(job);
|
---|
179 | // Get active servers
|
---|
180 | String[] servers = getActiveServersList(job);
|
---|
181 | if(servers == null)
|
---|
182 | {
|
---|
183 | return null;
|
---|
184 | }
|
---|
185 | // done
|
---|
186 | System.err.println("Splits: ");
|
---|
187 | for (InputSplit obj : original_splits)
|
---|
188 | {
|
---|
189 | System.err.println(obj.toString());
|
---|
190 | }
|
---|
191 | return original_splits;
|
---|
192 | }
|
---|
193 | /** getSplits() **/
|
---|
194 |
|
---|
195 | /**
|
---|
196 | * Don't split the files
|
---|
197 | */
|
---|
198 | @Override
|
---|
199 | protected boolean isSplitable(JobContext context, Path filename)
|
---|
200 | {
|
---|
201 | return false;
|
---|
202 | }
|
---|
203 | /** isSplitable() **/
|
---|
204 |
|
---|
205 | /**
|
---|
206 | */
|
---|
207 | @Override
|
---|
208 | public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext content)
|
---|
209 | throws IOException, InterruptedException
|
---|
210 | {
|
---|
211 | return new GSFileRecordReader();
|
---|
212 | }
|
---|
213 | /** createRecordReader() **/
|
---|
214 |
|
---|
215 | }
|
---|
216 | /** class GSFileInputFormat **/
|
---|
217 |
|
---|
218 | /** @class GSMap
|
---|
219 | */
|
---|
220 | public static class GSMap
|
---|
221 | extends Mapper<Text, Text, Text, Text>
|
---|
222 | {
|
---|
223 | /** @function map
|
---|
224 | * The key is the full path (HDFS) of the file to be processed.
|
---|
225 | */
|
---|
226 | public void map(Text key, Text value, Context context)
|
---|
227 | throws IOException, InterruptedException
|
---|
228 | {
|
---|
229 | String file_path = key.toString();
|
---|
230 | // - configuration for the task
|
---|
231 | Configuration conf = context.getConfiguration();
|
---|
232 | String gsdlhome = conf.get("gsdlhome");
|
---|
233 | String hdfs_prefix = conf.get("hdfsprefix");
|
---|
234 | String hadoop_home = conf.get("hadoophome");
|
---|
235 | String collection = conf.get("collection");
|
---|
236 | String task_id = conf.get("mapred.task.id");
|
---|
237 | task_id = task_id.substring(8); // remove "attempt_" prefix
|
---|
238 |
|
---|
239 | // Programatically rewrite the protocol as appropriate for the given
|
---|
240 | // archives directory (not necessary if path is local or NFS)
|
---|
241 | if (hdfs_prefix.equals("/hdfs"))
|
---|
242 | {
|
---|
243 | file_path = file_path.replaceFirst("hdfs://[^/]*", hdfs_prefix);
|
---|
244 | }
|
---|
245 | else
|
---|
246 | {
|
---|
247 | file_path = file_path.replace("hdfs://", hdfs_prefix);
|
---|
248 | }
|
---|
249 |
|
---|
250 | // - create a temporary directory
|
---|
251 | File greenstone_tmp_dir = new File("/tmp/greenstone");
|
---|
252 | if (!greenstone_tmp_dir.isDirectory())
|
---|
253 | {
|
---|
254 | greenstone_tmp_dir.mkdir();
|
---|
255 | }
|
---|
256 |
|
---|
257 | // - open a unique log file
|
---|
258 | File import_process_log = new File("/tmp/greenstone/import-hadoop-" + task_id + ".log");
|
---|
259 | FileWriter fw1 = new FileWriter(import_process_log, true);
|
---|
260 | // MEDUSA Customization: Introduce a slight delay based upon the hostname
|
---|
261 | // in order to stagger the startup of Map workers. It looks like the avg
|
---|
262 | // IO is around 25 minutes... so lets try to make it so the last mapper
|
---|
263 | // starts up 25 minutes after the first (with all others spread in
|
---|
264 | // between).
|
---|
265 | String hostname = InetAddress.getLocalHost().getHostName();
|
---|
266 | // We only do this if there is a sentinel file lurking in tmp
|
---|
267 | try
|
---|
268 | {
|
---|
269 | File delay_file = new File("/tmp/greenstone/delay.me");
|
---|
270 | if (delay_file.exists())
|
---|
271 | {
|
---|
272 | Pattern p = Pattern.compile("compute-0-([0-9]+).local");
|
---|
273 | Matcher m = p.matcher(hostname);
|
---|
274 | if (m.matches())
|
---|
275 | {
|
---|
276 | String node_str = m.group(1);
|
---|
277 | int node_number = Integer.parseInt(node_str) * 100;
|
---|
278 | fw1.write("[DEBUG] Delaying start for " + node_number + " seconds");
|
---|
279 | Thread.currentThread().sleep(1000 * node_number);
|
---|
280 | }
|
---|
281 | // We only do this once for each compute node
|
---|
282 | delay_file.delete();
|
---|
283 | }
|
---|
284 | }
|
---|
285 | catch (Exception ie)
|
---|
286 | {
|
---|
287 | System.err.println(ie.toString());
|
---|
288 | }
|
---|
289 |
|
---|
290 | // - start the log by writing the time and the manifest line
|
---|
291 | double start_time = ((double)System.currentTimeMillis())/1000;
|
---|
292 | StringBuffer header_block = new StringBuffer("[Started:");
|
---|
293 | header_block.append(String.format("%.6f", start_time));
|
---|
294 | header_block.append("]\n[Host:");
|
---|
295 | header_block.append(hostname);
|
---|
296 | header_block.append("]\n[CPU:");
|
---|
297 | String getcpu_executable_cmd = gsdlhome + "/ext/parallel-building/linux/bin/getcpu";
|
---|
298 | File getcpu_executable = new File(getcpu_executable_cmd);
|
---|
299 | String cpu_number = "0";
|
---|
300 | if (getcpu_executable.exists())
|
---|
301 | {
|
---|
302 | cpu_number = runCommand(getcpu_executable_cmd);
|
---|
303 | }
|
---|
304 | header_block.append(cpu_number);
|
---|
305 | header_block.append("]\n[Task:");
|
---|
306 | header_block.append(task_id);
|
---|
307 | header_block.append("]\n[Map:");
|
---|
308 | header_block.append(file_path);
|
---|
309 | header_block.append("=>");
|
---|
310 | header_block.append(value);
|
---|
311 | header_block.append("]\n");
|
---|
312 | fw1.write(header_block.toString());
|
---|
313 | header_block = null;
|
---|
314 |
|
---|
315 | // - create a temporary manifest file to process this file. Overwrite any
|
---|
316 | // existing file
|
---|
317 | File manifest_path = new File("/tmp/greenstone/manifest" + task_id + ".xml");
|
---|
318 | FileWriter manifest_writer = new FileWriter(manifest_path);
|
---|
319 | manifest_writer.write("<Manifest version=\"2.0\">\n");
|
---|
320 | manifest_writer.write("\t<Index>\n");
|
---|
321 | manifest_writer.write("\t\t<Filename>" + file_path + "</Filename>\n");
|
---|
322 | manifest_writer.write("\t</Index>\n");
|
---|
323 | manifest_writer.write("</Manifest>\n");
|
---|
324 | manifest_writer.close();
|
---|
325 |
|
---|
326 | /* Original process calling - sets up environment in Java
|
---|
327 | // - call Greenstone passing in the path to the manifest
|
---|
328 | ProcessBuilder import_process_builder = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-archivedir", conf.get("archivesdir"), collection);
|
---|
329 | fw1.write("[Command:" + import_process_builder.command() + "]\n");
|
---|
330 | // - alter environment
|
---|
331 | Map<String, String> import_process_env = import_process_builder.environment();
|
---|
332 | // - path
|
---|
333 | String path = import_process_env.get("PATH");
|
---|
334 | path = gsdlhome + "/ext/parallel-building/bin/script:" + path;
|
---|
335 | path = gsdlhome + "/ext/parallel-building/linux/bin:" + path;
|
---|
336 | path = hadoop_home + "/bin:" + path;
|
---|
337 | path = gsdlhome + "/ext/tdb-edit/linux/bin:" + path;
|
---|
338 | path = gsdlhome + "/ext/tdb-edit/bin/script:" + path;
|
---|
339 | path = gsdlhome + "/ext/video-and-audio/linux/bin:" + path;
|
---|
340 | path = gsdlhome + "/bin/script:" + path;
|
---|
341 | path = gsdlhome + "/bin/linux:" + path;
|
---|
342 | import_process_env.put("PATH", path);
|
---|
343 | fw1.write("[PATH: " + path + "]\n");
|
---|
344 | // - ld_library_path
|
---|
345 | import_process_env.put("LD_LIBRARY_PATH", gsdlhome + "/ext/parallel-building/linux/lib:" + gsdlhome + "/ext/hadoop/linux/lib:" + gsdlhome + "/ext/video-and-audio/linux/lib:" + gsdlhome + "/ext/tdb-edit/linux/lib");
|
---|
346 | // - dyld_library_path
|
---|
347 | import_process_env.put("DYLD_LIBRARY_PATH", gsdlhome + "/ext/video-and-audio/linux/lib");
|
---|
348 | // - misc
|
---|
349 | import_process_env.put("GSDLHOME", gsdlhome);
|
---|
350 | import_process_env.put("GSDLOS", "linux");
|
---|
351 | import_process_env.put("GSDLEXTS", "parallel-building:tdb-edit:video-and-audio");
|
---|
352 | // - installed extension paths
|
---|
353 | import_process_env.put("GEXTPARALLELBUILDING", gsdlhome + "/ext/parallel-building");
|
---|
354 | import_process_env.put("GEXTPARALLELBUILDING_INSTALLED", gsdlhome + "/ext/parallel-building/linux");
|
---|
355 | import_process_env.put("GEXTTDBEDIT_INSTALLED", gsdlhome + "/ext/tdb-edit/linux");
|
---|
356 | import_process_env.put("GEXTVIDEO_INSTALLED", gsdlhome + "/ext/video-and-audio/linux");
|
---|
357 | // - Hadoop specific
|
---|
358 | import_process_env.put("HADOOP_PREFIX", hadoop_home);
|
---|
359 | fw1.write("[HADOOP_PREFIX: " + hadoop_home + "]\n");
|
---|
360 | */
|
---|
361 |
|
---|
362 | /* New process call - adds call to setup.bash first to prepare
|
---|
363 | * environment... hopefully */
|
---|
364 | // - call Greenstone passing in the path to the manifest
|
---|
365 | String environment_script_filename = "setup.bash";
|
---|
366 | StringBuffer cmd_buffer = new StringBuffer();
|
---|
367 | cmd_buffer.append("source ./");
|
---|
368 | cmd_buffer.append(environment_script_filename);
|
---|
369 | cmd_buffer.append(" && time -p import.pl -keepold -manifest \"");
|
---|
370 | cmd_buffer.append(manifest_path.toString());
|
---|
371 | cmd_buffer.append("\" -archivedir \"");
|
---|
372 | cmd_buffer.append(conf.get("archivesdir"));
|
---|
373 | cmd_buffer.append("\" ");
|
---|
374 | cmd_buffer.append(collection);
|
---|
375 | ProcessBuilder import_process_builder = new ProcessBuilder("bash", "-c", cmd_buffer.toString());
|
---|
376 | fw1.write("[Command:" + import_process_builder.command() + "]\n");
|
---|
377 |
|
---|
378 | // - change working directory
|
---|
379 | import_process_builder.directory(new File(gsdlhome));
|
---|
380 |
|
---|
381 | // - redirect STDERR to STDOUT for simplicity sake
|
---|
382 | import_process_builder.redirectErrorStream(true);
|
---|
383 | // Obsolete command to send output to file
|
---|
384 | //import_process_builder.redirectOutput(Redirect.appendTo(import_process_log));
|
---|
385 |
|
---|
386 | // - create progress reporter (so Hadoop doesn't time us out)
|
---|
387 | Thread reporter = new HadoopProgressReporter(context, import_process_log);
|
---|
388 | reporter.start();
|
---|
389 |
|
---|
390 | // - run process
|
---|
391 | Process import_process = import_process_builder.start();
|
---|
392 | BufferedReader import_process_br = new BufferedReader(new InputStreamReader(import_process.getInputStream()));
|
---|
393 | String line = "";
|
---|
394 | Pattern open_tag_pattern = Pattern.compile("<InfoDBEntry type=\"(.+?)\" key=\"(.+?)\" mode=\"(.+?)\" timestamp=\"(.+?)\">(.*?(</InfoDBEntry>)?)");
|
---|
395 | Pattern close_tag_pattern = Pattern.compile("(.*?)</InfoDBEntry>");
|
---|
396 | while ((line = import_process_br.readLine()) != null)
|
---|
397 | {
|
---|
398 | // Write line to process log regardless
|
---|
399 | fw1.write(line + "\n");
|
---|
400 | // Now we check for sentinel strings in the output line
|
---|
401 | Text output_key;
|
---|
402 | Text output_value;
|
---|
403 | // Watch for open entry tags
|
---|
404 | Matcher open_tag_matcher = open_tag_pattern.matcher(line);
|
---|
405 | if(open_tag_matcher.matches())
|
---|
406 | {
|
---|
407 | String entry_type = open_tag_matcher.group(1);
|
---|
408 | String entry_key = open_tag_matcher.group(2);
|
---|
409 | String entry_mode = open_tag_matcher.group(3);
|
---|
410 | String entry_time = open_tag_matcher.group(4);
|
---|
411 | String payload = open_tag_matcher.group(5);
|
---|
412 | StringBuffer line_buffer = new StringBuffer();
|
---|
413 | // Continue until we've found the close tag - or run out of output log
|
---|
414 | Matcher close_tag_matcher = close_tag_pattern.matcher(payload);
|
---|
415 | while (!close_tag_matcher.matches() && (line = import_process_br.readLine()) != null)
|
---|
416 | {
|
---|
417 | // append any existing payload to the buffer
|
---|
418 | if (line_buffer.length() > 0)
|
---|
419 | {
|
---|
420 | line_buffer.append("\n");
|
---|
421 | }
|
---|
422 | line_buffer.append(payload);
|
---|
423 | // store this line as the payload, should the match below fail
|
---|
424 | payload = line;
|
---|
425 | close_tag_matcher = close_tag_pattern.matcher(line);
|
---|
426 | }
|
---|
427 | // We've found the close tag (hopefully) so add last bit (possibly
|
---|
428 | // empty string) to value
|
---|
429 | if (close_tag_matcher.matches())
|
---|
430 | {
|
---|
431 | String last_payload = close_tag_matcher.group(1);
|
---|
432 | if (line_buffer.length() > 0)
|
---|
433 | {
|
---|
434 | line_buffer.append("\n");
|
---|
435 | }
|
---|
436 | line_buffer.append(last_payload);
|
---|
437 | last_payload = null;
|
---|
438 | }
|
---|
439 | close_tag_matcher = null;
|
---|
440 |
|
---|
441 | // Construct the compound key by which to sort the entries - note
|
---|
442 | // that src is a little different than the others, as we don't care
|
---|
443 | // about the timestamp at all - instead wanting to group the entries
|
---|
444 | // by src file path (key)
|
---|
445 | if (entry_type.equals("src"))
|
---|
446 | {
|
---|
447 | output_key = new Text(entry_type + " " + entry_key);
|
---|
448 | }
|
---|
449 | else
|
---|
450 | {
|
---|
451 | output_key = new Text(entry_type + " " + entry_time);
|
---|
452 | }
|
---|
453 | // Doc has its payload prefixed by key
|
---|
454 | if (entry_type.equals("doc"))
|
---|
455 | {
|
---|
456 | String encoded_xml = line_buffer.toString();
|
---|
457 | String decoded_xml = encoded_xml.replace(""","\"");
|
---|
458 | decoded_xml = decoded_xml.replace("'","'");
|
---|
459 | decoded_xml = decoded_xml.replace("<","<");
|
---|
460 | decoded_xml = decoded_xml.replace(">",">");
|
---|
461 | decoded_xml = decoded_xml.replace("&","&");
|
---|
462 | output_value = new Text("[" + entry_key + "]\n" + decoded_xml);
|
---|
463 | }
|
---|
464 | else if (entry_type.equals("src"))
|
---|
465 | {
|
---|
466 | String encoded_xml = line_buffer.toString();
|
---|
467 | String decoded_xml = encoded_xml.replace(""","\"");
|
---|
468 | decoded_xml = decoded_xml.replace("'","'");
|
---|
469 | decoded_xml = decoded_xml.replace("<","<");
|
---|
470 | decoded_xml = decoded_xml.replace(">",">");
|
---|
471 | decoded_xml = decoded_xml.replace("&","&");
|
---|
472 | output_value = new Text(entry_key + "|" + decoded_xml);
|
---|
473 | }
|
---|
474 | else if (entry_type.equals("rss"))
|
---|
475 | {
|
---|
476 | String encoded_xml = line_buffer.toString();
|
---|
477 | String decoded_xml = encoded_xml.replace(""","\"");
|
---|
478 | decoded_xml = decoded_xml.replace("'","'");
|
---|
479 | decoded_xml = decoded_xml.replace("<","<");
|
---|
480 | decoded_xml = decoded_xml.replace(">",">");
|
---|
481 | decoded_xml = decoded_xml.replace("&","&");
|
---|
482 | output_value = new Text(decoded_xml);
|
---|
483 | }
|
---|
484 | else
|
---|
485 | {
|
---|
486 | output_value = new Text(line_buffer.toString());
|
---|
487 | }
|
---|
488 | entry_type = null;
|
---|
489 | entry_key = null;
|
---|
490 | entry_mode = null;
|
---|
491 | entry_time = null;
|
---|
492 | payload = null;
|
---|
493 | line_buffer = null;
|
---|
494 | }
|
---|
495 | // all other lines we'll key by host and time
|
---|
496 | else
|
---|
497 | {
|
---|
498 | double timestamp = ((double)System.currentTimeMillis())/1000;
|
---|
499 | String argument1 = hostname + ":" + cpu_number + ":" + String.format("%.6f", timestamp);
|
---|
500 | output_key = new Text("msg " + argument1);
|
---|
501 | output_value = new Text(argument1 + " " + line);
|
---|
502 | }
|
---|
503 |
|
---|
504 | // Store the result in the return context
|
---|
505 | context.write(output_key, output_value);
|
---|
506 |
|
---|
507 | // May as well do this here - indicate to the Hadoop framework that
|
---|
508 | // this process is still making progress (of some form)
|
---|
509 | context.progress();
|
---|
510 |
|
---|
511 | // Cleanup
|
---|
512 | open_tag_matcher = null;
|
---|
513 | output_key = null;
|
---|
514 | output_value = null;
|
---|
515 | }
|
---|
516 | open_tag_pattern = null;
|
---|
517 | close_tag_pattern = null;
|
---|
518 | try
|
---|
519 | {
|
---|
520 | int import_status = import_process.waitFor();
|
---|
521 | if (import_status != 0)
|
---|
522 | {
|
---|
523 | throw new Exception("exit status: " + import_status);
|
---|
524 | }
|
---|
525 | }
|
---|
526 | catch (Exception e)
|
---|
527 | {
|
---|
528 | System.err.println("Error! Import command failed (" + e.toString() + ")");
|
---|
529 | }
|
---|
530 |
|
---|
531 | // - stop the progress reporter as, one way or another, there will be no
|
---|
532 | // more progress
|
---|
533 | reporter.interrupt();
|
---|
534 | reporter = null; // force gc
|
---|
535 |
|
---|
536 | // - write end time to log
|
---|
537 | double end_time = ((double)System.currentTimeMillis())/1000;
|
---|
538 | fw1.write("[Completed:" + String.format("%.6f", end_time) + "]\n");
|
---|
539 | // - close our output to the log
|
---|
540 | fw1.close();
|
---|
541 | }
|
---|
542 | /** map(LongWritable,Text,Context) **/
|
---|
543 |
|
---|
544 | }
|
---|
545 | /** class GSMap **/
|
---|
546 |
|
---|
547 |
|
---|
548 | /** @class GSReducer
|
---|
549 | */
|
---|
550 | public static class GSReducer
|
---|
551 | extends Reducer<Text, Text, Text, Text>
|
---|
552 | {
|
---|
553 |
|
---|
554 |
|
---|
555 | /** Prepare the Reducer by looking up configuration for this collection to
|
---|
556 | * determine the appropriate tools to use to create databases.
|
---|
557 | */
|
---|
558 | public void setup (Context context)
|
---|
559 | {
|
---|
560 | }
|
---|
561 | /** setup() **/
|
---|
562 |
|
---|
563 |
|
---|
564 | /**
|
---|
565 | */
|
---|
566 | public void reduce(Text key, Iterable<Text> values, Context context)
|
---|
567 | throws IOException, InterruptedException
|
---|
568 | {
|
---|
569 | System.err.println("reduce(" + key.toString() + ", <values>, <context>)");
|
---|
570 | Configuration conf = context.getConfiguration();
|
---|
571 | String gsdl_home = conf.get("gsdlhome");
|
---|
572 | String collection = conf.get("collection");
|
---|
573 | String archives_dir = gsdl_home + "/collect/" + collection + "/archives";
|
---|
574 | //String archives_dir = conf.get("archivesdir");
|
---|
575 | // Eventually I'd like to read in database type from collect.cfg - or
|
---|
576 | // maybe have it passed in as part of the context - but I'll hardcode
|
---|
577 | // as GDBM for now as a proof of concept
|
---|
578 | String infodbtype = "tdb";
|
---|
579 |
|
---|
580 | String key_string = key.toString();
|
---|
581 | String[] key_parts = key_string.split(" ");
|
---|
582 | if (key_parts.length >= 2)
|
---|
583 | {
|
---|
584 | String type = key_parts[0];
|
---|
585 | String argument = key_parts[1];
|
---|
586 |
|
---|
587 | Iterator<Text> values_itr = values.iterator();
|
---|
588 | // There are basically five different cases based on the key's type
|
---|
589 | // - the first is datestamp... these are sorted earliest first, and
|
---|
590 | // since we only want the earliest we can ignore the rest!
|
---|
591 | if (type.equals("datestamp"))
|
---|
592 | {
|
---|
593 | Text earliest_datestamp = values_itr.next();
|
---|
594 | // Write this directly to file
|
---|
595 | try
|
---|
596 | {
|
---|
597 | FileWriter earliest_datestamp_fout = new FileWriter(archives_dir + "/earliestDatestamp");
|
---|
598 | earliest_datestamp_fout.write(earliest_datestamp.toString());
|
---|
599 | earliest_datestamp_fout.close();
|
---|
600 | }
|
---|
601 | catch (Exception ex)
|
---|
602 | {
|
---|
603 | ex.printStackTrace();
|
---|
604 | }
|
---|
605 | }
|
---|
606 | // For 'doc' types we open a pipe to the database creator and send all
|
---|
607 | // the values through for processing
|
---|
608 | else if (type.equals("doc"))
|
---|
609 | {
|
---|
610 | GSInfoDB archiveinf_doc = new GSInfoDB(gsdl_home, infodbtype, archives_dir + "/archiveinf-doc." + infodbtype);
|
---|
611 | while (values_itr.hasNext())
|
---|
612 | {
|
---|
613 | Text db_entry = values_itr.next();
|
---|
614 | archiveinf_doc.writeEntry(db_entry.toString());
|
---|
615 | }
|
---|
616 | archiveinf_doc.close();
|
---|
617 | }
|
---|
618 | // Similarly for 'src' types, except here we group all entries with the
|
---|
619 | // same key
|
---|
620 | else if (type.equals("src"))
|
---|
621 | {
|
---|
622 | // Sigh - you can only create this the TDB file on a local
|
---|
623 | // filesystem, so I have to create it here, and then move it into
|
---|
624 | // place when finished
|
---|
625 | GSInfoDB archiveinf_src = new GSInfoDB(gsdl_home, infodbtype, archives_dir + "/archiveinf-src." + infodbtype);
|
---|
626 | String current_file_path = "";
|
---|
627 | StringBuffer current_record = new StringBuffer();
|
---|
628 | Pattern file_path_pattern = Pattern.compile("(.*?)\\|(.*)");
|
---|
629 | while (values_itr.hasNext())
|
---|
630 | {
|
---|
631 | Text db_entry_raw = values_itr.next();
|
---|
632 | String db_entry = db_entry_raw.toString();
|
---|
633 | // Parse out the file path this entry refers to
|
---|
634 | Matcher file_path_matcher = file_path_pattern.matcher(db_entry);
|
---|
635 | if (file_path_matcher.matches())
|
---|
636 | {
|
---|
637 | String this_file_path = file_path_matcher.group(1);
|
---|
638 | String this_record = file_path_matcher.group(2);
|
---|
639 | // Output the record (if there is one) if the file path changes
|
---|
640 | if (!this_file_path.equals(current_file_path) && current_record.length() > 0)
|
---|
641 | {
|
---|
642 | archiveinf_src.writeEntry("[" + current_file_path + "]\n" + current_record.toString());
|
---|
643 | // store the next records details
|
---|
644 | current_file_path = this_file_path;
|
---|
645 | current_record = new StringBuffer(this_record);
|
---|
646 | }
|
---|
647 | // Append onto our growing record
|
---|
648 | else
|
---|
649 | {
|
---|
650 | current_file_path = this_file_path;
|
---|
651 | if (current_record.length() > 0)
|
---|
652 | {
|
---|
653 | current_record.append("\n");
|
---|
654 | }
|
---|
655 | current_record.append(this_record);
|
---|
656 | }
|
---|
657 | }
|
---|
658 | else
|
---|
659 | {
|
---|
660 | // Not a valid src entry?
|
---|
661 | }
|
---|
662 | }
|
---|
663 | if (!current_file_path.equals("") && current_record.length() > 0)
|
---|
664 | {
|
---|
665 | archiveinf_src.writeEntry("[" + current_file_path + "]\n" + current_record.toString());
|
---|
666 | }
|
---|
667 | archiveinf_src.close();
|
---|
668 | }
|
---|
669 | // For 'rss' we write all the entries - in order - to an XML file
|
---|
670 | else if (type.equals("rss"))
|
---|
671 | {
|
---|
672 | try
|
---|
673 | {
|
---|
674 | FileWriter rss_item_rdf = new FileWriter(archives_dir + "/rss-items.rdf");
|
---|
675 | while (values_itr.hasNext())
|
---|
676 | {
|
---|
677 | Text rss_entry = values_itr.next();
|
---|
678 | rss_item_rdf.write(rss_entry.toString() + "\n");
|
---|
679 | }
|
---|
680 | rss_item_rdf.close();
|
---|
681 | }
|
---|
682 | catch (Exception ex)
|
---|
683 | {
|
---|
684 | ex.printStackTrace();
|
---|
685 | }
|
---|
686 | }
|
---|
687 | // Everything else we assume are just process log messages - so get
|
---|
688 | // Hadoop to write in order to log (I may need to annotate these
|
---|
689 | // with the host so I can see which message came from which compute
|
---|
690 | // node).
|
---|
691 | else
|
---|
692 | {
|
---|
693 | Pattern file_path_pattern = Pattern.compile("([^ ]+) (.*)");
|
---|
694 | while (values_itr.hasNext())
|
---|
695 | {
|
---|
696 | String compound_value = values_itr.next().toString();
|
---|
697 | Pattern p = Pattern.compile("([^\\s]+) (.*)");
|
---|
698 | Matcher m = p.matcher(compound_value);
|
---|
699 | if (m.matches())
|
---|
700 | {
|
---|
701 | Text msg_key = new Text(m.group(1));
|
---|
702 | Text msg_value = new Text(m.group(2));
|
---|
703 | context.write(msg_key, msg_value);
|
---|
704 | }
|
---|
705 | }
|
---|
706 | }
|
---|
707 | }
|
---|
708 | else
|
---|
709 | {
|
---|
710 | System.err.println("Error! Failed to parse key: " + key.toString());
|
---|
711 | }
|
---|
712 | }
|
---|
713 | /** reduce(key, value, context) **/
|
---|
714 |
|
---|
715 | }
|
---|
716 | /** class GSReducer **/
|
---|
717 |
|
---|
718 | /** @function main
|
---|
719 | */
|
---|
720 | public static void main(String[] args)
|
---|
721 | throws Exception
|
---|
722 | {
|
---|
723 | if (args.length < 6)
|
---|
724 | {
|
---|
725 | System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hadoop home> <collection> <archivesdir> <hdfsprefix> <hdfsin> <hdfsout>\n");
|
---|
726 | System.exit(0);
|
---|
727 | }
|
---|
728 |
|
---|
729 | Configuration conf = new Configuration();
|
---|
730 | conf.set("gsdlhome", args[0]);
|
---|
731 | conf.set("hadoophome", args[1]);
|
---|
732 | conf.set("collection", args[2]);
|
---|
733 | conf.set("archivesdir", args[3]);
|
---|
734 | conf.set("hdfsprefix", args[4]); // "HDThriftFS", "HDFSShell", or ""
|
---|
735 | conf.set("hdfsin", args[5]);
|
---|
736 | conf.set("hdfsout", args[6]);
|
---|
737 |
|
---|
738 | // Set the number of retries to 1 - hopefully one of the following will work
|
---|
739 | conf.setInt("mapred.map.max.attempts", 1); // Old Hadoop
|
---|
740 | conf.setInt("mapreduce.map.maxattempts", 1); // Hadoop 2.0.3-alpha
|
---|
741 | conf.setInt("mapreduce.map.max.attempts", 1); // Solution on Web
|
---|
742 | // prevent timeouts
|
---|
743 | long milli_seconds = 4*60*60*1000; // 4 hour
|
---|
744 | conf.setLong("mapred.task.timeout", milli_seconds);
|
---|
745 |
|
---|
746 | Job job = new Job(conf, "hadoopgreenstoneingest");
|
---|
747 | job.setJarByClass(HadoopGreenstoneIngest2.class);
|
---|
748 |
|
---|
749 | job.setOutputKeyClass(Text.class);
|
---|
750 | job.setOutputValueClass(Text.class);
|
---|
751 |
|
---|
752 | // Register the map, combiner, and reducer classes
|
---|
753 | job.setMapperClass(GSMap.class);
|
---|
754 | job.setPartitionerClass(GSPartitioner.class);
|
---|
755 | job.setGroupingComparatorClass(GSGroupingComparator.class);
|
---|
756 | job.setReducerClass(GSReducer.class);
|
---|
757 |
|
---|
758 | // Sets the input and output handlers - may need to adjust input to provide
|
---|
759 | // a series of filenames (TextInputFormat will instead read in a text file
|
---|
760 | // and return each line...)
|
---|
761 | job.setInputFormatClass(GSFileInputFormat.class);
|
---|
762 | //job.setOutputFormatClass(NullOutputFormat.class);
|
---|
763 | job.setOutputFormatClass(TextOutputFormat.class);
|
---|
764 |
|
---|
765 | // Register the input and output paths
|
---|
766 | // - this input path should be to a file (in HDFS) that lists the paths to
|
---|
767 | // the manifest files
|
---|
768 | FileInputFormat.setInputPaths(job, new Path(conf.get("hdfsin")));
|
---|
769 | // - for now the output isn't that important, but in the future I may use
|
---|
770 | // this mechanism to produce a time based log.
|
---|
771 | FileOutputFormat.setOutputPath(job, new Path(conf.get("hdfsout")));
|
---|
772 |
|
---|
773 | // Recommended notation despite my hatiness of ?: syntax
|
---|
774 | System.exit(job.waitForCompletion(true)?0:1);
|
---|
775 | }
|
---|
776 | /** main(String[]) **/
|
---|
777 |
|
---|
778 | /** @function copyFile(File, File)
|
---|
779 | *
|
---|
780 | * @author Josh Froelich @ stackoverflow.com
|
---|
781 | */
|
---|
782 | public static void copyFile(File sourceFile, File destFile)
|
---|
783 | throws IOException
|
---|
784 | {
|
---|
785 | if(!destFile.exists())
|
---|
786 | {
|
---|
787 | destFile.createNewFile();
|
---|
788 | }
|
---|
789 | FileChannel source = null;
|
---|
790 | FileChannel destination = null;
|
---|
791 | try
|
---|
792 | {
|
---|
793 | source = new FileInputStream(sourceFile).getChannel();
|
---|
794 | destination = new FileOutputStream(destFile).getChannel();
|
---|
795 | destination.transferFrom(source, 0, source.size());
|
---|
796 | }
|
---|
797 | finally
|
---|
798 | {
|
---|
799 | if(source != null)
|
---|
800 | {
|
---|
801 | source.close();
|
---|
802 | }
|
---|
803 | if(destination != null)
|
---|
804 | {
|
---|
805 | destination.close();
|
---|
806 | }
|
---|
807 | }
|
---|
808 | }
|
---|
809 | /** copyFile(File, File) **/
|
---|
810 |
|
---|
811 |
|
---|
812 | /** @function runCommand()
|
---|
813 | *
|
---|
814 | * A convenience method that calls an external command and returns its
|
---|
815 | * standard out as a string. Warning! Not safe if the command could return a
|
---|
816 | * large amount of text in the STDERR stream - may infinitely block.
|
---|
817 | *
|
---|
818 | */
|
---|
819 | public static String runCommand(String command)
|
---|
820 | {
|
---|
821 | ///ystem.err.println("[DEBUG] command: " + command);
|
---|
822 | StringBuffer result = new StringBuffer();
|
---|
823 | try
|
---|
824 | {
|
---|
825 | Runtime run = Runtime.getRuntime() ;
|
---|
826 | Process pr = run.exec(command) ;
|
---|
827 | pr.waitFor() ;
|
---|
828 | BufferedReader buf = new BufferedReader( new InputStreamReader( pr.getInputStream() ) ) ;
|
---|
829 | String line;
|
---|
830 | while ( ( line = buf.readLine() ) != null )
|
---|
831 | {
|
---|
832 | result.append(line);
|
---|
833 | }
|
---|
834 | }
|
---|
835 | catch (Exception ex)
|
---|
836 | {
|
---|
837 | System.err.println("Error! " + ex.getMessage());
|
---|
838 | }
|
---|
839 | ///ystem.err.println("[DEBUG] result: " + result.toString());
|
---|
840 | return result.toString();
|
---|
841 | }
|
---|
842 | /** runCommand() **/
|
---|
843 | }
|
---|