/** jmt12 **/
package org.nzdl.gsdl;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.ProcessBuilder;
import java.lang.ProcessBuilder.*;
import java.lang.Thread;
import java.net.InetAddress;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.*;

/** @class HadoopGreenstoneIngest
 * A Hadoop job that ingests files into a Greenstone collection. Each map
 * task receives the HDFS path of one input file, writes a single-entry
 * manifest for it, and shells out to Greenstone's import.pl.
 */
public class HadoopGreenstoneIngest
{

  /** @class GSFileRecordReader
   * Emits exactly one key/value pair per file: the key is the file's path
   * and the value is a constant 1.
   */
  public static class GSFileRecordReader
    extends RecordReader<Text, IntWritable>
  {
    /** The path of the file, used as the record's key */
    private Text current_key;

    private IntWritable current_value = new IntWritable(1);

    /** Used to indicate progress */
    private boolean is_finished = false;

    /**
     * Record the path of the (unsplittable) file backing this split; it
     * becomes the single key this reader emits.
     */
    @Override
    public void initialize( InputSplit inputSplit, TaskAttemptContext taskAttemptContext )
      throws IOException, InterruptedException
    {
      FileSplit split = (FileSplit) inputSplit;
      current_key = new Text(split.getPath().toString());
    }
    /** initialize() **/

    /**
     * We only ever have a single key/value
     */
    @Override
    public boolean nextKeyValue()
      throws IOException, InterruptedException
    {
      if (!is_finished)
      {
        is_finished = true;
        return true;
      }
      return false;
    }
    /** nextKeyValue() **/

    /** @function getProgress
     * Rather than calculating progress, we just keep it simple
     */
    @Override
    public float getProgress()
      throws IOException, InterruptedException
    {
      return is_finished ? 1 : 0;
    }
    /** getProgress() **/

    /**
     * Returns the current key (the path of the file)
     */
    @Override
    public Text getCurrentKey()
      throws IOException, InterruptedException
    {
      return current_key;
    }
    /** getCurrentKey() **/

    /**
     * Returns the current value (always 1)
     */
    @Override
    public IntWritable getCurrentValue()
      throws IOException, InterruptedException
    {
      return current_value;
    }
    /** getCurrentValue() **/

    /**
     * Close quietly, ignoring any exceptions
     */
    @Override
    public void close()
      throws IOException
    {
      // nothing to do
    }
    /** close() **/

  }
  /** GSFileRecordReader **/
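
  // Because GSFileRecordReader emits exactly one record per file, each
  // input file drives exactly one map() invocation in GSMap below.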

  /** @class GSFileInputFormat
   * Treats every input file as a single unsplittable record, serviced by
   * GSFileRecordReader above.
   */
  public static class GSFileInputFormat
    extends FileInputFormat<Text, IntWritable>
  {
    /**
     * Don't split the files
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename)
    {
      return false;
    }
    /** isSplitable() **/

    /**
     * Hand each split to a fresh GSFileRecordReader
     */
    @Override
    public RecordReader<Text, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException
    {
      return new GSFileRecordReader();
    }
    /** createRecordReader() **/

  }
  /** class GSFileInputFormat **/

  /** @class GSMap
   * The mapper: sets up a Greenstone environment and runs import.pl over
   * the single file named by the key.
   */
  public static class GSMap
    extends Mapper<Text, IntWritable, Text, IntWritable>
  {
    /** @function map
     * The key is the full path (HDFS) of the file to be processed; the
     * value is unused (always 1).
     */
    public void map(Text key, IntWritable value, Context context)
      throws IOException, InterruptedException
    {
      String file_path = key.toString();
      // - configuration for the task
      Configuration conf = context.getConfiguration();
      String gsdlhome = conf.get("gsdlhome");
      String hdfs_prefix = conf.get("hdfsprefix");
      String hadoop_prefix = conf.get("hadoopprefix");
      String collection = conf.get("collection");
      String task_id = conf.get("mapred.task.id");
      task_id = task_id.substring(8); // remove "attempt_" prefix
      // Programmatically rewrite the protocol as appropriate for the given
      // archives directory
      file_path = file_path.replace("hdfs://", hdfs_prefix);
      // - create a temporary directory
      File greenstone_tmp_dir = new File("/tmp/greenstone");
      if (!greenstone_tmp_dir.isDirectory())
      {
        greenstone_tmp_dir.mkdir();
      }
      // - open a unique log file
      File import_process_log = new File("/tmp/greenstone/import-hadoop-" + task_id + ".log");
      // - start the log by writing the time and the manifest line
      FileWriter fw1 = new FileWriter(import_process_log, true);
      long start_time = System.currentTimeMillis()/1000;
      StringBuffer header_block = new StringBuffer("[Started:");
      header_block.append(start_time);
      header_block.append("]\n[Host:");
      header_block.append(InetAddress.getLocalHost().getHostName());
      header_block.append("]\n[CPU:");
      String getcpu_executable_cmd = gsdlhome + "/ext/parallel-building/linux/bin/getcpu";
      File getcpu_executable = new File(getcpu_executable_cmd);
      if (getcpu_executable.exists())
      {
        header_block.append(runCommand(getcpu_executable_cmd));
      }
      else
      {
        header_block.append("0");
      }
      header_block.append("]\n[Task:");
      header_block.append(task_id);
      header_block.append("]\n[Map:");
      header_block.append(file_path);
      header_block.append("=>");
      header_block.append(value);
      header_block.append("]\n");
      fw1.write(header_block.toString());
      header_block = null;

      // - create a temporary manifest file to process this file. Overwrite
      //   any existing file
      File manifest_path = new File("/tmp/greenstone/manifest" + task_id + ".xml");
      FileWriter manifest_writer = new FileWriter(manifest_path);
      manifest_writer.write("<Manifest version=\"2.0\">\n");
      manifest_writer.write("\t<Index>\n");
      manifest_writer.write("\t\t<Filename>" + file_path + "</Filename>\n");
      manifest_writer.write("\t</Index>\n");
      manifest_writer.write("</Manifest>\n");
      manifest_writer.close();
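      // The manifest written above has the shape:
      //   <Manifest version="2.0">
      //     <Index>
      //       <Filename>{file_path}</Filename>
      //     </Index>
      //   </Manifest>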

      // - call Greenstone passing in the path to the manifest. Note that
      //   the archives directory is hard-coded under jmt12's HDFS home
      ProcessBuilder import_process_builder
        = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-verbosity", "42", "-archivedir", hdfs_prefix + "/user/jmt12/gsdl/collect/" + collection + "/archives", collection);
      fw1.write("[Command:" + import_process_builder.command() + "]\n");
      // - alter environment
      Map<String, String> import_process_env = import_process_builder.environment();
      // - path
      String path = import_process_env.get("PATH");
      path = gsdlhome + "/ext/parallel-building/bin/script:" + path;
      path = gsdlhome + "/ext/parallel-building/linux/bin:" + path;
      path = hadoop_prefix + "/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/linux/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/bin/script:" + path;
      path = gsdlhome + "/ext/video-and-audio/linux/bin:" + path;
      path = gsdlhome + "/bin/script:" + path;
      path = gsdlhome + "/bin/linux:" + path;
      import_process_env.put("PATH", path);
      fw1.write("[PATH: " + path + "]\n");
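      // Each assignment above prepends, so the last entry added
      // ($GSDLHOME/bin/linux) ends up first on the child's PATH and takes
      // precedence during lookup.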
      // - ld_library_path
      import_process_env.put("LD_LIBRARY_PATH", gsdlhome + "/ext/parallel-building/linux/lib:" + gsdlhome + "/ext/hadoop/linux/lib:" + gsdlhome + "/ext/video-and-audio/linux/lib:" + gsdlhome + "/ext/tdb-edit/linux/lib");
      // - dyld_library_path
      import_process_env.put("DYLD_LIBRARY_PATH", gsdlhome + "/ext/video-and-audio/linux/lib");
      // - misc
      import_process_env.put("GSDLHOME", gsdlhome);
      import_process_env.put("GSDLOS", "linux");
      import_process_env.put("GSDLEXTS", "parallel-building:tdb-edit:video-and-audio");
      // - installed extension paths
      import_process_env.put("GEXTPARALLELBUILDING", gsdlhome + "/ext/parallel-building");
      import_process_env.put("GEXTPARALLELBUILDING_INSTALLED", gsdlhome + "/ext/parallel-building/linux");
      import_process_env.put("GEXTTDBEDIT_INSTALLED", gsdlhome + "/ext/tdb-edit/linux");
      import_process_env.put("GEXTVIDEO_INSTALLED", gsdlhome + "/ext/video-and-audio/linux");
      // - Hadoop specific
      import_process_env.put("HADOOP_PREFIX", hadoop_prefix);
      fw1.write("[HADOOP_PREFIX: " + hadoop_prefix + "]\n");

      // - change working directory
      import_process_builder.directory(new File(gsdlhome));
      // - close our handle on the log before the child process opens it
      fw1.close();

      // - have the child append both stdout and stderr to the same log
      import_process_builder.redirectErrorStream(true);
      import_process_builder.redirectOutput(Redirect.appendTo(import_process_log));

      // - create progress reporter (so Hadoop doesn't time us out)
      Thread reporter = new HadoopProgressReporter(context, import_process_log);
      reporter.start();
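      // import.pl can run for hours without producing any map output; the
      // reporter thread calls context.progress() once a minute so the task
      // isn't killed for exceeding mapred.task.timeout (set in main()).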

      // - run process
      Process import_process = import_process_builder.start();
      try
      {
        int import_status = import_process.waitFor();
        if (import_status != 0)
        {
          throw new Exception("exit status: " + import_status);
        }
      }
      catch (Exception e)
      {
        System.err.println("Error! Import command failed (" + e.toString() + ")");
      }

      // - stop the progress reporter as, one way or another, there will be
      //   no more progress
      reporter.interrupt();
      reporter = null; // make eligible for GC

      // - write end time to log
      FileWriter fw2 = new FileWriter(import_process_log, true);
      long end_time = System.currentTimeMillis()/1000;
      fw2.write("[Completed:" + end_time + "]\n");
      fw2.close();

      // - for now return a dummy output. In the future I may want to parse
      //   the output from Greenstone and allow reducing to build a pretty
      //   time-based log
      context.write(key, value);
    }
    /** map(Text,IntWritable,Context) **/

  }
  /** class GSMap **/

  /** @function main
   */
  public static void main(String[] args)
    throws Exception
  {
    if (args.length < 6)
    {
      System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hdfsprefix> <hadoopprefix> <collection> <hdfsin> <hdfsout>\n");
      System.exit(0);
    }
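
    // An illustrative invocation (paths and collection name are examples):
    //   bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest \
    //     /home/jmt12/gsdl HDThriftFS:// /usr/local/hadoop demo \
    //     hdfs:///user/jmt12/input hdfs:///user/jmt12/output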

    Configuration conf = new Configuration();
    conf.set("gsdlhome", args[0]);
    conf.set("hdfsprefix", args[1]); // HDThriftFS or HDFSShell
    conf.set("hadoopprefix", args[2]);
    conf.set("collection", args[3]);
    // Set the number of retries to 1 - hopefully one of the following will
    // work
    conf.setInt("mapred.map.max.attempts", 1); // Old Hadoop
    conf.setInt("mapreduce.map.maxattempts", 1); // Hadoop 2.0.3-alpha
    conf.setInt("mapreduce.map.max.attempts", 1); // Solution on Web
    // prevent timeouts
    long milli_seconds = 4*60*60*1000; // 4 hours
    conf.setLong("mapred.task.timeout", milli_seconds);
    Job job = new Job(conf, "hadoopgreenstoneingest");
    job.setJarByClass(HadoopGreenstoneIngest.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Register the map, combiner, and reducer classes
    job.setMapperClass(GSMap.class);
    // - with zero reduce tasks there is no reduce phase at all: the map
    //   output is handed straight to the output format
    job.setNumReduceTasks(0);

    // Sets the input and output handlers - may need to adjust input to
    // provide me a series of filenames (TextInputFormat would instead read
    // in a text file and return each line...)
    job.setInputFormatClass(GSFileInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);

    // Register the input and output paths
    // - this input path should be to a file (in HDFS) that lists the paths
    //   to the manifest files
    FileInputFormat.setInputPaths(job, new Path(args[4]));
    // - for now the output isn't that important, but in the future I may
    //   use this mechanism to produce a time-based log.
    FileOutputFormat.setOutputPath(job, new Path(args[5]));

    // Recommended notation despite my dislike of the ?: syntax
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
  /** main(String[]) **/

  /** @function runCommand()
   *
   * A convenience method that calls an external command and returns its
   * standard out as a string. The output is consumed before waiting on the
   * process, but beware: a command that writes a large amount of text to
   * STDERR may still block indefinitely, as that stream is never drained.
   */
  public static String runCommand(String command)
  {
    StringBuffer result = new StringBuffer();
    try
    {
      Runtime run = Runtime.getRuntime();
      Process pr = run.exec(command);
      // - read stdout to EOF *before* waiting, otherwise a full pipe
      //   buffer would deadlock both processes
      BufferedReader buf = new BufferedReader( new InputStreamReader( pr.getInputStream() ) );
      String line;
      while ( ( line = buf.readLine() ) != null )
      {
        result.append(line);
      }
      pr.waitFor();
    }
    catch (Exception ex)
    {
      System.err.println("Error! " + ex.getMessage());
    }
    return result.toString();
  }
  /** runCommand() **/
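
  // Note that readLine() strips newlines and the loop appends without a
  // separator, so any multi-line output comes back as one concatenated
  // string.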
}
/** class HadoopGreenstoneIngest **/

/** @class HadoopProgressReporter
 * A helper thread that pings Hadoop's progress callback once a minute
 * while the external import process runs, so the task isn't timed out.
 */
class HadoopProgressReporter
  extends Thread
{

  private Context hadoop_process;

  private File log_file;

  HadoopProgressReporter(Context hadoop_process, File log_file)
  {
    this.hadoop_process = hadoop_process;
    // For now the supplied log file is ignored in favour of a hard-coded
    // debugging log
    //this.log_file = log_file;
    this.log_file = new File("/tmp/hadoop_progress_reporter.log");
  }

  public void run()
  {
    try
    {
      while (!this.isInterrupted())
      {
        sleep(60000); // Wait a minute
        //FileWriter fw1 = new FileWriter(this.log_file, true);
        //long time = System.currentTimeMillis()/1000;
        //fw1.write("[" + time + "] HadoopProgressReporter.progress()\n");
        //fw1.close();
        this.hadoop_process.progress(); // Inform Hadoop we are still processing
      }
    }
    catch (InterruptedException iex)
    {
      // We've been interrupted: no more progress
    }
    catch (Exception ex)
    {
      ex.printStackTrace();
    }
  }
}
/** class HadoopProgressReporter **/