source: gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest.java@ 27641

Last change on this file since 27641 was 27641, checked in by jmt12, 11 years ago

Altered the order of arguments and allowed the archives dir to be passed as an argument - both of which were needed to support NFS access to HDFS

File size: 14.5 KB
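For reference, main() below expects its arguments in the revised order <gsdlhome> <hadoop home> <collection> <archivesdir> <hdfsprefix> <hdfsin> <hdfsout>. A sketch of an invocation follows (the paths and collection name are purely illustrative; "/hdfs" as the prefix is the NFS-mounted HDFS case this change supports):

  bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest \
    /opt/greenstone2 /opt/hadoop demo /opt/greenstone2/collect/demo/archives \
    /hdfs hdfs:///user/gsdl/demo-import hdfs:///user/gsdl/demo-out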
/** jmt12 **/
package org.nzdl.gsdl;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.ProcessBuilder;
import java.lang.ProcessBuilder.*;
import java.lang.Thread;
import java.net.InetAddress;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.*;

/** @class HadoopGreenstoneIngest
 */
public class HadoopGreenstoneIngest
{

  /** @class GSFileRecordReader
   */
  public static class GSFileRecordReader
    extends RecordReader<Text, IntWritable>
  {
    /** The full path of the file this record refers to */
    private Text current_key;

    private IntWritable current_value = new IntWritable(1);

    /** Used to indicate progress */
    private boolean is_finished = false;

    /**
     * Record the path of the file backing this split as the key
     */
    @Override
    public void initialize( InputSplit inputSplit, TaskAttemptContext taskAttemptContext )
      throws IOException, InterruptedException
    {
      FileSplit split = (FileSplit) inputSplit;
      current_key = new Text(split.getPath().toString());
    }
    /** initialize() **/

    /**
     * We only ever have a single key/value
     */
    @Override
    public boolean nextKeyValue()
      throws IOException, InterruptedException
    {
      if (!is_finished)
      {
        is_finished = true;
        return true;
      }
      return false;
    }
    /** nextKeyValue() **/

    /** @function getProgress
     * Rather than calculating progress, we just keep it simple
     */
    @Override
    public float getProgress()
      throws IOException, InterruptedException
    {
      return is_finished ? 1 : 0;
    }
    /** getProgress() **/

    /**
     * Returns the current key (the full path of the file to process)
     */
    @Override
    public Text getCurrentKey()
      throws IOException, InterruptedException
    {
      return current_key;
    }
    /** getCurrentKey() **/

    /**
     * Returns the current value (a constant 1; the file itself is read by the import process)
     */
    @Override
    public IntWritable getCurrentValue()
      throws IOException, InterruptedException
    {
      return current_value;
    }
    /** getCurrentValue() **/

    /**
     * Close quietly, ignoring any exceptions
     */
    @Override
    public void close()
      throws IOException
    {
      // nothing to do
    }
    /** close() **/

  }
  /** GSFileRecordReader **/

  /** @class GSFileInputFormat
   */
  public static class GSFileInputFormat
    extends FileInputFormat<Text, IntWritable>
  {
    /**
     * Don't split the files
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename)
    {
      return false;
    }
    /** isSplitable() **/

    /**
     * Hand each (unsplit) file to its own GSFileRecordReader
     */
    @Override
    public RecordReader<Text, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException
    {
      return new GSFileRecordReader();
    }
    /** createRecordReader() **/

  }
  /** class GSFileInputFormat **/

  /** @class GSMap
   */
  public static class GSMap
    extends Mapper<Text, IntWritable, Text, IntWritable>
  {
    /** @function map
     * The key is the full path (HDFS) of the file to be processed.
     */
    public void map(Text key, IntWritable value, Context context)
      throws IOException, InterruptedException
    {
      String file_path = key.toString();
      // - configuration for the task
      Configuration conf = context.getConfiguration();
      String gsdlhome = conf.get("gsdlhome");
      String hdfs_prefix = conf.get("hdfsprefix");
      String hadoop_home = conf.get("hadoophome");
      String collection = conf.get("collection");
      String task_id = conf.get("mapred.task.id");
      task_id = task_id.substring(8); // remove "attempt_" prefix
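      // e.g. a task id such as "attempt_201303040953_0001_m_000000_0" (illustrative
      // value) becomes "201303040953_0001_m_000000_0" once the prefix is removed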

      // Programmatically rewrite the protocol as appropriate for the given
      // archives directory (not necessary if path is local or NFS)
      if (hdfs_prefix.equals("/hdfs"))
      {
        file_path = file_path.replaceFirst("hdfs://[^/]*", hdfs_prefix);
      }
      else
      {
        file_path = file_path.replace("hdfs://", hdfs_prefix);
      }
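      // For the NFS-mounted case, for example (host name purely illustrative),
      // "hdfs://namenode:54310/user/gsdl/import/doc.txt" is rewritten to
      // "/hdfs/user/gsdl/import/doc.txt"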

      // - create a temporary directory
      File greenstone_tmp_dir = new File("/tmp/greenstone");
      if (!greenstone_tmp_dir.isDirectory())
      {
        greenstone_tmp_dir.mkdir();
      }

      // - open a unique log file
      File import_process_log = new File("/tmp/greenstone/import-hadoop-" + task_id + ".log");
      // - start the log by writing the time and the manifest line
      FileWriter fw1 = new FileWriter(import_process_log, true);
      long start_time = System.currentTimeMillis()/1000;
      StringBuffer header_block = new StringBuffer("[Started:");
      header_block.append(start_time);
      header_block.append("]\n[Host:");
      header_block.append(InetAddress.getLocalHost().getHostName());
      header_block.append("]\n[CPU:");
      String getcpu_executable_cmd = gsdlhome + "/ext/parallel-building/linux/bin/getcpu";
      File getcpu_executable = new File(getcpu_executable_cmd);
      if (getcpu_executable.exists())
      {
        header_block.append(runCommand(getcpu_executable_cmd));
      }
      else
      {
        header_block.append("0");
      }
      header_block.append("]\n[Task:");
      header_block.append(task_id);
      header_block.append("]\n[Map:");
      header_block.append(file_path);
      header_block.append("=>");
      header_block.append(value);
      header_block.append("]\n");
      fw1.write(header_block.toString());
      header_block = null;

      // - create a temporary manifest file to process this file. Overwrite any
      //   existing file
      File manifest_path = new File("/tmp/greenstone/manifest" + task_id + ".xml");
      FileWriter manifest_writer = new FileWriter(manifest_path);
      manifest_writer.write("<Manifest version=\"2.0\">\n");
      manifest_writer.write("\t<Index>\n");
      manifest_writer.write("\t\t<Filename>" + file_path + "</Filename>\n");
      manifest_writer.write("\t</Index>\n");
      manifest_writer.write("</Manifest>\n");
      manifest_writer.close();

      // - call Greenstone passing in the path to the manifest
      ProcessBuilder import_process_builder = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-archivedir", conf.get("archivesdir"), collection);
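      // The command run is, in effect (task_id and configured values filled in at runtime):
      //   time -p import.pl -manifest /tmp/greenstone/manifest<task_id>.xml \
      //     -keepold -archivedir <archivesdir> <collection>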
      fw1.write("[Command:" + import_process_builder.command() + "]\n");
      // - alter environment
      Map<String, String> import_process_env = import_process_builder.environment();
      // - path
      String path = import_process_env.get("PATH");
      path = gsdlhome + "/ext/parallel-building/bin/script:" + path;
      path = gsdlhome + "/ext/parallel-building/linux/bin:" + path;
      path = hadoop_home + "/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/linux/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/bin/script:" + path;
      path = gsdlhome + "/ext/video-and-audio/linux/bin:" + path;
      path = gsdlhome + "/bin/script:" + path;
      path = gsdlhome + "/bin/linux:" + path;
      import_process_env.put("PATH", path);
      fw1.write("[PATH: " + path + "]\n");
      // - ld_library_path
      import_process_env.put("LD_LIBRARY_PATH", gsdlhome + "/ext/parallel-building/linux/lib:" + gsdlhome + "/ext/hadoop/linux/lib:" + gsdlhome + "/ext/video-and-audio/linux/lib:" + gsdlhome + "/ext/tdb-edit/linux/lib");
      // - dyld_library_path
      import_process_env.put("DYLD_LIBRARY_PATH", gsdlhome + "/ext/video-and-audio/linux/lib");
      // - misc
      import_process_env.put("GSDLHOME", gsdlhome);
      import_process_env.put("GSDLOS", "linux");
      import_process_env.put("GSDLEXTS", "parallel-building:tdb-edit:video-and-audio");
      // - installed extension paths
      import_process_env.put("GEXTPARALLELBUILDING", gsdlhome + "/ext/parallel-building");
      import_process_env.put("GEXTPARALLELBUILDING_INSTALLED", gsdlhome + "/ext/parallel-building/linux");
      import_process_env.put("GEXTTDBEDIT_INSTALLED", gsdlhome + "/ext/tdb-edit/linux");
      import_process_env.put("GEXTVIDEO_INSTALLED", gsdlhome + "/ext/video-and-audio/linux");
      // - Hadoop specific
      import_process_env.put("HADOOP_PREFIX", hadoop_home);
      fw1.write("[HADOOP_PREFIX: " + hadoop_home + "]\n");

      // - change working directory
      import_process_builder.directory(new File(gsdlhome));
      // - close our output to the log before opening in the process
      fw1.close();

      // - write output to log
      import_process_builder.redirectErrorStream(true);
      import_process_builder.redirectOutput(Redirect.appendTo(import_process_log));

      // - create progress reporter (so Hadoop doesn't time us out)
      Thread reporter = new HadoopProgressReporter(context, import_process_log);
      reporter.start();

      // - run process
      Process import_process = import_process_builder.start();
      try
      {
        int import_status = import_process.waitFor();
        if (import_status != 0)
        {
          throw new Exception("exit status: " + import_status);
        }
      }
      catch (Exception e)
      {
        System.err.println("Error! Import command failed (" + e.toString() + ")");
      }

      // - stop the progress reporter as, one way or another, there will be no
      //   more progress
      reporter.interrupt();
      reporter = null; // force gc

      // - write end time to log
      FileWriter fw2 = new FileWriter(import_process_log, true);
      long end_time = System.currentTimeMillis()/1000;
      fw2.write("[Completed:" + end_time + "]\n");
      fw2.close();

      // - for now return a dummy output. In the future I may want to parse the
      //   output from Greenstone and allow a reduce phase to produce a pretty
      //   time-based log
      context.write(key, value);
    }
    /** map(Text,IntWritable,Context) **/

  }
  /** class GSMap **/

  /** @function main
   */
  public static void main(String[] args)
    throws Exception
  {
    if (args.length < 7)
    {
      System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hadoop home> <collection> <archivesdir> <hdfsprefix> <hdfsin> <hdfsout>\n");
      System.exit(0);
    }

    Configuration conf = new Configuration();
    conf.set("gsdlhome", args[0]);
    conf.set("hadoophome", args[1]);
    conf.set("collection", args[2]);
    conf.set("archivesdir", args[3]);
    conf.set("hdfsprefix", args[4]); // "HDThriftFS", "HDFSShell", or ""
    conf.set("hdfsin", args[5]);
    conf.set("hdfsout", args[6]);

    // Set the number of retries to 1 - hopefully one of the following will work
    conf.setInt("mapred.map.max.attempts", 1); // Old Hadoop
    conf.setInt("mapreduce.map.maxattempts", 1); // Hadoop 2.0.3-alpha
    conf.setInt("mapreduce.map.max.attempts", 1); // Solution on Web
    // prevent timeouts
    long milli_seconds = 4*60*60*1000; // 4 hours
    conf.setLong("mapred.task.timeout", milli_seconds);
    Job job = new Job(conf, "hadoopgreenstoneingest");
    job.setJarByClass(HadoopGreenstoneIngest.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Register the map, combiner, and reducer classes
    job.setMapperClass(GSMap.class);
    // - in theory, uses the IdentityReducer by default, which simply returns
    //   the input as the output (so no processing)
    job.setNumReduceTasks(0);

    // Sets the input and output handlers - may need to adjust input to provide me
    // a series of filenames (TextInputFormat will instead read in a text file and
    // return each line...)
    job.setInputFormatClass(GSFileInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);

    // Register the input and output paths
    // - this input path should be to a file (in HDFS) that lists the paths to
    //   the manifest files
    FileInputFormat.setInputPaths(job, new Path(conf.get("hdfsin")));
    // - for now the output isn't that important, but in the future I may use
    //   this mechanism to produce a time based log.
    FileOutputFormat.setOutputPath(job, new Path(conf.get("hdfsout")));

    // Recommended notation despite my hatred of the ?: syntax
    System.exit(job.waitForCompletion(true)?0:1);
  }
  /** main(String[]) **/

  /** @function runCommand()
   *
   * A convenience method that calls an external command and returns its
   * standard out as a string. Warning! Not safe if the command could return a
   * large amount of text in the STDERR stream - may infinitely block.
   *
   */
  public static String runCommand(String command)
  {
    StringBuffer result = new StringBuffer();
    try
    {
      Runtime run = Runtime.getRuntime();
      Process pr = run.exec(command);
      pr.waitFor();
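      // Note: stderr is never drained and stdout is only read after waitFor()
      // returns, so a child that writes more than the pipe buffers can hold will
      // block - hence the warning in the comment above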
      BufferedReader buf = new BufferedReader(new InputStreamReader(pr.getInputStream()));
      String line;
      while ((line = buf.readLine()) != null)
      {
        result.append(line);
      }
    }
    catch (Exception ex)
    {
      System.err.println("Error! " + ex.getMessage());
    }
    return result.toString();
  }
  /** runCommand() **/
}

class HadoopProgressReporter
  extends Thread
{

  private Context hadoop_process;

  private File log_file;

  HadoopProgressReporter(Context hadoop_process, File log_file)
  {
    this.hadoop_process = hadoop_process;
    //this.log_file = log_file;
    this.log_file = new File("/tmp/hadoop_progress_reporter.log");
  }

  public void run()
  {
    try
    {
      while (!this.isInterrupted())
      {
        sleep(60000); // Wait a minute
        //FileWriter fw1 = new FileWriter(this.log_file, true);
        //long time = System.currentTimeMillis()/1000;
        //fw1.write("[" + time + "] HadoopProgressReporter.progress()\n");
        //fw1.close();
        this.hadoop_process.progress(); // Inform Hadoop we are still processing
      }
    }
    catch (InterruptedException iex)
    {
      // We've been interrupted: no more progress
    }
    catch (Exception ex)
    {
      ex.printStackTrace();
    }
  }
}