source: gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest.java@ 27641

Last change on this file since 27641 was 27641, checked in by jmt12, 11 years ago

Altered the order of arguments and allowed the archives dir to be passed as an argument - both of which were needed to support NFS access to HDFS

File size: 14.5 KB
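For reference, main() below expects its arguments in the revised order <gsdlhome> <hadoop home> <collection> <archivesdir> <hdfsprefix> <hdfsin> <hdfsout>. A sketch of an invocation follows (the paths and collection name are purely illustrative; "/hdfs" as the prefix is the NFS-mounted HDFS case this change supports):

  bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest \
    /opt/greenstone2 /opt/hadoop demo /opt/greenstone2/collect/demo/archives \
    /hdfs hdfs:///user/gsdl/demo-import hdfs:///user/gsdl/demo-out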
/** jmt12 **/
package org.nzdl.gsdl;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.ProcessBuilder;
import java.lang.ProcessBuilder.*;
import java.lang.Thread;
import java.net.InetAddress;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.*;

/** @class HadoopGreenstoneIngest
 */
public class HadoopGreenstoneIngest
{

  /** @class GSFileRecordReader
   */
  public static class GSFileRecordReader
    extends RecordReader<Text, IntWritable>
  {
    /** The full path of the file this record refers to */
    private Text current_key;

    private IntWritable current_value = new IntWritable(1);

    /** Used to indicate progress */
    private boolean is_finished = false;

    /**
     * Record the path of the file backing this split as the key
     */
    @Override
    public void initialize( InputSplit inputSplit, TaskAttemptContext taskAttemptContext )
      throws IOException, InterruptedException
    {
      FileSplit split = (FileSplit) inputSplit;
      current_key = new Text(split.getPath().toString());
    }
    /** initialize() **/

    /**
     * We only ever have a single key/value
     */
    @Override
    public boolean nextKeyValue()
      throws IOException, InterruptedException
    {
      if (!is_finished)
      {
        is_finished = true;
        return true;
      }
      return false;
    }
    /** nextKeyValue() **/

    /** @function getProgress
     * Rather than calculating progress, we just keep it simple
     */
    @Override
    public float getProgress()
      throws IOException, InterruptedException
    {
      return is_finished ? 1 : 0;
    }
    /** getProgress() **/

    /**
     * Returns the current key (the full path of the file to process)
     */
    @Override
    public Text getCurrentKey()
      throws IOException, InterruptedException
    {
      return current_key;
    }
    /** getCurrentKey() **/

    /**
     * Returns the current value (a constant 1; the file itself is read by the import process)
     */
    @Override
    public IntWritable getCurrentValue()
      throws IOException, InterruptedException
    {
      return current_value;
    }
    /** getCurrentValue() **/

    /**
     * Close quietly, ignoring any exceptions
     */
    @Override
    public void close()
      throws IOException
    {
      // nothing to do
    }
    /** close() **/

  }
  /** GSFileRecordReader **/

  /** @class GSFileInputFormat
   */
  public static class GSFileInputFormat
    extends FileInputFormat<Text, IntWritable>
  {
    /**
     * Don't split the files
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename)
    {
      return false;
    }
    /** isSplitable() **/

    /**
     * Hand each (unsplit) file to its own GSFileRecordReader
     */
    @Override
    public RecordReader<Text, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException
    {
      return new GSFileRecordReader();
    }
    /** createRecordReader() **/

  }
  /** class GSFileInputFormat **/

  /** @class GSMap
   */
  public static class GSMap
    extends Mapper<Text, IntWritable, Text, IntWritable>
  {
    /** @function map
     * The key is the full path (HDFS) of the file to be processed.
     */
    public void map(Text key, IntWritable value, Context context)
      throws IOException, InterruptedException
    {
      String file_path = key.toString();
      // - configuration for the task
      Configuration conf = context.getConfiguration();
      String gsdlhome = conf.get("gsdlhome");
      String hdfs_prefix = conf.get("hdfsprefix");
      String hadoop_home = conf.get("hadoophome");
      String collection = conf.get("collection");
      String task_id = conf.get("mapred.task.id");
      task_id = task_id.substring(8); // remove "attempt_" prefix
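      // e.g. a task id such as "attempt_201303040953_0001_m_000000_0" (illustrative
      // value) becomes "201303040953_0001_m_000000_0" once the prefix is removed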

      // Programmatically rewrite the protocol as appropriate for the given
      // archives directory (not necessary if path is local or NFS)
      if (hdfs_prefix.equals("/hdfs"))
      {
        file_path = file_path.replaceFirst("hdfs://[^/]*", hdfs_prefix);
      }
      else
      {
        file_path = file_path.replace("hdfs://", hdfs_prefix);
      }
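      // For the NFS-mounted case, for example (host name purely illustrative),
      // "hdfs://namenode:54310/user/gsdl/import/doc.txt" is rewritten to
      // "/hdfs/user/gsdl/import/doc.txt"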

      // - create a temporary directory
      File greenstone_tmp_dir = new File("/tmp/greenstone");
      if (!greenstone_tmp_dir.isDirectory())
      {
        greenstone_tmp_dir.mkdir();
      }

      // - open a unique log file
      File import_process_log = new File("/tmp/greenstone/import-hadoop-" + task_id + ".log");
      // - start the log by writing the time and the manifest line
      FileWriter fw1 = new FileWriter(import_process_log, true);
      long start_time = System.currentTimeMillis()/1000;
      StringBuffer header_block = new StringBuffer("[Started:");
      header_block.append(start_time);
      header_block.append("]\n[Host:");
      header_block.append(InetAddress.getLocalHost().getHostName());
      header_block.append("]\n[CPU:");
      String getcpu_executable_cmd = gsdlhome + "/ext/parallel-building/linux/bin/getcpu";
      File getcpu_executable = new File(getcpu_executable_cmd);
      if (getcpu_executable.exists())
      {
        header_block.append(runCommand(getcpu_executable_cmd));
      }
      else
      {
        header_block.append("0");
      }
      header_block.append("]\n[Task:");
      header_block.append(task_id);
      header_block.append("]\n[Map:");
      header_block.append(file_path);
      header_block.append("=>");
      header_block.append(value);
      header_block.append("]\n");
      fw1.write(header_block.toString());
      header_block = null;

      // - create a temporary manifest file to process this file. Overwrite any
      //   existing file
      File manifest_path = new File("/tmp/greenstone/manifest" + task_id + ".xml");
      FileWriter manifest_writer = new FileWriter(manifest_path);
      manifest_writer.write("<Manifest version=\"2.0\">\n");
      manifest_writer.write("\t<Index>\n");
      manifest_writer.write("\t\t<Filename>" + file_path + "</Filename>\n");
      manifest_writer.write("\t</Index>\n");
      manifest_writer.write("</Manifest>\n");
      manifest_writer.close();

      // - call Greenstone passing in the path to the manifest
      ProcessBuilder import_process_builder = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-archivedir", conf.get("archivesdir"), collection);
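      // The command run is, in effect (task_id and configured values filled in at runtime):
      //   time -p import.pl -manifest /tmp/greenstone/manifest<task_id>.xml \
      //     -keepold -archivedir <archivesdir> <collection>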
      fw1.write("[Command:" + import_process_builder.command() + "]\n");
      // - alter environment
      Map<String, String> import_process_env = import_process_builder.environment();
      // - path
      String path = import_process_env.get("PATH");
      path = gsdlhome + "/ext/parallel-building/bin/script:" + path;
      path = gsdlhome + "/ext/parallel-building/linux/bin:" + path;
      path = hadoop_home + "/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/linux/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/bin/script:" + path;
      path = gsdlhome + "/ext/video-and-audio/linux/bin:" + path;
      path = gsdlhome + "/bin/script:" + path;
      path = gsdlhome + "/bin/linux:" + path;
      import_process_env.put("PATH", path);
      fw1.write("[PATH: " + path + "]\n");
      // - ld_library_path
      import_process_env.put("LD_LIBRARY_PATH", gsdlhome + "/ext/parallel-building/linux/lib:" + gsdlhome + "/ext/hadoop/linux/lib:" + gsdlhome + "/ext/video-and-audio/linux/lib:" + gsdlhome + "/ext/tdb-edit/linux/lib");
      // - dyld_library_path
      import_process_env.put("DYLD_LIBRARY_PATH", gsdlhome + "/ext/video-and-audio/linux/lib");
      // - misc
      import_process_env.put("GSDLHOME", gsdlhome);
      import_process_env.put("GSDLOS", "linux");
      import_process_env.put("GSDLEXTS", "parallel-building:tdb-edit:video-and-audio");
      // - installed extension paths
      import_process_env.put("GEXTPARALLELBUILDING", gsdlhome + "/ext/parallel-building");
      import_process_env.put("GEXTPARALLELBUILDING_INSTALLED", gsdlhome + "/ext/parallel-building/linux");
      import_process_env.put("GEXTTDBEDIT_INSTALLED", gsdlhome + "/ext/tdb-edit/linux");
      import_process_env.put("GEXTVIDEO_INSTALLED", gsdlhome + "/ext/video-and-audio/linux");
      // - Hadoop specific
      import_process_env.put("HADOOP_PREFIX", hadoop_home);
      fw1.write("[HADOOP_PREFIX: " + hadoop_home + "]\n");

      // - change working directory
      import_process_builder.directory(new File(gsdlhome));
      // - close our output to the log before opening in the process
      fw1.close();

      // - write output to log
      import_process_builder.redirectErrorStream(true);
      import_process_builder.redirectOutput(Redirect.appendTo(import_process_log));

      // - create progress reporter (so Hadoop doesn't time us out)
      Thread reporter = new HadoopProgressReporter(context, import_process_log);
      reporter.start();

      // - run process
      Process import_process = import_process_builder.start();
      try
      {
        int import_status = import_process.waitFor();
        if (import_status != 0)
        {
          throw new Exception("exit status: " + import_status);
        }
      }
      catch (Exception e)
      {
        System.err.println("Error! Import command failed (" + e.toString() + ")");
      }

      // - stop the progress reporter as, one way or another, there will be no
      //   more progress
      reporter.interrupt();
      reporter = null; // force gc

      // - write end time to log
      FileWriter fw2 = new FileWriter(import_process_log, true);
      long end_time = System.currentTimeMillis()/1000;
      fw2.write("[Completed:" + end_time + "]\n");
      fw2.close();

      // - for now return a dummy output. In the future I may want to parse the
      //   output from Greenstone and allow a reduce phase to produce a pretty
      //   time-based log
      context.write(key, value);
    }
    /** map(Text,IntWritable,Context) **/

  }
  /** class GSMap **/

  /** @function main
   */
  public static void main(String[] args)
    throws Exception
  {
    if (args.length < 7)
    {
      System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hadoop home> <collection> <archivesdir> <hdfsprefix> <hdfsin> <hdfsout>\n");
      System.exit(0);
    }

    Configuration conf = new Configuration();
    conf.set("gsdlhome", args[0]);
    conf.set("hadoophome", args[1]);
    conf.set("collection", args[2]);
    conf.set("archivesdir", args[3]);
    conf.set("hdfsprefix", args[4]); // "HDThriftFS", "HDFSShell", or ""
    conf.set("hdfsin", args[5]);
    conf.set("hdfsout", args[6]);

    // Set the number of retries to 1 - hopefully one of the following will work
    conf.setInt("mapred.map.max.attempts", 1); // Old Hadoop
    conf.setInt("mapreduce.map.maxattempts", 1); // Hadoop 2.0.3-alpha
    conf.setInt("mapreduce.map.max.attempts", 1); // Solution on Web
    // prevent timeouts
    long milli_seconds = 4*60*60*1000; // 4 hours
    conf.setLong("mapred.task.timeout", milli_seconds);
    Job job = new Job(conf, "hadoopgreenstoneingest");
    job.setJarByClass(HadoopGreenstoneIngest.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Register the map, combiner, and reducer classes
    job.setMapperClass(GSMap.class);
    // - in theory, uses the IdentityReducer by default, which simply returns
    //   the input as the output (so no processing)
    job.setNumReduceTasks(0);

    // Sets the input and output handlers - may need to adjust input to provide me
    // a series of filenames (TextInputFormat will instead read in a text file and
    // return each line...)
    job.setInputFormatClass(GSFileInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);

    // Register the input and output paths
    // - this input path should be to a file (in HDFS) that lists the paths to
    //   the manifest files
    FileInputFormat.setInputPaths(job, new Path(conf.get("hdfsin")));
    // - for now the output isn't that important, but in the future I may use
    //   this mechanism to produce a time based log.
    FileOutputFormat.setOutputPath(job, new Path(conf.get("hdfsout")));

    // Recommended notation despite my hatred of the ?: syntax
    System.exit(job.waitForCompletion(true)?0:1);
  }
  /** main(String[]) **/

  /** @function runCommand()
   *
   * A convenience method that calls an external command and returns its
   * standard out as a string. Warning! Not safe if the command could return a
   * large amount of text in the STDERR stream - may infinitely block.
   *
   */
  public static String runCommand(String command)
  {
    StringBuffer result = new StringBuffer();
    try
    {
      Runtime run = Runtime.getRuntime();
      Process pr = run.exec(command);
      pr.waitFor();
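      // Note: stderr is never drained and stdout is only read after waitFor()
      // returns, so a child that writes more than the pipe buffers can hold will
      // block - hence the warning in the comment above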
      BufferedReader buf = new BufferedReader(new InputStreamReader(pr.getInputStream()));
      String line;
      while ((line = buf.readLine()) != null)
      {
        result.append(line);
      }
    }
    catch (Exception ex)
    {
      System.err.println("Error! " + ex.getMessage());
    }
    return result.toString();
  }
  /** runCommand() **/
}

class HadoopProgressReporter
  extends Thread
{

  private Context hadoop_process;

  private File log_file;

  HadoopProgressReporter(Context hadoop_process, File log_file)
  {
    this.hadoop_process = hadoop_process;
    //this.log_file = log_file;
    this.log_file = new File("/tmp/hadoop_progress_reporter.log");
  }

  public void run()
  {
    try
    {
      while (!this.isInterrupted())
      {
        sleep(60000); // Wait a minute
        //FileWriter fw1 = new FileWriter(this.log_file, true);
        //long time = System.currentTimeMillis()/1000;
        //fw1.write("[" + time + "] HadoopProgressReporter.progress()\n");
        //fw1.close();
        this.hadoop_process.progress(); // Inform Hadoop we are still processing
      }
    }
    catch (InterruptedException iex)
    {
      // We've been interrupted: no more progress
    }
    catch (Exception ex)
    {
      ex.printStackTrace();
    }
  }
}