source: gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest.java@ 26982

Last change on this file since 26982 was 26982, checked in by jmt12, 11 years ago

Altered environment setup for subprocesses (due to Hadoop now moving into parallel-building extension) and added several more parameters to allow important environment details to be passed to compute nodes (HADOOPPREFIX, HDFSHOST, HDFSPORT)

File size: 10.9 KB
/** jmt12 **/
package org.nzdl.gsdl;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.ProcessBuilder;
import java.lang.ProcessBuilder.*;
import java.net.InetAddress;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.*;

/** @class HadoopGreenstoneIngest
 */
public class HadoopGreenstoneIngest
{

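  // Design note: each input file is handed to the mapper as a single record,
  // with the file's (HDFS) path as the key and a constant 1 as the value, so
  // one call to map() imports one complete file via import.pl.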
  /** @class GSFileRecordReader
   */
  public static class GSFileRecordReader
    extends RecordReader<Text, IntWritable>
  {
    /** Path of the file to be processed (used as the record's key) */
    private Text current_key;

    private IntWritable current_value = new IntWritable(1);

    /** Used to indicate progress */
    private boolean is_finished = false;

    /** Record the path of the file in this split as the current key.
     */
    @Override
    public void initialize( InputSplit inputSplit, TaskAttemptContext taskAttemptContext )
      throws IOException, InterruptedException
    {
      FileSplit split = (FileSplit) inputSplit;
      current_key = new Text(split.getPath().toString());
    }
    /** initialize() **/

    /**
     * We only ever have a single key/value pair per file, so return true
     * exactly once and false thereafter.
     */
    @Override
    public boolean nextKeyValue()
      throws IOException, InterruptedException
    {
      if (!is_finished)
      {
        is_finished = true;
        return true;
      }
      return false;
    }
    /** nextKeyValue() **/

    /** @function getProgress
     * Rather than calculating progress, we just keep it simple
     */
    @Override
    public float getProgress()
      throws IOException, InterruptedException
    {
      return is_finished ? 1 : 0;
    }
    /** getProgress() **/

    /**
     * Returns the current key (the path of the file to be processed)
     */
    @Override
    public Text getCurrentKey()
      throws IOException, InterruptedException
    {
      return current_key;
    }
    /** getCurrentKey() **/

    /**
     * Returns the current value (a constant 1 for every file)
     */
    @Override
    public IntWritable getCurrentValue()
      throws IOException, InterruptedException
    {
      return current_value;
    }
    /** getCurrentValue() **/

    /**
     * Close quietly, ignoring any exceptions
     */
    @Override
    public void close()
      throws IOException
    {
      // nothing to do
    }
    /** close() **/

  }
  /** GSFileRecordReader **/

  /** @class GSFileInputFormat
   */
  public static class GSFileInputFormat
    extends FileInputFormat<Text, IntWritable>
  {
    /**
     * Don't split the files
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename)
    {
      return false;
    }
    /** isSplitable() **/

    /**
     * Hand each split to our single-record reader.
     */
    @Override
    public RecordReader<Text, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException
    {
      return new GSFileRecordReader();
    }
    /** createRecordReader() **/

  }
  /** class GSFileInputFormat **/

  /** @class GSMap
   */
  public static class GSMap
    extends Mapper<Text, IntWritable, Text, IntWritable>
  {
    /** @function map
     * The key is the full path (HDFS) of the file to be processed.
     */
    @Override
    public void map(Text key, IntWritable value, Context context)
      throws IOException, InterruptedException
    {
      String file_path = key.toString();
      // - configuration for the task
      Configuration conf = context.getConfiguration();
      String gsdlhome = conf.get("gsdlhome");
      String hdfs_host = conf.get("hdfshost");
      String hdfs_port = conf.get("hdfsport");
      String hadoop_prefix = conf.get("hadoopprefix");
      String collection = conf.get("collection");
      String task_id = conf.get("mapred.task.id");
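      // - "mapred.task.id" has the form "attempt_<jobid>_m_<tasknum>_<attempt>";
      //   dropping the 8-character "attempt_" prefix leaves an identifier that
      //   is still unique to this task attempt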
      task_id = task_id.substring(8); // remove "attempt_" prefix
      // - create a temporary directory
      File greenstone_tmp_dir = new File("/tmp/greenstone");
      if (!greenstone_tmp_dir.isDirectory())
      {
        greenstone_tmp_dir.mkdir();
      }
      // - open a unique log file
      File import_process_log = new File("/tmp/greenstone/import-hadoop-" + task_id + ".log");
      // - start the log by writing the time and the manifest line
      FileWriter fw1 = new FileWriter(import_process_log, true);
      long start_time = System.currentTimeMillis()/1000;
      fw1.write("[Started:" + start_time + "]\n");
      fw1.write("[Host:" + InetAddress.getLocalHost().getHostName() + "]\n");
      fw1.write("[Task:" + task_id + "]\n");
      fw1.write("[Map:" + file_path + " => " + value + "]\n");

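      // - a manifest restricts import.pl to processing only the files listed
      //   in it, which is what lets each map task import just its own file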
      // - create a temporary manifest file to process this file. Overwrite any
      //   existing file
      File manifest_path = new File("/tmp/greenstone/manifest" + task_id + ".xml");
      FileWriter manifest_writer = new FileWriter(manifest_path);
      manifest_writer.write("<Manifest version=\"1.0\">\n");
      manifest_writer.write("\t<Index>\n");
      manifest_writer.write("\t\t<Filename>" + file_path + "</Filename>\n");
      manifest_writer.write("\t</Index>\n");
      manifest_writer.write("</Manifest>\n");
      manifest_writer.close();

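      // - note: -archivedir below is an HDFS URL, so the archives produced by
      //   import.pl are written straight into HDFS (this assumes the
      //   parallel-building extension gives import.pl an HDFS-aware file layer)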
      // - call Greenstone passing in the path to the manifest
      ProcessBuilder import_process_builder
        = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-verbosity", "42", "-archivedir", "hdfs://" + hdfs_host + ":" + hdfs_port + "/user/jmt12/gsdl/collect/" + collection + "/archives", collection);
      fw1.write("[Command:" + import_process_builder.command() + "]\n");
      // - alter environment
      Map<String, String> import_process_env = import_process_builder.environment();
      // - path
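      //   (each line below prepends another directory, so the final prepend,
      //   gsdlhome/bin/linux, ends up first on the resulting PATH)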
      String path = import_process_env.get("PATH");
      path = gsdlhome + "/ext/parallel-building/bin/script:" + path;
      path = gsdlhome + "/ext/parallel-building/linux/bin:" + path;
      path = hadoop_prefix + "/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/linux/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/bin/script:" + path;
      path = gsdlhome + "/ext/video-and-audio/linux/bin:" + path;
      path = gsdlhome + "/bin/script:" + path;
      path = gsdlhome + "/bin/linux:" + path;
      import_process_env.put("PATH", path);
      // - ld_library_path
      import_process_env.put("LD_LIBRARY_PATH", gsdlhome + "/ext/parallel-building/linux/lib:" + gsdlhome + "/ext/hadoop/linux/lib:" + gsdlhome + "/ext/video-and-audio/linux/lib:" + gsdlhome + "/ext/tdb-edit/linux/lib");
      // - dyld_library_path
      import_process_env.put("DYLD_LIBRARY_PATH", gsdlhome + "/ext/video-and-audio/linux/lib");
      // - misc
      import_process_env.put("GSDLHOME", gsdlhome);
      import_process_env.put("GSDLOS", "linux");
      import_process_env.put("GSDLEXTS", "parallel-building:tdb-edit:video-and-audio");
      // - installed extension paths
      import_process_env.put("GEXTPARALLELBUILDING_INSTALLED", gsdlhome + "/ext/parallel-building/linux");
      import_process_env.put("GEXTVIDEO_INSTALLED", gsdlhome + "/ext/video-and-audio/linux");
      import_process_env.put("GEXTTDBEDIT_INSTALLED", gsdlhome + "/ext/tdb-edit/linux");
      // - Hadoop specific
      import_process_env.put("HADOOP_PREFIX", hadoop_prefix);
      // - change working directory
      import_process_builder.directory(new File(gsdlhome));
      // - close our output to the log before opening in the process
      fw1.close();

      // - write output to log
      import_process_builder.redirectErrorStream(true);
      import_process_builder.redirectOutput(Redirect.appendTo(import_process_log));
      // - run process
      Process import_process = import_process_builder.start();
      try
      {
        int import_status = import_process.waitFor();
        if (import_status != 0)
        {
          throw new Exception("exit status: " + import_status);
        }
      }
      catch (Exception e)
      {
        System.err.println("Error! Import command failed (" + e.toString() + ")");
        System.exit(0);
      }

      // - write end time to log
      FileWriter fw2 = new FileWriter(import_process_log, true);
      long end_time = System.currentTimeMillis()/1000;
      fw2.write("[Completed:" + end_time + "]\n");
      fw2.close();

      // - for now return a dummy output. In the future I may want to parse the
      //   output from Greenstone and allow reducing to produce a pretty
      //   time-based log
      context.write(key, value);
    }
    /** map(Text,IntWritable,Context) **/

  }
  /** class GSMap **/

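  // Example invocation (the paths and collection name below are illustrative
  // only; substitute values for your own installation):
  //
  //   bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest \
  //     /home/jmt12/gsdl localhost 9000 /opt/hadoop demo \
  //     hdfs://localhost:9000/user/jmt12/import hdfs://localhost:9000/user/jmt12/out
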
  /** @function main
   */
  public static void main(String[] args)
    throws Exception
  {
    if (args.length < 7)
    {
      System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hdfs host> <hdfs port> <hadoop prefix> <collection> <hdfsin> <hdfsout>\n");
      System.exit(0);
    }

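    // - the first five arguments become job configuration values, which is how
    //   they reach the compute nodes; the last two are the HDFS input and
    //   output paths registered with the job further down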
    Configuration conf = new Configuration();
    conf.set("gsdlhome", args[0]);
    conf.set("hdfshost", args[1]);
    conf.set("hdfsport", args[2]);
    conf.set("hadoopprefix", args[3]);
    conf.set("collection", args[4]);
    // prevent timeouts
    long milli_seconds = 60*60*1000; // 1 hour
    conf.setLong("mapred.task.timeout", milli_seconds);

    Job job = new Job(conf, "hadoopgreenstoneingest");
    job.setJarByClass(HadoopGreenstoneIngest.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Register the map class (no combiner or reducer is registered)
    job.setMapperClass(GSMap.class);
    // - with zero reduce tasks the map output goes straight to the output
    //   format, so no reduce (or shuffle) phase runs at all
    job.setNumReduceTasks(0);

    // Sets the input and output handlers - may need to adjust input to provide me
    // a series of filenames (TextInputFormat will instead read in a text file and
    // return each line...)
    job.setInputFormatClass(GSFileInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);

    // Register the input and output paths
    // - the input path should point at the directory (in HDFS) containing the
    //   files to import; GSFileInputFormat turns each of those files into one
    //   map task
    FileInputFormat.setInputPaths(job, new Path(args[5]));
    // - for now the output isn't that important, but in the future I may use
    //   this mechanism to produce a time based log.
    FileOutputFormat.setOutputPath(job, new Path(args[6]));

    // Recommended notation despite my hatred of ?: syntax
    System.exit(job.waitForCompletion(true)?0:1);
  }
  /** main(String[]) **/
}