source: gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest.java@26980

Last change on this file since 26980 was 26980, checked in by jmt12, 11 years ago

setting svnignore

File size: 10.9 KB
package org.nzdl.gsdl;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.lang.ProcessBuilder.Redirect;
import java.net.InetAddress;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.*;

/** @class HadoopGreenstoneIngest
 *  A Hadoop job that runs a Greenstone import (import.pl) once per input
 *  file, using a generated manifest to restrict each map task to a single
 *  document.
 */
public class HadoopGreenstoneIngest
{

  /** @class GSFileRecordReader
   *  Presents each input file as exactly one record: the file's path as the
   *  key and a constant 1 as the value.
   */
  public static class GSFileRecordReader
    extends RecordReader<Text, IntWritable>
  {
    /** Full (HDFS) path of the file being processed, used as the key */
    private Text current_key;

    /** Constant value paired with every key */
    private IntWritable current_value = new IntWritable(1);

    /** Used to indicate progress */
    private boolean is_finished = false;

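    // For example, given a split over the (illustrative) file
    //   hdfs://namenode:9000/user/jmt12/import/doc.html
    // this reader emits the single pair (that path, 1) and then reports
    // that it is finished.
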
    /**
     * Remember the split's file path; it becomes this record's key.
     */
    @Override
    public void initialize( InputSplit inputSplit, TaskAttemptContext taskAttemptContext )
      throws IOException, InterruptedException
    {
      FileSplit split = (FileSplit) inputSplit;
      current_key = new Text(split.getPath().toString());
    }
    /** initialize() **/

    /**
     * We only ever have a single key/value pair per file, so the first call
     * succeeds and every later call reports that we are done.
     */
    @Override
    public boolean nextKeyValue()
      throws IOException, InterruptedException
    {
      if (!is_finished)
      {
        is_finished = true;
        return true;
      }
      return false;
    }
    /** nextKeyValue() **/

    /** @function getProgress
     * Rather than calculating progress, we just keep it simple
     */
    @Override
    public float getProgress()
      throws IOException, InterruptedException
    {
      return is_finished ? 1 : 0;
    }
    /** getProgress() **/

    /**
     * Returns the current key (the full path of the file)
     */
    @Override
    public Text getCurrentKey()
      throws IOException, InterruptedException
    {
      return current_key;
    }
    /** getCurrentKey() **/

    /**
     * Returns the current value (the constant 1)
     */
    @Override
    public IntWritable getCurrentValue()
      throws IOException, InterruptedException
    {
      return current_value;
    }
    /** getCurrentValue() **/

    /**
     * Close quietly, ignoring any exceptions
     */
    @Override
    public void close()
      throws IOException
    {
      // nothing to do
    }
    /** close() **/

  }
  /** GSFileRecordReader **/

  /** @class GSFileInputFormat
   *  Feeds whole, unsplit files to the single-record reader above.
   */
  public static class GSFileInputFormat
    extends FileInputFormat<Text, IntWritable>
  {
    /**
     * Don't split the files
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename)
    {
      return false;
    }
    /** isSplitable() **/

    /**
     * Hand each split to our one-record-per-file reader.
     */
    @Override
    public RecordReader<Text, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException
    {
      return new GSFileRecordReader();
    }
    /** createRecordReader() **/

  }
  /** class GSFileInputFormat **/

  /** @class GSMap
   */
  public static class GSMap
    extends Mapper<Text, IntWritable, Text, IntWritable>
  {
    /** @function map
     * The key is the full path (HDFS) of the file to be processed.
     */
    @Override
    public void map(Text key, IntWritable value, Context context)
      throws IOException, InterruptedException
    {
      String file_path = key.toString();
      // - configuration for the task
      Configuration conf = context.getConfiguration();
      String gsdlhome = conf.get("gsdlhome");
      String hdfs_host = conf.get("hdfshost");
      String hdfs_port = conf.get("hdfsport");
      String hadoop_prefix = conf.get("hadoopprefix");
      String collection = conf.get("collection");
      String task_id = conf.get("mapred.task.id");
      task_id = task_id.substring(8); // remove "attempt_" prefix
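      //   (attempt ids look like "attempt_201208310412_0012_m_000002_0" - the
      //   id shown is illustrative - so the remainder uniquely names this
      //   task's log and manifest files)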
      // - create a temporary directory
      File greenstone_tmp_dir = new File("/tmp/greenstone");
      if (!greenstone_tmp_dir.isDirectory())
      {
        greenstone_tmp_dir.mkdir();
      }
      // - open a unique log file
      File import_process_log = new File("/tmp/greenstone/import-hadoop-" + task_id + ".log");
      // - start the log by writing the time and the manifest line
      FileWriter fw1 = new FileWriter(import_process_log, true);
      long start_time = System.currentTimeMillis()/1000;
      fw1.write("[Started:" + start_time + "]\n");
      fw1.write("[Host:" + InetAddress.getLocalHost().getHostName() + "]\n");
      fw1.write("[Task:" + task_id + "]\n");
      fw1.write("[Map:" + file_path + " => " + value + "]\n");

      // - create a temporary manifest file to process this file, overwriting
      //   any existing file
      File manifest_path = new File("/tmp/greenstone/manifest" + task_id + ".xml");
      FileWriter manifest_writer = new FileWriter(manifest_path);
      manifest_writer.write("<Manifest version=\"1.0\">\n");
      manifest_writer.write("\t<Index>\n");
      manifest_writer.write("\t\t<Filename>" + file_path + "</Filename>\n");
      manifest_writer.write("\t</Index>\n");
      manifest_writer.write("</Manifest>\n");
      manifest_writer.close();

      // - call Greenstone passing in the path to the manifest
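      //   ('time -p' wraps the import so POSIX real/user/sys timings land in
      //   the task log, and '-keepold' preserves the current contents of the
      //   shared archives directory rather than clearing it first)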
      ProcessBuilder import_process_builder
        = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-verbosity", "42", "-archivedir", "hdfs://" + hdfs_host + ":" + hdfs_port + "/user/jmt12/gsdl/collect/" + collection + "/archives", collection);
      fw1.write("[Command:" + import_process_builder.command() + "]\n");
      // - alter environment
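      //   (the child starts from the mapper JVM's environment, which knows
      //   nothing about Greenstone, so PATH, the library paths, and the GSDL*
      //   variables must all be rebuilt by hand)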
      Map<String, String> import_process_env = import_process_builder.environment();
      // - path
      String path = import_process_env.get("PATH");
      path = gsdlhome + "/ext/parallel-building/bin/script:" + path;
      path = gsdlhome + "/ext/parallel-building/linux/bin:" + path;
      path = hadoop_prefix + "/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/linux/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/bin/script:" + path;
      path = gsdlhome + "/ext/video-and-audio/linux/bin:" + path;
      path = gsdlhome + "/bin/script:" + path;
      path = gsdlhome + "/bin/linux:" + path;
      import_process_env.put("PATH", path);
      // - ld_library_path
      import_process_env.put("LD_LIBRARY_PATH", gsdlhome + "/ext/parallel-building/linux/lib:" + gsdlhome + "/ext/hadoop/linux/lib:" + gsdlhome + "/ext/video-and-audio/linux/lib:" + gsdlhome + "/ext/tdb-edit/linux/lib");
      // - dyld_library_path
      import_process_env.put("DYLD_LIBRARY_PATH", gsdlhome + "/ext/video-and-audio/linux/lib");
      // - misc
      import_process_env.put("GSDLHOME", gsdlhome);
      import_process_env.put("GSDLOS", "linux");
      import_process_env.put("GSDLEXTS", "parallel-building:tdb-edit:video-and-audio");
      // - installed extension paths
      import_process_env.put("GEXTPARALLELBUILDING_INSTALLED", gsdlhome + "/ext/parallel-building/linux");
      import_process_env.put("GEXTVIDEO_INSTALLED", gsdlhome + "/ext/video-and-audio/linux");
      import_process_env.put("GEXTTDBEDIT_INSTALLED", gsdlhome + "/ext/tdb-edit/linux");
      // - Hadoop specific
      import_process_env.put("HADOOP_PREFIX", hadoop_prefix);
      // - change working directory
      import_process_builder.directory(new File(gsdlhome));
      // - close our output to the log before opening in the process
      fw1.close();

      // - write output to log
      import_process_builder.redirectErrorStream(true);
      import_process_builder.redirectOutput(Redirect.appendTo(import_process_log));
      // - run process
      Process import_process = import_process_builder.start();
      try
      {
        int import_status = import_process.waitFor();
        if (import_status != 0)
        {
          throw new Exception("exit status: " + import_status);
        }
      }
      catch (Exception e)
      {
        System.err.println("Error! Import command failed (" + e.toString() + ")");
        System.exit(1);
      }

      // - write end time to log
      FileWriter fw2 = new FileWriter(import_process_log, true);
      long end_time = System.currentTimeMillis()/1000;
      fw2.write("[Completed:" + end_time + "]\n");
      fw2.close();

      // - for now return a dummy output. In the future I may want to parse
      //   the output from Greenstone and allow reducing to build a pretty
      //   time-based log
      context.write(key, value);
    }
    /** map(Text,IntWritable,Context) **/

  }
  /** class GSMap **/

  /** @function main
   */
  public static void main(String[] args)
    throws Exception
  {
    if (args.length < 7)
    {
      System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hdfs host> <hdfs port> <hadoop prefix> <collection> <hdfsin> <hdfsout>\n");
      System.exit(1);
    }

    Configuration conf = new Configuration();
    conf.set("gsdlhome", args[0]);
    conf.set("hdfshost", args[1]);
    conf.set("hdfsport", args[2]);
    conf.set("hadoopprefix", args[3]);
    conf.set("collection", args[4]);
    // prevent timeouts
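    // (each map task blocks in import.pl, which can run for a long time
    // without reporting progress, and Hadoop kills tasks that stay silent
    // for longer than mapred.task.timeout)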
    long milli_seconds = 60*60*1000; // 1 hour
    conf.setLong("mapred.task.timeout", milli_seconds);

    Job job = new Job(conf, "hadoopgreenstoneingest");
    job.setJarByClass(HadoopGreenstoneIngest.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Register the map class
    job.setMapperClass(GSMap.class);
    // - with zero reduce tasks there is no reduce (or combine) phase at all,
    //   so the map output goes straight to the output format
    job.setNumReduceTasks(0);

    // Sets the input and output handlers - may need to adjust input to provide me
    // a series of filenames (TextInputFormat will instead read in a text file and
    // return each line...)
    job.setInputFormatClass(GSFileInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);

    // Register the input and output paths
    // - the input path is a directory (in HDFS) whose files are the documents
    //   to ingest; each file becomes the key of exactly one map call
    FileInputFormat.setInputPaths(job, new Path(args[5]));
    // - for now the output isn't that important, but in the future I may use
    //   this mechanism to produce a time based log.
    FileOutputFormat.setOutputPath(job, new Path(args[6]));

    // Recommended notation despite my hatred of the ?: syntax
    System.exit(job.waitForCompletion(true)?0:1);
  }
  /** main(String[]) **/
}