source: gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest.java@ 26982

Last change on this file since 26982 was 26982, checked in by jmt12, 11 years ago

Altered environment setup for subprocesses (due to Hadoop now moving into parallel-building extension) and added several more parameters to allow important environment details to be passed to compute nodes (HADOOPPREFIX, HDFSHOST, HDFSPORT)

File size: 10.9 KB
/** jmt12 **/
package org.nzdl.gsdl;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.ProcessBuilder;
import java.lang.ProcessBuilder.*;
import java.net.InetAddress;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.*;

/** @class HadoopGreenstoneIngest
 */
public class HadoopGreenstoneIngest
{

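  // Design note: each input file is handed to the mapper as a single record,
  // with the file's (HDFS) path as the key and a constant 1 as the value, so
  // one call to map() imports one complete file via import.pl.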
  /** @class GSFileRecordReader
   */
  public static class GSFileRecordReader
    extends RecordReader<Text, IntWritable>
  {
    /** Path of the file to be processed (used as the record's key) */
    private Text current_key;

    private IntWritable current_value = new IntWritable(1);

    /** Used to indicate progress */
    private boolean is_finished = false;

    /** Record the path of the file in this split as the current key.
     */
    @Override
    public void initialize( InputSplit inputSplit, TaskAttemptContext taskAttemptContext )
      throws IOException, InterruptedException
    {
      FileSplit split = (FileSplit) inputSplit;
      current_key = new Text(split.getPath().toString());
    }
    /** initialize() **/

    /**
     * We only ever have a single key/value pair per file, so return true
     * exactly once and false thereafter.
     */
    @Override
    public boolean nextKeyValue()
      throws IOException, InterruptedException
    {
      if (!is_finished)
      {
        is_finished = true;
        return true;
      }
      return false;
    }
    /** nextKeyValue() **/

    /** @function getProgress
     * Rather than calculating progress, we just keep it simple
     */
    @Override
    public float getProgress()
      throws IOException, InterruptedException
    {
      return is_finished ? 1 : 0;
    }
    /** getProgress() **/

    /**
     * Returns the current key (the path of the file to be processed)
     */
    @Override
    public Text getCurrentKey()
      throws IOException, InterruptedException
    {
      return current_key;
    }
    /** getCurrentKey() **/

    /**
     * Returns the current value (a constant 1 for every file)
     */
    @Override
    public IntWritable getCurrentValue()
      throws IOException, InterruptedException
    {
      return current_value;
    }
    /** getCurrentValue() **/

    /**
     * Close quietly, ignoring any exceptions
     */
    @Override
    public void close()
      throws IOException
    {
      // nothing to do
    }
    /** close() **/

  }
  /** GSFileRecordReader **/

  /** @class GSFileInputFormat
   */
  public static class GSFileInputFormat
    extends FileInputFormat<Text, IntWritable>
  {
    /**
     * Don't split the files
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename)
    {
      return false;
    }
    /** isSplitable() **/

    /**
     * Hand each split to our single-record reader.
     */
    @Override
    public RecordReader<Text, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException
    {
      return new GSFileRecordReader();
    }
    /** createRecordReader() **/

  }
  /** class GSFileInputFormat **/

  /** @class GSMap
   */
  public static class GSMap
    extends Mapper<Text, IntWritable, Text, IntWritable>
  {
    /** @function map
     * The key is the full path (HDFS) of the file to be processed.
     */
    @Override
    public void map(Text key, IntWritable value, Context context)
      throws IOException, InterruptedException
    {
      String file_path = key.toString();
      // - configuration for the task
      Configuration conf = context.getConfiguration();
      String gsdlhome = conf.get("gsdlhome");
      String hdfs_host = conf.get("hdfshost");
      String hdfs_port = conf.get("hdfsport");
      String hadoop_prefix = conf.get("hadoopprefix");
      String collection = conf.get("collection");
      String task_id = conf.get("mapred.task.id");
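      // - "mapred.task.id" has the form "attempt_<jobid>_m_<tasknum>_<attempt>";
      //   dropping the 8-character "attempt_" prefix leaves an identifier that
      //   is still unique to this task attempt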
      task_id = task_id.substring(8); // remove "attempt_" prefix
      // - create a temporary directory
      File greenstone_tmp_dir = new File("/tmp/greenstone");
      if (!greenstone_tmp_dir.isDirectory())
      {
        greenstone_tmp_dir.mkdir();
      }
      // - open a unique log file
      File import_process_log = new File("/tmp/greenstone/import-hadoop-" + task_id + ".log");
      // - start the log by writing the time and the manifest line
      FileWriter fw1 = new FileWriter(import_process_log, true);
      long start_time = System.currentTimeMillis()/1000;
      fw1.write("[Started:" + start_time + "]\n");
      fw1.write("[Host:" + InetAddress.getLocalHost().getHostName() + "]\n");
      fw1.write("[Task:" + task_id + "]\n");
      fw1.write("[Map:" + file_path + " => " + value + "]\n");

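      // - a manifest restricts import.pl to processing only the files listed
      //   in it, which is what lets each map task import just its own file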
      // - create a temporary manifest file to process this file. Overwrite any
      //   existing file
      File manifest_path = new File("/tmp/greenstone/manifest" + task_id + ".xml");
      FileWriter manifest_writer = new FileWriter(manifest_path);
      manifest_writer.write("<Manifest version=\"1.0\">\n");
      manifest_writer.write("\t<Index>\n");
      manifest_writer.write("\t\t<Filename>" + file_path + "</Filename>\n");
      manifest_writer.write("\t</Index>\n");
      manifest_writer.write("</Manifest>\n");
      manifest_writer.close();

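      // - note: -archivedir below is an HDFS URL, so the archives produced by
      //   import.pl are written straight into HDFS (this assumes the
      //   parallel-building extension gives import.pl an HDFS-aware file layer)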
      // - call Greenstone passing in the path to the manifest
      ProcessBuilder import_process_builder
        = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-verbosity", "42", "-archivedir", "hdfs://" + hdfs_host + ":" + hdfs_port + "/user/jmt12/gsdl/collect/" + collection + "/archives", collection);
      fw1.write("[Command:" + import_process_builder.command() + "]\n");
      // - alter environment
      Map<String, String> import_process_env = import_process_builder.environment();
      // - path
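      //   (each line below prepends another directory, so the final prepend,
      //   gsdlhome/bin/linux, ends up first on the resulting PATH)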
      String path = import_process_env.get("PATH");
      path = gsdlhome + "/ext/parallel-building/bin/script:" + path;
      path = gsdlhome + "/ext/parallel-building/linux/bin:" + path;
      path = hadoop_prefix + "/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/linux/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/bin/script:" + path;
      path = gsdlhome + "/ext/video-and-audio/linux/bin:" + path;
      path = gsdlhome + "/bin/script:" + path;
      path = gsdlhome + "/bin/linux:" + path;
      import_process_env.put("PATH", path);
      // - ld_library_path
      import_process_env.put("LD_LIBRARY_PATH", gsdlhome + "/ext/parallel-building/linux/lib:" + gsdlhome + "/ext/hadoop/linux/lib:" + gsdlhome + "/ext/video-and-audio/linux/lib:" + gsdlhome + "/ext/tdb-edit/linux/lib");
      // - dyld_library_path
      import_process_env.put("DYLD_LIBRARY_PATH", gsdlhome + "/ext/video-and-audio/linux/lib");
      // - misc
      import_process_env.put("GSDLHOME", gsdlhome);
      import_process_env.put("GSDLOS", "linux");
      import_process_env.put("GSDLEXTS", "parallel-building:tdb-edit:video-and-audio");
      // - installed extension paths
      import_process_env.put("GEXTPARALLELBUILDING_INSTALLED", gsdlhome + "/ext/parallel-building/linux");
      import_process_env.put("GEXTVIDEO_INSTALLED", gsdlhome + "/ext/video-and-audio/linux");
      import_process_env.put("GEXTTDBEDIT_INSTALLED", gsdlhome + "/ext/tdb-edit/linux");
      // - Hadoop specific
      import_process_env.put("HADOOP_PREFIX", hadoop_prefix);
      // - change working directory
      import_process_builder.directory(new File(gsdlhome));
      // - close our output to the log before opening in the process
      fw1.close();

      // - write output to log
      import_process_builder.redirectErrorStream(true);
      import_process_builder.redirectOutput(Redirect.appendTo(import_process_log));
      // - run process
      Process import_process = import_process_builder.start();
      try
      {
        int import_status = import_process.waitFor();
        if (import_status != 0)
        {
          throw new Exception("exit status: " + import_status);
        }
      }
      catch (Exception e)
      {
        System.err.println("Error! Import command failed (" + e.toString() + ")");
        System.exit(0);
      }

      // - write end time to log
      FileWriter fw2 = new FileWriter(import_process_log, true);
      long end_time = System.currentTimeMillis()/1000;
      fw2.write("[Completed:" + end_time + "]\n");
      fw2.close();

      // - for now return a dummy output. In the future I may want to parse the
      //   output from Greenstone and allow reducing to produce a pretty
      //   time-based log
      context.write(key, value);
    }
    /** map(Text,IntWritable,Context) **/

  }
  /** class GSMap **/

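  // Example invocation (the paths and collection name below are illustrative
  // only; substitute values for your own installation):
  //
  //   bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest \
  //     /home/jmt12/gsdl localhost 9000 /opt/hadoop demo \
  //     hdfs://localhost:9000/user/jmt12/import hdfs://localhost:9000/user/jmt12/out
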
  /** @function main
   */
  public static void main(String[] args)
    throws Exception
  {
    if (args.length < 7)
    {
      System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hdfs host> <hdfs port> <hadoop prefix> <collection> <hdfsin> <hdfsout>\n");
      System.exit(0);
    }

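    // - the first five arguments become job configuration values, which is how
    //   they reach the compute nodes; the last two are the HDFS input and
    //   output paths registered with the job further down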
    Configuration conf = new Configuration();
    conf.set("gsdlhome", args[0]);
    conf.set("hdfshost", args[1]);
    conf.set("hdfsport", args[2]);
    conf.set("hadoopprefix", args[3]);
    conf.set("collection", args[4]);
    // prevent timeouts
    long milli_seconds = 60*60*1000; // 1 hour
    conf.setLong("mapred.task.timeout", milli_seconds);

    Job job = new Job(conf, "hadoopgreenstoneingest");
    job.setJarByClass(HadoopGreenstoneIngest.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Register the map class (no combiner or reducer is registered)
    job.setMapperClass(GSMap.class);
    // - with zero reduce tasks the map output goes straight to the output
    //   format, so no reduce (or shuffle) phase runs at all
    job.setNumReduceTasks(0);

    // Sets the input and output handlers - may need to adjust input to provide me
    // a series of filenames (TextInputFormat will instead read in a text file and
    // return each line...)
    job.setInputFormatClass(GSFileInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);

    // Register the input and output paths
    // - the input path should point at the directory (in HDFS) containing the
    //   files to import; GSFileInputFormat turns each of those files into one
    //   map task
    FileInputFormat.setInputPaths(job, new Path(args[5]));
    // - for now the output isn't that important, but in the future I may use
    //   this mechanism to produce a time based log.
    FileOutputFormat.setOutputPath(job, new Path(args[6]));

    // Recommended notation despite my hatred of ?: syntax
    System.exit(job.waitForCompletion(true)?0:1);
  }
  /** main(String[]) **/
}