source: gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest.java@26980

Last change on this file since 26980 was 26980, checked in by jmt12, 11 years ago

setting svnignore

File size: 10.9 KB
package org.nzdl.gsdl;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.lang.ProcessBuilder.Redirect;
import java.net.InetAddress;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.*;

/** @class HadoopGreenstoneIngest
 *  A Hadoop job that runs a Greenstone import (import.pl) once per input
 *  file, using a generated manifest to restrict each map task to a single
 *  document.
 */
public class HadoopGreenstoneIngest
{

  /** @class GSFileRecordReader
   *  Presents each input file as exactly one record: the file's path as the
   *  key and a constant 1 as the value.
   */
  public static class GSFileRecordReader
    extends RecordReader<Text, IntWritable>
  {
    /** Full (HDFS) path of the file being processed, used as the key */
    private Text current_key;

    /** Constant value paired with every key */
    private IntWritable current_value = new IntWritable(1);

    /** Used to indicate progress */
    private boolean is_finished = false;

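    // For example, given a split over the (illustrative) file
    //   hdfs://namenode:9000/user/jmt12/import/doc.html
    // this reader emits the single pair (that path, 1) and then reports
    // that it is finished.
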
    /**
     * Remember the split's file path; it becomes this record's key.
     */
    @Override
    public void initialize( InputSplit inputSplit, TaskAttemptContext taskAttemptContext )
      throws IOException, InterruptedException
    {
      FileSplit split = (FileSplit) inputSplit;
      current_key = new Text(split.getPath().toString());
    }
    /** initialize() **/

    /**
     * We only ever have a single key/value pair per file, so the first call
     * succeeds and every later call reports that we are done.
     */
    @Override
    public boolean nextKeyValue()
      throws IOException, InterruptedException
    {
      if (!is_finished)
      {
        is_finished = true;
        return true;
      }
      return false;
    }
    /** nextKeyValue() **/

    /** @function getProgress
     * Rather than calculating progress, we just keep it simple
     */
    @Override
    public float getProgress()
      throws IOException, InterruptedException
    {
      return is_finished ? 1 : 0;
    }
    /** getProgress() **/

    /**
     * Returns the current key (the full path of the file)
     */
    @Override
    public Text getCurrentKey()
      throws IOException, InterruptedException
    {
      return current_key;
    }
    /** getCurrentKey() **/

    /**
     * Returns the current value (the constant 1)
     */
    @Override
    public IntWritable getCurrentValue()
      throws IOException, InterruptedException
    {
      return current_value;
    }
    /** getCurrentValue() **/

    /**
     * Close quietly, ignoring any exceptions
     */
    @Override
    public void close()
      throws IOException
    {
      // nothing to do
    }
    /** close() **/

  }
  /** GSFileRecordReader **/

  /** @class GSFileInputFormat
   *  Feeds whole, unsplit files to the single-record reader above.
   */
  public static class GSFileInputFormat
    extends FileInputFormat<Text, IntWritable>
  {
    /**
     * Don't split the files
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename)
    {
      return false;
    }
    /** isSplitable() **/

    /**
     * Hand each split to our one-record-per-file reader.
     */
    @Override
    public RecordReader<Text, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException
    {
      return new GSFileRecordReader();
    }
    /** createRecordReader() **/

  }
  /** class GSFileInputFormat **/

  /** @class GSMap
   */
  public static class GSMap
    extends Mapper<Text, IntWritable, Text, IntWritable>
  {
    /** @function map
     * The key is the full path (HDFS) of the file to be processed.
     */
    @Override
    public void map(Text key, IntWritable value, Context context)
      throws IOException, InterruptedException
    {
      String file_path = key.toString();
      // - configuration for the task
      Configuration conf = context.getConfiguration();
      String gsdlhome = conf.get("gsdlhome");
      String hdfs_host = conf.get("hdfshost");
      String hdfs_port = conf.get("hdfsport");
      String hadoop_prefix = conf.get("hadoopprefix");
      String collection = conf.get("collection");
      String task_id = conf.get("mapred.task.id");
      task_id = task_id.substring(8); // remove "attempt_" prefix
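      //   (attempt ids look like "attempt_201208310412_0012_m_000002_0" - the
      //   id shown is illustrative - so the remainder uniquely names this
      //   task's log and manifest files)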
      // - create a temporary directory
      File greenstone_tmp_dir = new File("/tmp/greenstone");
      if (!greenstone_tmp_dir.isDirectory())
      {
        greenstone_tmp_dir.mkdir();
      }
      // - open a unique log file
      File import_process_log = new File("/tmp/greenstone/import-hadoop-" + task_id + ".log");
      // - start the log by writing the time and the manifest line
      FileWriter fw1 = new FileWriter(import_process_log, true);
      long start_time = System.currentTimeMillis()/1000;
      fw1.write("[Started:" + start_time + "]\n");
      fw1.write("[Host:" + InetAddress.getLocalHost().getHostName() + "]\n");
      fw1.write("[Task:" + task_id + "]\n");
      fw1.write("[Map:" + file_path + " => " + value + "]\n");

      // - create a temporary manifest file to process this file, overwriting
      //   any existing file
      File manifest_path = new File("/tmp/greenstone/manifest" + task_id + ".xml");
      FileWriter manifest_writer = new FileWriter(manifest_path);
      manifest_writer.write("<Manifest version=\"1.0\">\n");
      manifest_writer.write("\t<Index>\n");
      manifest_writer.write("\t\t<Filename>" + file_path + "</Filename>\n");
      manifest_writer.write("\t</Index>\n");
      manifest_writer.write("</Manifest>\n");
      manifest_writer.close();

      // - call Greenstone passing in the path to the manifest
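      //   ('time -p' wraps the import so POSIX real/user/sys timings land in
      //   the task log, and '-keepold' preserves the current contents of the
      //   shared archives directory rather than clearing it first)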
      ProcessBuilder import_process_builder
        = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-verbosity", "42", "-archivedir", "hdfs://" + hdfs_host + ":" + hdfs_port + "/user/jmt12/gsdl/collect/" + collection + "/archives", collection);
      fw1.write("[Command:" + import_process_builder.command() + "]\n");
      // - alter environment
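      //   (the child starts from the mapper JVM's environment, which knows
      //   nothing about Greenstone, so PATH, the library paths, and the GSDL*
      //   variables must all be rebuilt by hand)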
      Map<String, String> import_process_env = import_process_builder.environment();
      // - path
      String path = import_process_env.get("PATH");
      path = gsdlhome + "/ext/parallel-building/bin/script:" + path;
      path = gsdlhome + "/ext/parallel-building/linux/bin:" + path;
      path = hadoop_prefix + "/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/linux/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/bin/script:" + path;
      path = gsdlhome + "/ext/video-and-audio/linux/bin:" + path;
      path = gsdlhome + "/bin/script:" + path;
      path = gsdlhome + "/bin/linux:" + path;
      import_process_env.put("PATH", path);
      // - ld_library_path
      import_process_env.put("LD_LIBRARY_PATH", gsdlhome + "/ext/parallel-building/linux/lib:" + gsdlhome + "/ext/hadoop/linux/lib:" + gsdlhome + "/ext/video-and-audio/linux/lib:" + gsdlhome + "/ext/tdb-edit/linux/lib");
      // - dyld_library_path
      import_process_env.put("DYLD_LIBRARY_PATH", gsdlhome + "/ext/video-and-audio/linux/lib");
      // - misc
      import_process_env.put("GSDLHOME", gsdlhome);
      import_process_env.put("GSDLOS", "linux");
      import_process_env.put("GSDLEXTS", "parallel-building:tdb-edit:video-and-audio");
      // - installed extension paths
      import_process_env.put("GEXTPARALLELBUILDING_INSTALLED", gsdlhome + "/ext/parallel-building/linux");
      import_process_env.put("GEXTVIDEO_INSTALLED", gsdlhome + "/ext/video-and-audio/linux");
      import_process_env.put("GEXTTDBEDIT_INSTALLED", gsdlhome + "/ext/tdb-edit/linux");
      // - Hadoop specific
      import_process_env.put("HADOOP_PREFIX", hadoop_prefix);
      // - change working directory
      import_process_builder.directory(new File(gsdlhome));
      // - close our output to the log before opening in the process
      fw1.close();

      // - write output to log
      import_process_builder.redirectErrorStream(true);
      import_process_builder.redirectOutput(Redirect.appendTo(import_process_log));
      // - run process
      Process import_process = import_process_builder.start();
      try
      {
        int import_status = import_process.waitFor();
        if (import_status != 0)
        {
          throw new Exception("exit status: " + import_status);
        }
      }
      catch (Exception e)
      {
        System.err.println("Error! Import command failed (" + e.toString() + ")");
        System.exit(1);
      }

      // - write end time to log
      FileWriter fw2 = new FileWriter(import_process_log, true);
      long end_time = System.currentTimeMillis()/1000;
      fw2.write("[Completed:" + end_time + "]\n");
      fw2.close();

      // - for now return a dummy output. In the future I may want to parse
      //   the output from Greenstone and allow reducing to build a pretty
      //   time-based log
      context.write(key, value);
    }
    /** map(Text,IntWritable,Context) **/

  }
  /** class GSMap **/

  /** @function main
   */
  public static void main(String[] args)
    throws Exception
  {
    if (args.length < 7)
    {
      System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hdfs host> <hdfs port> <hadoop prefix> <collection> <hdfsin> <hdfsout>\n");
      System.exit(1);
    }

    Configuration conf = new Configuration();
    conf.set("gsdlhome", args[0]);
    conf.set("hdfshost", args[1]);
    conf.set("hdfsport", args[2]);
    conf.set("hadoopprefix", args[3]);
    conf.set("collection", args[4]);
    // prevent timeouts
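    // (each map task blocks in import.pl, which can run for a long time
    // without reporting progress, and Hadoop kills tasks that stay silent
    // for longer than mapred.task.timeout)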
    long milli_seconds = 60*60*1000; // 1 hour
    conf.setLong("mapred.task.timeout", milli_seconds);

    Job job = new Job(conf, "hadoopgreenstoneingest");
    job.setJarByClass(HadoopGreenstoneIngest.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Register the map class
    job.setMapperClass(GSMap.class);
    // - with zero reduce tasks there is no reduce (or combine) phase at all,
    //   so the map output goes straight to the output format
    job.setNumReduceTasks(0);

    // Sets the input and output handlers - may need to adjust input to provide me
    // a series of filenames (TextInputFormat will instead read in a text file and
    // return each line...)
    job.setInputFormatClass(GSFileInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);

    // Register the input and output paths
    // - the input path is a directory (in HDFS) whose files are the documents
    //   to ingest; each file becomes the key of exactly one map call
    FileInputFormat.setInputPaths(job, new Path(args[5]));
    // - for now the output isn't that important, but in the future I may use
    //   this mechanism to produce a time based log.
    FileOutputFormat.setOutputPath(job, new Path(args[6]));

    // Recommended notation despite my hatred of the ?: syntax
    System.exit(job.waitForCompletion(true)?0:1);
  }
  /** main(String[]) **/
}