/** jmt12 **/
package org.nzdl.gsdl;

import org.nzdl.gsdl.GSGroupingComparator;
import org.nzdl.gsdl.GSInfoDB;
import org.nzdl.gsdl.GSPartitioner;

import java.io.*;
import java.lang.Iterable;
import java.lang.ProcessBuilder;
import java.lang.ProcessBuilder.*;
import java.lang.Thread;
import java.net.InetAddress;
import java.nio.channels.FileChannel;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.*;
/** @class HadoopGreenstoneIngest2
 */
public class HadoopGreenstoneIngest2
{

  /** @class GSFileRecordReader
   */
  public static class GSFileRecordReader
    extends RecordReader<Text, Text>
  {
    /** Uncompressed file name */
    private Text current_key;

    private Text current_value = new Text("");

    /** Used to indicate progress */
    private boolean is_finished = false;

    /** Record the path of the file backing this split, to use as the key
     */
    @Override
    public void initialize( InputSplit inputSplit, TaskAttemptContext taskAttemptContext )
      throws IOException, InterruptedException
    {
      FileSplit split = (FileSplit) inputSplit;
      current_key = new Text(split.getPath().toString());
    }
    /** initialize() **/

    /**
     * We only ever have a single key/value
     */
    @Override
    public boolean nextKeyValue()
      throws IOException, InterruptedException
    {
      if (!is_finished)
      {
        is_finished = true;
        return true;
      }
      return false;
    }
    /** nextKeyValue() **/

    /** @function getProgress
     * Rather than calculating progress, we just keep it simple
     */
    @Override
    public float getProgress()
      throws IOException, InterruptedException
    {
      return is_finished ? 1 : 0;
    }
    /** getProgress() **/

    /**
     * Returns the current key (the path of the file to process)
     */
    @Override
    public Text getCurrentKey()
      throws IOException, InterruptedException
    {
      return current_key;
    }
    /** getCurrentKey() **/

    /**
     * Returns the current value (always the empty string)
     */
    @Override
    public Text getCurrentValue()
      throws IOException, InterruptedException
    {
      return current_value;
    }
    /** getCurrentValue() **/

    /**
     * Close quietly, ignoring any exceptions
     */
    @Override
    public void close()
      throws IOException
    {
      // nothing to do
    }
    /** close() **/

  }
  /** GSFileRecordReader **/

  /** @class GSFileInputFormat
   */
  public static class GSFileInputFormat
    extends FileInputFormat<Text, Text>
  {
    /**
     * Don't split the files
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename)
    {
      return false;
    }
    /** isSplitable() **/

    /**
     */
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException
    {
      return new GSFileRecordReader();
    }
    /** createRecordReader() **/

  }
  /** class GSFileInputFormat **/

  /** @class GSMap
   */
  public static class GSMap
    extends Mapper<Text, Text, Text, Text>
  {
    /** @function map
     * The key is the full path (HDFS) of the file to be processed.
     */
    public void map(Text key, Text value, Context context)
      throws IOException, InterruptedException
    {
      String file_path = key.toString();
      // - configuration for the task
      Configuration conf = context.getConfiguration();
      String gsdlhome = conf.get("gsdlhome");
      String hdfs_prefix = conf.get("hdfsprefix");
      String hadoop_home = conf.get("hadoophome");
      String collection = conf.get("collection");
      String task_id = conf.get("mapred.task.id");
      task_id = task_id.substring(8); // remove "attempt_" prefix

      // Programmatically rewrite the protocol as appropriate for the given
      // archives directory (not necessary if the path is local or NFS)
      if (hdfs_prefix.equals("/hdfs"))
      {
        file_path = file_path.replaceFirst("hdfs://[^/]*", hdfs_prefix);
      }
      else
      {
        file_path = file_path.replace("hdfs://", hdfs_prefix);
      }

      // - create a temporary directory
      File greenstone_tmp_dir = new File("/tmp/greenstone");
      if (!greenstone_tmp_dir.isDirectory())
      {
        greenstone_tmp_dir.mkdir();
      }

      // - open a unique log file
      File import_process_log = new File("/tmp/greenstone/import-hadoop-" + task_id + ".log");
      FileWriter fw1 = new FileWriter(import_process_log, true);
      // MEDUSA Customization: Introduce a slight delay based upon the hostname
      // in order to stagger the startup of Map workers. It looks like the avg
      // IO is around 25 minutes... so let's try to make it so the last mapper
      // starts up 25 minutes after the first (with all others spread in
      // between).
      String hostname = InetAddress.getLocalHost().getHostName();
      // We only do this if there is a sentinel file lurking in tmp
      try
      {
        File delay_file = new File("/tmp/greenstone/delay.me");
        if (delay_file.exists())
        {
          Pattern p = Pattern.compile("compute-0-([0-9]+).local");
          Matcher m = p.matcher(hostname);
          if (m.matches())
          {
            String node_str = m.group(1);
            int node_number = Integer.parseInt(node_str) * 100;
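            // e.g. on a node named compute-0-5.local this yields 5 * 100, so
            // the sleep below pauses this mapper for 500 seconds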
            fw1.write("[DEBUG] Delaying start for " + node_number + " seconds\n");
            Thread.sleep(1000 * node_number);
          }
          // We only do this once for each compute node
          delay_file.delete();
        }
      }
      catch (Exception ie)
      {
        System.err.println(ie.toString());
      }

      // - start the log by writing the time and the manifest line
      double start_time = ((double)System.currentTimeMillis())/1000;
      StringBuffer header_block = new StringBuffer("[Started:");
      header_block.append(String.format("%.6f", start_time));
      header_block.append("]\n[Host:");
      header_block.append(hostname);
      header_block.append("]\n[CPU:");
      String getcpu_executable_cmd = gsdlhome + "/ext/parallel-building/linux/bin/getcpu";
      File getcpu_executable = new File(getcpu_executable_cmd);
      String cpu_number = "0";
      if (getcpu_executable.exists())
      {
        cpu_number = runCommand(getcpu_executable_cmd);
      }
      header_block.append(cpu_number);
      header_block.append("]\n[Task:");
      header_block.append(task_id);
      header_block.append("]\n[Map:");
      header_block.append(file_path);
      header_block.append("=>");
      header_block.append(value);
      header_block.append("]\n");
      fw1.write(header_block.toString());
      header_block = null;

      // - create a temporary manifest file to process this file. Overwrite any
      //   existing file
      File manifest_path = new File("/tmp/greenstone/manifest" + task_id + ".xml");
      FileWriter manifest_writer = new FileWriter(manifest_path);
      manifest_writer.write("<Manifest version=\"2.0\">\n");
      manifest_writer.write("\t<Index>\n");
      manifest_writer.write("\t\t<Filename>" + file_path + "</Filename>\n");
      manifest_writer.write("\t</Index>\n");
      manifest_writer.write("</Manifest>\n");
      manifest_writer.close();
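
      // The manifest written above has this shape (the path is illustrative):
      //   <Manifest version="2.0">
      //     <Index>
      //       <Filename>/hdfs/user/gsdl/collect/demo/import/doc.html</Filename>
      //     </Index>
      //   </Manifest>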

      // - call Greenstone passing in the path to the manifest
      ProcessBuilder import_process_builder = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-archivedir", conf.get("archivesdir"), collection);
      fw1.write("[Command:" + import_process_builder.command() + "]\n");
      // - alter environment
      Map<String, String> import_process_env = import_process_builder.environment();
      // - path
      String path = import_process_env.get("PATH");
      path = gsdlhome + "/ext/parallel-building/bin/script:" + path;
      path = gsdlhome + "/ext/parallel-building/linux/bin:" + path;
      path = hadoop_home + "/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/linux/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/bin/script:" + path;
      path = gsdlhome + "/ext/video-and-audio/linux/bin:" + path;
      path = gsdlhome + "/bin/script:" + path;
      path = gsdlhome + "/bin/linux:" + path;
      import_process_env.put("PATH", path);
      fw1.write("[PATH: " + path + "]\n");
      // - ld_library_path
      import_process_env.put("LD_LIBRARY_PATH", gsdlhome + "/ext/parallel-building/linux/lib:" + gsdlhome + "/ext/hadoop/linux/lib:" + gsdlhome + "/ext/video-and-audio/linux/lib:" + gsdlhome + "/ext/tdb-edit/linux/lib");
      // - dyld_library_path
      import_process_env.put("DYLD_LIBRARY_PATH", gsdlhome + "/ext/video-and-audio/linux/lib");
      // - misc
      import_process_env.put("GSDLHOME", gsdlhome);
      import_process_env.put("GSDLOS", "linux");
      import_process_env.put("GSDLEXTS", "parallel-building:tdb-edit:video-and-audio");
      // - installed extension paths
      import_process_env.put("GEXTPARALLELBUILDING", gsdlhome + "/ext/parallel-building");
      import_process_env.put("GEXTPARALLELBUILDING_INSTALLED", gsdlhome + "/ext/parallel-building/linux");
      import_process_env.put("GEXTTDBEDIT_INSTALLED", gsdlhome + "/ext/tdb-edit/linux");
      import_process_env.put("GEXTVIDEO_INSTALLED", gsdlhome + "/ext/video-and-audio/linux");
      // - Hadoop specific
      import_process_env.put("HADOOP_PREFIX", hadoop_home);
      fw1.write("[HADOOP_PREFIX: " + hadoop_home + "]\n");

      // - change working directory
      import_process_builder.directory(new File(gsdlhome));

      // - redirect STDERR to STDOUT for simplicity's sake
      import_process_builder.redirectErrorStream(true);
      // Obsolete command to send output to file
      //import_process_builder.redirectOutput(Redirect.appendTo(import_process_log));

      // - create progress reporter (so Hadoop doesn't time us out)
      Thread reporter = new HadoopProgressReporter(context, import_process_log);
      reporter.start();

      // - run process
      Process import_process = import_process_builder.start();
      BufferedReader import_process_br = new BufferedReader(new InputStreamReader(import_process.getInputStream()));
      String line = "";
      Pattern open_tag_pattern = Pattern.compile("<InfoDBEntry type=\"(.+?)\" key=\"(.+?)\" mode=\"(.+?)\" timestamp=\"(.+?)\">(.*?(</InfoDBEntry>)?)");
      Pattern close_tag_pattern = Pattern.compile("(.*?)</InfoDBEntry>");
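      // Illustrative sentinel line matched by open_tag_pattern (the attribute
      // values here are hypothetical); the payload may also run on over
      // several lines, in which case the loop below reads until it finds the
      // closing tag:
      //   <InfoDBEntry type="doc" key="HASH0123" mode="add" timestamp="1369108897">payload</InfoDBEntry>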
      while ((line = import_process_br.readLine()) != null)
      {
        // Write line to process log regardless
        fw1.write(line + "\n");
        // Now we check for sentinel strings in the output line
        Text output_key;
        Text output_value;
        // Watch for open entry tags
        Matcher open_tag_matcher = open_tag_pattern.matcher(line);
        if (open_tag_matcher.matches())
        {
          String entry_type = open_tag_matcher.group(1);
          String entry_key = open_tag_matcher.group(2);
          String entry_mode = open_tag_matcher.group(3);
          String entry_time = open_tag_matcher.group(4);
          String payload = open_tag_matcher.group(5);
          StringBuffer line_buffer = new StringBuffer();
          // Continue until we've found the close tag - or run out of output log
          Matcher close_tag_matcher = close_tag_pattern.matcher(payload);
          while (!close_tag_matcher.matches() && (line = import_process_br.readLine()) != null)
          {
            // append any existing payload to the buffer
            if (line_buffer.length() > 0)
            {
              line_buffer.append("\n");
            }
            line_buffer.append(payload);
            // store this line as the payload, should the match below fail
            payload = line;
            close_tag_matcher = close_tag_pattern.matcher(line);
          }
          // We've found the close tag (hopefully) so add the last bit
          // (possibly an empty string) to the value
          if (close_tag_matcher.matches())
          {
            String last_payload = close_tag_matcher.group(1);
            if (line_buffer.length() > 0)
            {
              line_buffer.append("\n");
            }
            line_buffer.append(last_payload);
            last_payload = null;
          }
          close_tag_matcher = null;

          // Construct the compound key by which to sort the entries - note
          // that src is a little different from the others, as we don't care
          // about the timestamp at all - instead wanting to group the entries
          // by src file path (key)
          if (entry_type.equals("src"))
          {
            output_key = new Text(entry_type + " " + entry_key);
          }
          else
          {
            output_key = new Text(entry_type + " " + entry_time);
          }
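          // e.g. a src entry keys as "src import/doc.html" (grouped by source
          // path) while a doc entry keys as "doc 1369108897" (ordered by
          // timestamp); the values shown are hypothetical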
          // Doc has its payload prefixed by key
          if (entry_type.equals("doc"))
          {
            String encoded_xml = line_buffer.toString();
            String decoded_xml = encoded_xml.replace("&quot;", "\"");
            decoded_xml = decoded_xml.replace("&apos;", "'");
            decoded_xml = decoded_xml.replace("&lt;", "<");
            decoded_xml = decoded_xml.replace("&gt;", ">");
            decoded_xml = decoded_xml.replace("&amp;", "&");
            output_value = new Text("[" + entry_key + "]\n" + decoded_xml);
          }
          else if (entry_type.equals("src"))
          {
            String encoded_xml = line_buffer.toString();
            String decoded_xml = encoded_xml.replace("&quot;", "\"");
            decoded_xml = decoded_xml.replace("&apos;", "'");
            decoded_xml = decoded_xml.replace("&lt;", "<");
            decoded_xml = decoded_xml.replace("&gt;", ">");
            decoded_xml = decoded_xml.replace("&amp;", "&");
            output_value = new Text(entry_key + "|" + decoded_xml);
          }
          else if (entry_type.equals("rss"))
          {
            String encoded_xml = line_buffer.toString();
            String decoded_xml = encoded_xml.replace("&quot;", "\"");
            decoded_xml = decoded_xml.replace("&apos;", "'");
            decoded_xml = decoded_xml.replace("&lt;", "<");
            decoded_xml = decoded_xml.replace("&gt;", ">");
            decoded_xml = decoded_xml.replace("&amp;", "&");
            output_value = new Text(decoded_xml);
          }
          else
          {
            output_value = new Text(line_buffer.toString());
          }
          entry_type = null;
          entry_key = null;
          entry_mode = null;
          entry_time = null;
          payload = null;
          line_buffer = null;
        }
        // all other lines we'll key by host and time
        else
        {
          double timestamp = ((double)System.currentTimeMillis())/1000;
          String argument1 = hostname + ":" + cpu_number + ":" + String.format("%.6f", timestamp);
          output_key = new Text("msg " + argument1);
          output_value = new Text(argument1 + " " + line);
        }

        // Store the result in the return context
        context.write(output_key, output_value);

        // May as well do this here - indicate to the Hadoop framework that
        // this process is still making progress (of some form)
        context.progress();

        // Cleanup
        open_tag_matcher = null;
        output_key = null;
        output_value = null;
      }
      open_tag_pattern = null;
      close_tag_pattern = null;
      try
      {
        int import_status = import_process.waitFor();
        if (import_status != 0)
        {
          throw new Exception("exit status: " + import_status);
        }
      }
      catch (Exception e)
      {
        System.err.println("Error! Import command failed (" + e.toString() + ")");
      }

      // - stop the progress reporter as, one way or another, there will be no
      //   more progress
      reporter.interrupt();
      reporter = null; // force gc

      // - write end time to log
      double end_time = ((double)System.currentTimeMillis())/1000;
      fw1.write("[Completed:" + String.format("%.6f", end_time) + "]\n");
      // - close our output to the log
      fw1.close();
    }
    /** map(Text,Text,Context) **/

  }
  /** class GSMap **/

  /** @class GSReducer
   */
  public static class GSReducer
    extends Reducer<Text, Text, Text, Text>
  {

    /** Prepare the Reducer by looking up configuration for this collection to
     * determine the appropriate tools to use to create databases.
     */
    public void setup(Context context)
    {
    }
    /** setup() **/

    /**
     */
    public void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException
    {
      System.err.println("reduce(" + key.toString() + ", <values>, <context>)");
      Configuration conf = context.getConfiguration();
      String gsdl_home = conf.get("gsdlhome");
      String collection = conf.get("collection");
      String archives_dir = gsdl_home + "/collect/" + collection + "/archives";
      //String archives_dir = conf.get("archivesdir");
      // Eventually I'd like to read in the database type from collect.cfg - or
      // maybe have it passed in as part of the context - but I'll hardcode it
      // as TDB for now as a proof of concept
      String infodbtype = "tdb";

      String key_string = key.toString();
      String[] key_parts = key_string.split(" ");
      if (key_parts.length >= 2)
      {
        String type = key_parts[0];
        String argument = key_parts[1];

        Iterator<Text> values_itr = values.iterator();
        // There are basically five different cases based on the key's type
        // - the first is datestamp... these are sorted earliest first, and
        //   since we only want the earliest we can ignore the rest!
        if (type.equals("datestamp"))
        {
          Text earliest_datestamp = values_itr.next();
          // Write this directly to file
          try
          {
            FileWriter earliest_datestamp_fout = new FileWriter(archives_dir + "/earliestDatestamp");
            earliest_datestamp_fout.write(earliest_datestamp.toString());
            earliest_datestamp_fout.close();
          }
          catch (Exception ex)
          {
            ex.printStackTrace();
          }
        }
        // For 'doc' types we open a pipe to the database creator and send all
        // the values through for processing
        else if (type.equals("doc"))
        {
          GSInfoDB archiveinf_doc = new GSInfoDB(gsdl_home, infodbtype, archives_dir + "/archiveinf-doc." + infodbtype);
          while (values_itr.hasNext())
          {
            Text db_entry = values_itr.next();
            archiveinf_doc.writeEntry(db_entry.toString());
          }
          archiveinf_doc.close();
        }
        // Similarly for 'src' types, except here we group all entries with the
        // same key
        else if (type.equals("src"))
        {
          // Sigh - you can only create the TDB file on a local filesystem, so
          // I have to create it here, and then move it into place when
          // finished
          GSInfoDB archiveinf_src = new GSInfoDB(gsdl_home, infodbtype, archives_dir + "/archiveinf-src." + infodbtype);
          String current_file_path = "";
          StringBuffer current_record = new StringBuffer();
          Pattern file_path_pattern = Pattern.compile("(.*?)\\|(.*)");
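          // Each incoming value has the form "<file path>|<record fragment>",
          // as emitted by the mapper's src branch above, so group(1) is the
          // source file path and group(2) the record fragment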
          while (values_itr.hasNext())
          {
            Text db_entry_raw = values_itr.next();
            String db_entry = db_entry_raw.toString();
            // Parse out the file path this entry refers to
            Matcher file_path_matcher = file_path_pattern.matcher(db_entry);
            if (file_path_matcher.matches())
            {
              String this_file_path = file_path_matcher.group(1);
              String this_record = file_path_matcher.group(2);
              // Output the record (if there is one) if the file path changes
              if (!this_file_path.equals(current_file_path) && current_record.length() > 0)
              {
                archiveinf_src.writeEntry("[" + current_file_path + "]\n" + current_record.toString());
                // store the next record's details
                current_file_path = this_file_path;
                current_record = new StringBuffer(this_record);
              }
              // Append onto our growing record
              else
              {
                current_file_path = this_file_path;
                if (current_record.length() > 0)
                {
                  current_record.append("\n");
                }
                current_record.append(this_record);
              }
            }
            else
            {
              // Not a valid src entry?
            }
          }
          if (!current_file_path.equals("") && current_record.length() > 0)
          {
            archiveinf_src.writeEntry("[" + current_file_path + "]\n" + current_record.toString());
          }
          archiveinf_src.close();
        }
        // For 'rss' we write all the entries - in order - to an XML file
        else if (type.equals("rss"))
        {
          try
          {
            FileWriter rss_item_rdf = new FileWriter(archives_dir + "/rss-items.rdf");
            while (values_itr.hasNext())
            {
              Text rss_entry = values_itr.next();
              rss_item_rdf.write(rss_entry.toString() + "\n");
            }
            rss_item_rdf.close();
          }
          catch (Exception ex)
          {
            ex.printStackTrace();
          }
        }
        // Everything else we assume to be process log messages - so get
        // Hadoop to write them, in order, to the log (I may need to annotate
        // these with the host so I can see which message came from which
        // compute node).
        else
        {
          Pattern msg_pattern = Pattern.compile("([^\\s]+) (.*)");
          while (values_itr.hasNext())
          {
            String compound_value = values_itr.next().toString();
            Matcher m = msg_pattern.matcher(compound_value);
            if (m.matches())
            {
              Text msg_key = new Text(m.group(1));
              Text msg_value = new Text(m.group(2));
              context.write(msg_key, msg_value);
            }
          }
        }
      }
      else
      {
        System.err.println("Error! Failed to parse key: " + key.toString());
      }
    }
    /** reduce(key, value, context) **/

  }
  /** class GSReducer **/

  /** @function main
   */
  public static void main(String[] args)
    throws Exception
  {
    if (args.length < 7)
    {
      System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest2 <gsdlhome> <hadoophome> <collection> <archivesdir> <hdfsprefix> <hdfsin> <hdfsout>\n");
      System.exit(0);
    }

    Configuration conf = new Configuration();
    conf.set("gsdlhome", args[0]);
    conf.set("hadoophome", args[1]);
    conf.set("collection", args[2]);
    conf.set("archivesdir", args[3]);
    conf.set("hdfsprefix", args[4]); // "HDThriftFS", "HDFSShell", or ""
    conf.set("hdfsin", args[5]);
    conf.set("hdfsout", args[6]);

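    // Illustrative invocation (all paths and names hypothetical):
    //   bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest2 \
    //     /opt/greenstone /opt/hadoop demo /opt/greenstone/collect/demo/archives \
    //     HDThriftFS /user/gsdl/demo-import /user/gsdl/demo-out
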
    // Set the number of retries to 1 - hopefully one of the following will work
    conf.setInt("mapred.map.max.attempts", 1); // Old Hadoop
    conf.setInt("mapreduce.map.maxattempts", 1); // Hadoop 2.0.3-alpha
    conf.setInt("mapreduce.map.max.attempts", 1); // Solution on Web
    // prevent timeouts
    long milli_seconds = 4*60*60*1000; // 4 hours
    conf.setLong("mapred.task.timeout", milli_seconds);

    Job job = new Job(conf, "hadoopgreenstoneingest");
    job.setJarByClass(HadoopGreenstoneIngest2.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Register the map, combiner, and reducer classes
    job.setMapperClass(GSMap.class);
    job.setPartitionerClass(GSPartitioner.class);
    job.setGroupingComparatorClass(GSGroupingComparator.class);
    job.setReducerClass(GSReducer.class);

    // Sets the input and output handlers - may need to adjust input to provide me
    // a series of filenames (TextInputFormat will instead read in a text file and
    // return each line...)
    job.setInputFormatClass(GSFileInputFormat.class);
    //job.setOutputFormatClass(NullOutputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Register the input and output paths
    // - this input path should be to a file (in HDFS) that lists the paths to
    //   the manifest files
    FileInputFormat.setInputPaths(job, new Path(conf.get("hdfsin")));
    // - for now the output isn't that important, but in the future I may use
    //   this mechanism to produce a time based log.
    FileOutputFormat.setOutputPath(job, new Path(conf.get("hdfsout")));

    // Recommended notation despite my hatred of ?: syntax
    System.exit(job.waitForCompletion(true)?0:1);
  }
  /** main(String[]) **/

  /** @function copyFile(File, File)
   *
   * @author Josh Froelich @ stackoverflow.com
   */
  public static void copyFile(File sourceFile, File destFile)
    throws IOException
  {
    if (!destFile.exists())
    {
      destFile.createNewFile();
    }
    FileChannel source = null;
    FileChannel destination = null;
    try
    {
      source = new FileInputStream(sourceFile).getChannel();
      destination = new FileOutputStream(destFile).getChannel();
      destination.transferFrom(source, 0, source.size());
    }
    finally
    {
      if (source != null)
      {
        source.close();
      }
      if (destination != null)
      {
        destination.close();
      }
    }
  }
  /** copyFile(File, File) **/


  /** @function runCommand()
   *
   * A convenience method that calls an external command and returns its
   * standard out as a string. Warning! Not safe if the command could return a
   * large amount of text in the STDERR stream - may infinitely block.
   *
   */
  public static String runCommand(String command)
  {
    ///ystem.err.println("[DEBUG] command: " + command);
    StringBuffer result = new StringBuffer();
    try
    {
      Runtime run = Runtime.getRuntime();
      Process pr = run.exec(command);
      pr.waitFor();
      BufferedReader buf = new BufferedReader(new InputStreamReader(pr.getInputStream()));
      String line;
      while ((line = buf.readLine()) != null)
      {
        result.append(line);
      }
    }
    catch (Exception ex)
    {
      System.err.println("Error! " + ex.getMessage());
    }
    ///ystem.err.println("[DEBUG] result: " + result.toString());
    return result.toString();
  }
  /** runCommand() **/
}