/** jmt12 **/
package org.nzdl.gsdl;

import org.nzdl.gsdl.GSGroupingComparator;
import org.nzdl.gsdl.GSInfoDB;
import org.nzdl.gsdl.GSPartitioner;

import java.io.*;
import java.lang.Iterable;
import java.lang.ProcessBuilder;
import java.lang.ProcessBuilder.*;
import java.lang.Thread;
import java.net.InetAddress;
import java.nio.channels.FileChannel;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.*;
/** @class HadoopGreenstoneIngest2
 */
public class HadoopGreenstoneIngest2
{

  /** @class GSFileRecordReader
   */
  public static class GSFileRecordReader
    extends RecordReader<Text, Text>
  {
    /** Uncompressed file name */
    private Text current_key;

    private Text current_value = new Text("");

    /** Used to indicate progress */
    private boolean is_finished = false;

    /** Record the path of the file backing this split, to use as the key
     */
    @Override
    public void initialize( InputSplit inputSplit, TaskAttemptContext taskAttemptContext )
      throws IOException, InterruptedException
    {
      FileSplit split = (FileSplit) inputSplit;
      current_key = new Text(split.getPath().toString());
    }
    /** initialize() **/

    /**
     * We only ever have a single key/value
     */
    @Override
    public boolean nextKeyValue()
      throws IOException, InterruptedException
    {
      if (!is_finished)
      {
        is_finished = true;
        return true;
      }
      return false;
    }
    /** nextKeyValue() **/

    /** @function getProgress
     * Rather than calculating progress, we just keep it simple
     */
    @Override
    public float getProgress()
      throws IOException, InterruptedException
    {
      return is_finished ? 1 : 0;
    }
    /** getProgress() **/

    /**
     * Returns the current key (the path of the file to process)
     */
    @Override
    public Text getCurrentKey()
      throws IOException, InterruptedException
    {
      return current_key;
    }
    /** getCurrentKey() **/

    /**
     * Returns the current value (always the empty string)
     */
    @Override
    public Text getCurrentValue()
      throws IOException, InterruptedException
    {
      return current_value;
    }
    /** getCurrentValue() **/

    /**
     * Close quietly, ignoring any exceptions
     */
    @Override
    public void close()
      throws IOException
    {
      // nothing to do
    }
    /** close() **/

  }
  /** GSFileRecordReader **/

  /** @class GSFileInputFormat
   */
  public static class GSFileInputFormat
    extends FileInputFormat<Text, Text>
  {
    /**
     * Don't split the files
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename)
    {
      return false;
    }
    /** isSplitable() **/

    /**
     */
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException
    {
      return new GSFileRecordReader();
    }
    /** createRecordReader() **/

  }
  /** class GSFileInputFormat **/

  /** @class GSMap
   */
  public static class GSMap
    extends Mapper<Text, Text, Text, Text>
  {
    /** @function map
     * The key is the full path (HDFS) of the file to be processed.
     */
    public void map(Text key, Text value, Context context)
      throws IOException, InterruptedException
    {
      String file_path = key.toString();
      // - configuration for the task
      Configuration conf = context.getConfiguration();
      String gsdlhome = conf.get("gsdlhome");
      String hdfs_prefix = conf.get("hdfsprefix");
      String hadoop_home = conf.get("hadoophome");
      String collection = conf.get("collection");
      String task_id = conf.get("mapred.task.id");
      task_id = task_id.substring(8); // remove "attempt_" prefix

      // Programmatically rewrite the protocol as appropriate for the given
      // archives directory (not necessary if the path is local or NFS)
      if (hdfs_prefix.equals("/hdfs"))
      {
        file_path = file_path.replaceFirst("hdfs://[^/]*", hdfs_prefix);
      }
      else
      {
        file_path = file_path.replace("hdfs://", hdfs_prefix);
      }

      // - create a temporary directory
      File greenstone_tmp_dir = new File("/tmp/greenstone");
      if (!greenstone_tmp_dir.isDirectory())
      {
        greenstone_tmp_dir.mkdir();
      }

      // - open a unique log file
      File import_process_log = new File("/tmp/greenstone/import-hadoop-" + task_id + ".log");
      FileWriter fw1 = new FileWriter(import_process_log, true);
      // MEDUSA Customization: Introduce a slight delay based upon the hostname
      // in order to stagger the startup of Map workers. It looks like the avg
      // IO is around 25 minutes... so let's try to make it so the last mapper
      // starts up 25 minutes after the first (with all others spread in
      // between).
      String hostname = InetAddress.getLocalHost().getHostName();
      // We only do this if there is a sentinel file lurking in tmp
      try
      {
        File delay_file = new File("/tmp/greenstone/delay.me");
        if (delay_file.exists())
        {
          Pattern p = Pattern.compile("compute-0-([0-9]+).local");
          Matcher m = p.matcher(hostname);
          if (m.matches())
          {
            String node_str = m.group(1);
            int node_number = Integer.parseInt(node_str) * 100;
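            // e.g. on a node named compute-0-5.local this yields 5 * 100, so
            // the sleep below pauses this mapper for 500 seconds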
            fw1.write("[DEBUG] Delaying start for " + node_number + " seconds\n");
            Thread.sleep(1000 * node_number);
          }
          // We only do this once for each compute node
          delay_file.delete();
        }
      }
      catch (Exception ie)
      {
        System.err.println(ie.toString());
      }

      // - start the log by writing the time and the manifest line
      double start_time = ((double)System.currentTimeMillis())/1000;
      StringBuffer header_block = new StringBuffer("[Started:");
      header_block.append(String.format("%.6f", start_time));
      header_block.append("]\n[Host:");
      header_block.append(hostname);
      header_block.append("]\n[CPU:");
      String getcpu_executable_cmd = gsdlhome + "/ext/parallel-building/linux/bin/getcpu";
      File getcpu_executable = new File(getcpu_executable_cmd);
      String cpu_number = "0";
      if (getcpu_executable.exists())
      {
        cpu_number = runCommand(getcpu_executable_cmd);
      }
      header_block.append(cpu_number);
      header_block.append("]\n[Task:");
      header_block.append(task_id);
      header_block.append("]\n[Map:");
      header_block.append(file_path);
      header_block.append("=>");
      header_block.append(value);
      header_block.append("]\n");
      fw1.write(header_block.toString());
      header_block = null;

      // - create a temporary manifest file to process this file. Overwrite any
      //   existing file
      File manifest_path = new File("/tmp/greenstone/manifest" + task_id + ".xml");
      FileWriter manifest_writer = new FileWriter(manifest_path);
      manifest_writer.write("<Manifest version=\"2.0\">\n");
      manifest_writer.write("\t<Index>\n");
      manifest_writer.write("\t\t<Filename>" + file_path + "</Filename>\n");
      manifest_writer.write("\t</Index>\n");
      manifest_writer.write("</Manifest>\n");
      manifest_writer.close();
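
      // The manifest written above has this shape (the path is illustrative):
      //   <Manifest version="2.0">
      //     <Index>
      //       <Filename>/hdfs/user/gsdl/collect/demo/import/doc.html</Filename>
      //     </Index>
      //   </Manifest>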

      // - call Greenstone passing in the path to the manifest
      ProcessBuilder import_process_builder = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-archivedir", conf.get("archivesdir"), collection);
      fw1.write("[Command:" + import_process_builder.command() + "]\n");
      // - alter environment
      Map<String, String> import_process_env = import_process_builder.environment();
      // - path
      String path = import_process_env.get("PATH");
      path = gsdlhome + "/ext/parallel-building/bin/script:" + path;
      path = gsdlhome + "/ext/parallel-building/linux/bin:" + path;
      path = hadoop_home + "/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/linux/bin:" + path;
      path = gsdlhome + "/ext/tdb-edit/bin/script:" + path;
      path = gsdlhome + "/ext/video-and-audio/linux/bin:" + path;
      path = gsdlhome + "/bin/script:" + path;
      path = gsdlhome + "/bin/linux:" + path;
      import_process_env.put("PATH", path);
      fw1.write("[PATH: " + path + "]\n");
      // - ld_library_path
      import_process_env.put("LD_LIBRARY_PATH", gsdlhome + "/ext/parallel-building/linux/lib:" + gsdlhome + "/ext/hadoop/linux/lib:" + gsdlhome + "/ext/video-and-audio/linux/lib:" + gsdlhome + "/ext/tdb-edit/linux/lib");
      // - dyld_library_path
      import_process_env.put("DYLD_LIBRARY_PATH", gsdlhome + "/ext/video-and-audio/linux/lib");
      // - misc
      import_process_env.put("GSDLHOME", gsdlhome);
      import_process_env.put("GSDLOS", "linux");
      import_process_env.put("GSDLEXTS", "parallel-building:tdb-edit:video-and-audio");
      // - installed extension paths
      import_process_env.put("GEXTPARALLELBUILDING", gsdlhome + "/ext/parallel-building");
      import_process_env.put("GEXTPARALLELBUILDING_INSTALLED", gsdlhome + "/ext/parallel-building/linux");
      import_process_env.put("GEXTTDBEDIT_INSTALLED", gsdlhome + "/ext/tdb-edit/linux");
      import_process_env.put("GEXTVIDEO_INSTALLED", gsdlhome + "/ext/video-and-audio/linux");
      // - Hadoop specific
      import_process_env.put("HADOOP_PREFIX", hadoop_home);
      fw1.write("[HADOOP_PREFIX: " + hadoop_home + "]\n");

      // - change working directory
      import_process_builder.directory(new File(gsdlhome));

      // - redirect STDERR to STDOUT for simplicity's sake
      import_process_builder.redirectErrorStream(true);
      // Obsolete command to send output to file
      //import_process_builder.redirectOutput(Redirect.appendTo(import_process_log));

      // - create progress reporter (so Hadoop doesn't time us out)
      Thread reporter = new HadoopProgressReporter(context, import_process_log);
      reporter.start();

      // - run process
      Process import_process = import_process_builder.start();
      BufferedReader import_process_br = new BufferedReader(new InputStreamReader(import_process.getInputStream()));
      String line = "";
      Pattern open_tag_pattern = Pattern.compile("<InfoDBEntry type=\"(.+?)\" key=\"(.+?)\" mode=\"(.+?)\" timestamp=\"(.+?)\">(.*?(</InfoDBEntry>)?)");
      Pattern close_tag_pattern = Pattern.compile("(.*?)</InfoDBEntry>");
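      // Illustrative sentinel line matched by open_tag_pattern (the attribute
      // values here are hypothetical); the payload may also run on over
      // several lines, in which case the loop below reads until it finds the
      // closing tag:
      //   <InfoDBEntry type="doc" key="HASH0123" mode="add" timestamp="1369108897">payload</InfoDBEntry>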
      while ((line = import_process_br.readLine()) != null)
      {
        // Write line to process log regardless
        fw1.write(line + "\n");
        // Now we check for sentinel strings in the output line
        Text output_key;
        Text output_value;
        // Watch for open entry tags
        Matcher open_tag_matcher = open_tag_pattern.matcher(line);
        if (open_tag_matcher.matches())
        {
          String entry_type = open_tag_matcher.group(1);
          String entry_key = open_tag_matcher.group(2);
          String entry_mode = open_tag_matcher.group(3);
          String entry_time = open_tag_matcher.group(4);
          String payload = open_tag_matcher.group(5);
          StringBuffer line_buffer = new StringBuffer();
          // Continue until we've found the close tag - or run out of output log
          Matcher close_tag_matcher = close_tag_pattern.matcher(payload);
          while (!close_tag_matcher.matches() && (line = import_process_br.readLine()) != null)
          {
            // append any existing payload to the buffer
            if (line_buffer.length() > 0)
            {
              line_buffer.append("\n");
            }
            line_buffer.append(payload);
            // store this line as the payload, should the match below fail
            payload = line;
            close_tag_matcher = close_tag_pattern.matcher(line);
          }
          // We've found the close tag (hopefully) so add the last bit
          // (possibly an empty string) to the value
          if (close_tag_matcher.matches())
          {
            String last_payload = close_tag_matcher.group(1);
            if (line_buffer.length() > 0)
            {
              line_buffer.append("\n");
            }
            line_buffer.append(last_payload);
            last_payload = null;
          }
          close_tag_matcher = null;

          // Construct the compound key by which to sort the entries - note
          // that src is a little different from the others, as we don't care
          // about the timestamp at all - instead wanting to group the entries
          // by src file path (key)
          if (entry_type.equals("src"))
          {
            output_key = new Text(entry_type + " " + entry_key);
          }
          else
          {
            output_key = new Text(entry_type + " " + entry_time);
          }
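          // e.g. a src entry keys as "src import/doc.html" (grouped by source
          // path) while a doc entry keys as "doc 1369108897" (ordered by
          // timestamp); the values shown are hypothetical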
          // Doc has its payload prefixed by key
          if (entry_type.equals("doc"))
          {
            String encoded_xml = line_buffer.toString();
            String decoded_xml = encoded_xml.replace("&quot;", "\"");
            decoded_xml = decoded_xml.replace("&apos;", "'");
            decoded_xml = decoded_xml.replace("&lt;", "<");
            decoded_xml = decoded_xml.replace("&gt;", ">");
            decoded_xml = decoded_xml.replace("&amp;", "&");
            output_value = new Text("[" + entry_key + "]\n" + decoded_xml);
          }
          else if (entry_type.equals("src"))
          {
            String encoded_xml = line_buffer.toString();
            String decoded_xml = encoded_xml.replace("&quot;", "\"");
            decoded_xml = decoded_xml.replace("&apos;", "'");
            decoded_xml = decoded_xml.replace("&lt;", "<");
            decoded_xml = decoded_xml.replace("&gt;", ">");
            decoded_xml = decoded_xml.replace("&amp;", "&");
            output_value = new Text(entry_key + "|" + decoded_xml);
          }
          else if (entry_type.equals("rss"))
          {
            String encoded_xml = line_buffer.toString();
            String decoded_xml = encoded_xml.replace("&quot;", "\"");
            decoded_xml = decoded_xml.replace("&apos;", "'");
            decoded_xml = decoded_xml.replace("&lt;", "<");
            decoded_xml = decoded_xml.replace("&gt;", ">");
            decoded_xml = decoded_xml.replace("&amp;", "&");
            output_value = new Text(decoded_xml);
          }
          else
          {
            output_value = new Text(line_buffer.toString());
          }
          entry_type = null;
          entry_key = null;
          entry_mode = null;
          entry_time = null;
          payload = null;
          line_buffer = null;
        }
        // all other lines we'll key by host and time
        else
        {
          double timestamp = ((double)System.currentTimeMillis())/1000;
          String argument1 = hostname + ":" + cpu_number + ":" + String.format("%.6f", timestamp);
          output_key = new Text("msg " + argument1);
          output_value = new Text(argument1 + " " + line);
        }

        // Store the result in the return context
        context.write(output_key, output_value);

        // May as well do this here - indicate to the Hadoop framework that
        // this process is still making progress (of some form)
        context.progress();

        // Cleanup
        open_tag_matcher = null;
        output_key = null;
        output_value = null;
      }
      open_tag_pattern = null;
      close_tag_pattern = null;
      try
      {
        int import_status = import_process.waitFor();
        if (import_status != 0)
        {
          throw new Exception("exit status: " + import_status);
        }
      }
      catch (Exception e)
      {
        System.err.println("Error! Import command failed (" + e.toString() + ")");
      }

      // - stop the progress reporter as, one way or another, there will be no
      //   more progress
      reporter.interrupt();
      reporter = null; // force gc

      // - write end time to log
      double end_time = ((double)System.currentTimeMillis())/1000;
      fw1.write("[Completed:" + String.format("%.6f", end_time) + "]\n");
      // - close our output to the log
      fw1.close();
    }
    /** map(Text,Text,Context) **/

  }
  /** class GSMap **/

  /** @class GSReducer
   */
  public static class GSReducer
    extends Reducer<Text, Text, Text, Text>
  {

    /** Prepare the Reducer by looking up configuration for this collection to
     * determine the appropriate tools to use to create databases.
     */
    public void setup(Context context)
    {
    }
    /** setup() **/

    /**
     */
    public void reduce(Text key, Iterable<Text> values, Context context)
      throws IOException, InterruptedException
    {
      System.err.println("reduce(" + key.toString() + ", <values>, <context>)");
      Configuration conf = context.getConfiguration();
      String gsdl_home = conf.get("gsdlhome");
      String collection = conf.get("collection");
      String archives_dir = gsdl_home + "/collect/" + collection + "/archives";
      //String archives_dir = conf.get("archivesdir");
      // Eventually I'd like to read in the database type from collect.cfg - or
      // maybe have it passed in as part of the context - but I'll hardcode it
      // as TDB for now as a proof of concept
      String infodbtype = "tdb";

      String key_string = key.toString();
      String[] key_parts = key_string.split(" ");
      if (key_parts.length >= 2)
      {
        String type = key_parts[0];
        String argument = key_parts[1];

        Iterator<Text> values_itr = values.iterator();
        // There are basically five different cases based on the key's type
        // - the first is datestamp... these are sorted earliest first, and
        //   since we only want the earliest we can ignore the rest!
        if (type.equals("datestamp"))
        {
          Text earliest_datestamp = values_itr.next();
          // Write this directly to file
          try
          {
            FileWriter earliest_datestamp_fout = new FileWriter(archives_dir + "/earliestDatestamp");
            earliest_datestamp_fout.write(earliest_datestamp.toString());
            earliest_datestamp_fout.close();
          }
          catch (Exception ex)
          {
            ex.printStackTrace();
          }
        }
        // For 'doc' types we open a pipe to the database creator and send all
        // the values through for processing
        else if (type.equals("doc"))
        {
          GSInfoDB archiveinf_doc = new GSInfoDB(gsdl_home, infodbtype, archives_dir + "/archiveinf-doc." + infodbtype);
          while (values_itr.hasNext())
          {
            Text db_entry = values_itr.next();
            archiveinf_doc.writeEntry(db_entry.toString());
          }
          archiveinf_doc.close();
        }
        // Similarly for 'src' types, except here we group all entries with the
        // same key
        else if (type.equals("src"))
        {
          // Sigh - you can only create the TDB file on a local filesystem, so
          // I have to create it here, and then move it into place when
          // finished
          GSInfoDB archiveinf_src = new GSInfoDB(gsdl_home, infodbtype, archives_dir + "/archiveinf-src." + infodbtype);
          String current_file_path = "";
          StringBuffer current_record = new StringBuffer();
          Pattern file_path_pattern = Pattern.compile("(.*?)\\|(.*)");
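          // Each incoming value has the form "<file path>|<record fragment>",
          // as emitted by the mapper's src branch above, so group(1) is the
          // source file path and group(2) the record fragment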
          while (values_itr.hasNext())
          {
            Text db_entry_raw = values_itr.next();
            String db_entry = db_entry_raw.toString();
            // Parse out the file path this entry refers to
            Matcher file_path_matcher = file_path_pattern.matcher(db_entry);
            if (file_path_matcher.matches())
            {
              String this_file_path = file_path_matcher.group(1);
              String this_record = file_path_matcher.group(2);
              // Output the record (if there is one) if the file path changes
              if (!this_file_path.equals(current_file_path) && current_record.length() > 0)
              {
                archiveinf_src.writeEntry("[" + current_file_path + "]\n" + current_record.toString());
                // store the next record's details
                current_file_path = this_file_path;
                current_record = new StringBuffer(this_record);
              }
              // Append onto our growing record
              else
              {
                current_file_path = this_file_path;
                if (current_record.length() > 0)
                {
                  current_record.append("\n");
                }
                current_record.append(this_record);
              }
            }
            else
            {
              // Not a valid src entry?
            }
          }
          if (!current_file_path.equals("") && current_record.length() > 0)
          {
            archiveinf_src.writeEntry("[" + current_file_path + "]\n" + current_record.toString());
          }
          archiveinf_src.close();
        }
        // For 'rss' we write all the entries - in order - to an XML file
        else if (type.equals("rss"))
        {
          try
          {
            FileWriter rss_item_rdf = new FileWriter(archives_dir + "/rss-items.rdf");
            while (values_itr.hasNext())
            {
              Text rss_entry = values_itr.next();
              rss_item_rdf.write(rss_entry.toString() + "\n");
            }
            rss_item_rdf.close();
          }
          catch (Exception ex)
          {
            ex.printStackTrace();
          }
        }
        // Everything else we assume to be process log messages - so get
        // Hadoop to write them, in order, to the log (I may need to annotate
        // these with the host so I can see which message came from which
        // compute node).
        else
        {
          Pattern msg_pattern = Pattern.compile("([^\\s]+) (.*)");
          while (values_itr.hasNext())
          {
            String compound_value = values_itr.next().toString();
            Matcher m = msg_pattern.matcher(compound_value);
            if (m.matches())
            {
              Text msg_key = new Text(m.group(1));
              Text msg_value = new Text(m.group(2));
              context.write(msg_key, msg_value);
            }
          }
        }
      }
      else
      {
        System.err.println("Error! Failed to parse key: " + key.toString());
      }
    }
    /** reduce(key, value, context) **/

  }
  /** class GSReducer **/

  /** @function main
   */
  public static void main(String[] args)
    throws Exception
  {
    if (args.length < 7)
    {
      System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest2 <gsdlhome> <hadoophome> <collection> <archivesdir> <hdfsprefix> <hdfsin> <hdfsout>\n");
      System.exit(0);
    }

    Configuration conf = new Configuration();
    conf.set("gsdlhome", args[0]);
    conf.set("hadoophome", args[1]);
    conf.set("collection", args[2]);
    conf.set("archivesdir", args[3]);
    conf.set("hdfsprefix", args[4]); // "HDThriftFS", "HDFSShell", or ""
    conf.set("hdfsin", args[5]);
    conf.set("hdfsout", args[6]);

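    // Illustrative invocation (all paths and names hypothetical):
    //   bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest2 \
    //     /opt/greenstone /opt/hadoop demo /opt/greenstone/collect/demo/archives \
    //     HDThriftFS /user/gsdl/demo-import /user/gsdl/demo-out
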
    // Set the number of retries to 1 - hopefully one of the following will work
    conf.setInt("mapred.map.max.attempts", 1); // Old Hadoop
    conf.setInt("mapreduce.map.maxattempts", 1); // Hadoop 2.0.3-alpha
    conf.setInt("mapreduce.map.max.attempts", 1); // Solution on Web
    // prevent timeouts
    long milli_seconds = 4*60*60*1000; // 4 hours
    conf.setLong("mapred.task.timeout", milli_seconds);

    Job job = new Job(conf, "hadoopgreenstoneingest");
    job.setJarByClass(HadoopGreenstoneIngest2.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Register the map, combiner, and reducer classes
    job.setMapperClass(GSMap.class);
    job.setPartitionerClass(GSPartitioner.class);
    job.setGroupingComparatorClass(GSGroupingComparator.class);
    job.setReducerClass(GSReducer.class);

    // Sets the input and output handlers - may need to adjust input to provide me
    // a series of filenames (TextInputFormat will instead read in a text file and
    // return each line...)
    job.setInputFormatClass(GSFileInputFormat.class);
    //job.setOutputFormatClass(NullOutputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Register the input and output paths
    // - this input path should be to a file (in HDFS) that lists the paths to
    //   the manifest files
    FileInputFormat.setInputPaths(job, new Path(conf.get("hdfsin")));
    // - for now the output isn't that important, but in the future I may use
    //   this mechanism to produce a time based log.
    FileOutputFormat.setOutputPath(job, new Path(conf.get("hdfsout")));

    // Recommended notation despite my hatred of ?: syntax
    System.exit(job.waitForCompletion(true)?0:1);
  }
  /** main(String[]) **/

  /** @function copyFile(File, File)
   *
   * @author Josh Froelich @ stackoverflow.com
   */
  public static void copyFile(File sourceFile, File destFile)
    throws IOException
  {
    if (!destFile.exists())
    {
      destFile.createNewFile();
    }
    FileChannel source = null;
    FileChannel destination = null;
    try
    {
      source = new FileInputStream(sourceFile).getChannel();
      destination = new FileOutputStream(destFile).getChannel();
      destination.transferFrom(source, 0, source.size());
    }
    finally
    {
      if (source != null)
      {
        source.close();
      }
      if (destination != null)
      {
        destination.close();
      }
    }
  }
  /** copyFile(File, File) **/


  /** @function runCommand()
   *
   * A convenience method that calls an external command and returns its
   * standard out as a string. Warning! Not safe if the command could return a
   * large amount of text in the STDERR stream - may infinitely block.
   *
   */
  public static String runCommand(String command)
  {
    ///ystem.err.println("[DEBUG] command: " + command);
    StringBuffer result = new StringBuffer();
    try
    {
      Runtime run = Runtime.getRuntime();
      Process pr = run.exec(command);
      pr.waitFor();
      BufferedReader buf = new BufferedReader(new InputStreamReader(pr.getInputStream()));
      String line;
      while ((line = buf.readLine()) != null)
      {
        result.append(line);
      }
    }
    catch (Exception ex)
    {
      System.err.println("Error! " + ex.getMessage());
    }
    ///ystem.err.println("[DEBUG] result: " + result.toString());
    return result.toString();
  }
  /** runCommand() **/
}