Changeset 27641 for gs2-extensions/parallel-building/trunk
- Timestamp: 2013-06-18T10:12:53+12:00 (11 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest.java
r27571 r27641 165 165 String gsdlhome = conf.get("gsdlhome"); 166 166 String hdfs_prefix = conf.get("hdfsprefix"); 167 String hadoop_ prefix = conf.get("hadoopprefix");167 String hadoop_home = conf.get("hadoophome"); 168 168 String collection = conf.get("collection"); 169 169 String task_id = conf.get("mapred.task.id"); 170 170 task_id = task_id.substring(8); // remove "attempt_" prefix 171 171 172 // Programatically rewrite the protocol as appropriate for the given 172 // archives directory 173 file_path = file_path.replace("hdfs://", hdfs_prefix); 173 // archives directory (not necessary if path is local or NFS) 174 if (hdfs_prefix.equals("/hdfs")) 175 { 176 file_path = file_path.replaceFirst("hdfs://[^/]*", hdfs_prefix); 177 } 178 else 179 { 180 file_path = file_path.replace("hdfs://", hdfs_prefix); 181 } 182 174 183 // - create a temporary directory 175 184 File greenstone_tmp_dir = new File("/tmp/greenstone"); … … 178 187 greenstone_tmp_dir.mkdir(); 179 188 } 189 180 190 // - open a unique log file 181 191 File import_process_log = new File("/tmp/greenstone/import-hadoop-" + task_id + ".log"); … … 220 230 221 231 // - call Greenstone passing in the path to the manifest 222 ProcessBuilder import_process_builder 223 = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-verbosity", "42", "-archivedir", hdfs_prefix + "/user/jmt12/gsdl/collect/" + collection + "/archives", collection); 232 ProcessBuilder import_process_builder = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-archivedir", conf.get("archivesdir"), collection); 224 233 fw1.write("[Command:" + import_process_builder.command() + "]\n"); 225 234 // - alter environment … … 229 238 path = gsdlhome + "/ext/parallel-building/bin/script:" + path; 230 239 path = gsdlhome + "/ext/parallel-building/linux/bin:" + path; 231 path = hadoop_ prefix+ "/bin:" + path;240 path = hadoop_home + "/bin:" + path; 232 241 path = 
gsdlhome + "/ext/tdb-edit/linux/bin:" + path; 233 242 path = gsdlhome + "/ext/tdb-edit/bin/script:" + path; … … 251 260 import_process_env.put("GEXTVIDEO_INSTALLED", gsdlhome + "/ext/video-and-audio/linux"); 252 261 // - Hadoop specific 253 import_process_env.put("HADOOP_PREFIX", hadoop_ prefix);254 fw1.write("[HADOOP_PREFIX: " + hadoop_ prefix+ "]\n");262 import_process_env.put("HADOOP_PREFIX", hadoop_home); 263 fw1.write("[HADOOP_PREFIX: " + hadoop_home + "]\n"); 255 264 256 265 // - change working directory … … 310 319 if (args.length < 6) 311 320 { 312 System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <h dfsprefix> <hadoop prefix> <collection> <hdfsin> <hdfsout>\n");321 System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hadoop home> <collection> <archivesdir> <hdfsprefix> <hdfsin> <hdfsout>\n"); 313 322 System.exit(0); 314 323 } 315 324 316 325 Configuration conf = new Configuration(); 317 conf.set("gsdlhome", args[0]); 318 conf.set("hdfsprefix", args[1]); // HDThriftFS or HDFSShell 319 conf.set("hadoopprefix", args[2]); 320 conf.set("collection", args[3]); 326 conf.set("gsdlhome", args[0]); 327 conf.set("hadoophome", args[1]); 328 conf.set("collection", args[2]); 329 conf.set("archivesdir", args[3]); 330 conf.set("hdfsprefix", args[4]); // "HDThriftFS", "HDFSShell", or "" 331 conf.set("hdfsin", args[5]); 332 conf.set("hdfsout", args[6]); 333 321 334 // Set the number of retries to 1 - hopefully one of the following will work 322 335 conf.setInt("mapred.map.max.attempts", 1); // Old Hadoop … … 348 361 // - this input path should be to a file (in HDFS) that lists the paths to 349 362 // the manifest files 350 FileInputFormat.setInputPaths(job, new Path( args[4]));363 FileInputFormat.setInputPaths(job, new Path(conf.get("hdfsin"))); 351 364 // - for now the output isn't that important, but in the future I may use 352 365 // this mechanism to 
produce a time based log. 353 FileOutputFormat.setOutputPath(job, new Path( args[5]));366 FileOutputFormat.setOutputPath(job, new Path(conf.get("hdfsout"))); 354 367 355 368 // Recommended notation despite my hatiness of ?: syntax
Note: See TracChangeset for help on using the changeset viewer.