Changeset 27641

Timestamp:
18.06.2013 10:12:53
Author:
jmt12
Message:

Altered the order of arguments and allowed the archives directory to be passed in as an argument; both changes were needed to support NFS access to HDFS.
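With the archives directory passed in explicitly, import.pl can write straight into an HDFS tree exposed over NFS instead of into a hard-coded hdfs:// location. A hypothetical invocation under the new argument order (every path below is an assumed example, not a value from the changeset):

    bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest \
        /home/jmt12/gsdl /opt/hadoop test \
        /hdfs/user/jmt12/gsdl/collect/test/archives \
        /hdfs hdfs:///user/jmt12/manifest-list.txt hdfs:///user/jmt12/ingest-out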

Files:
1 modified

  • gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest.java

--- gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest.java (r27571)
+++ gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest.java (r27641)

@@ -165,11 +165,20 @@
       String gsdlhome = conf.get("gsdlhome");
       String hdfs_prefix = conf.get("hdfsprefix");
-      String hadoop_prefix = conf.get("hadoopprefix");
+      String hadoop_home = conf.get("hadoophome");
       String collection = conf.get("collection");
       String task_id = conf.get("mapred.task.id");
       task_id = task_id.substring(8); // remove "attempt_" prefix
+
       // Programatically rewrite the protocol as appropriate for the given
-      // archives directory
-      file_path = file_path.replace("hdfs://", hdfs_prefix);
+      // archives directory (not necessary if path is local or NFS)
+      if (hdfs_prefix.equals("/hdfs"))
+      {
+        file_path = file_path.replaceFirst("hdfs://[^/]*", hdfs_prefix);
+      }
+      else
+      {
+        file_path = file_path.replace("hdfs://", hdfs_prefix);
+      }
+
       // - create a temporary directory
       File greenstone_tmp_dir = new File("/tmp/greenstone");
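The new regex branch matters because an NFS mount point such as /hdfs stands in for both the hdfs:// scheme and the host:port authority, while the shim protocols only swap the scheme. A minimal sketch of the two behaviours; the namenode address and file path are hypothetical:

    // Demonstrates the two rewrite branches; "namenode:9000" and the path
    // are invented examples, not values from this changeset
    public class PrefixRewriteDemo
    {
      public static void main(String[] args)
      {
        String file_path = "hdfs://namenode:9000/user/jmt12/gsdl/collect/test/import/doc.xml";

        // NFS mount: strip the scheme and the authority in one go
        System.out.println(file_path.replaceFirst("hdfs://[^/]*", "/hdfs"));
        // prints /hdfs/user/jmt12/gsdl/collect/test/import/doc.xml

        // Shim (e.g. HDThriftFS): substitute the scheme, keep the authority
        System.out.println(file_path.replace("hdfs://", "HDThriftFS://"));
        // prints HDThriftFS://namenode:9000/user/jmt12/gsdl/collect/test/import/doc.xml
      }
    }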
     
@@ -178,4 +187,5 @@
         greenstone_tmp_dir.mkdir();
       }
+
       // - open a unique log file
       File import_process_log = new File("/tmp/greenstone/import-hadoop-" + task_id + ".log");
     
@@ -220,6 +230,5 @@
 
       // - call Greenstone passing in the path to the manifest
-      ProcessBuilder import_process_builder
-        = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-verbosity", "42", "-archivedir", hdfs_prefix + "/user/jmt12/gsdl/collect/" + collection + "/archives", collection);
+      ProcessBuilder import_process_builder = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-archivedir", conf.get("archivesdir"), collection);
       fw1.write("[Command:" + import_process_builder.command() + "]\n");
       // - alter environment
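For orientation, a condensed sketch of how the import command is assembled after this change; the manifest path, archives directory, and collection name are assumed stand-ins for values the mapper computes at runtime:

    import java.io.IOException;
    import java.util.Map;

    public class ImportLaunchSketch
    {
      public static void main(String[] args) throws IOException, InterruptedException
      {
        // archivesdir now comes from the job configuration instead of being
        // derived from hdfs_prefix plus a hard-coded user path
        String archives_dir = "/hdfs/user/jmt12/gsdl/collect/test/archives"; // assumed NFS path
        ProcessBuilder pb = new ProcessBuilder("time", "-p", "import.pl",
            "-manifest", "/tmp/greenstone/manifest-test.xml", // hypothetical manifest
            "-keepold", "-archivedir", archives_dir, "test");
        Map<String, String> env = pb.environment();
        env.put("HADOOP_PREFIX", "/opt/hadoop"); // assumed hadoop home
        Process import_process = pb.start();
        import_process.waitFor();
      }
    }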
     
@@ -229,5 +238,5 @@
       path = gsdlhome + "/ext/parallel-building/bin/script:" + path;
       path = gsdlhome + "/ext/parallel-building/linux/bin:" + path;
-      path = hadoop_prefix + "/bin:" + path;
+      path = hadoop_home + "/bin:" + path;
       path = gsdlhome + "/ext/tdb-edit/linux/bin:" + path;
       path = gsdlhome + "/ext/tdb-edit/bin/script:" + path;
     
@@ -251,6 +260,6 @@
       import_process_env.put("GEXTVIDEO_INSTALLED", gsdlhome + "/ext/video-and-audio/linux");
       // - Hadoop specific
-      import_process_env.put("HADOOP_PREFIX", hadoop_prefix);
-      fw1.write("[HADOOP_PREFIX: " + hadoop_prefix + "]\n");
+      import_process_env.put("HADOOP_PREFIX", hadoop_home);
+      fw1.write("[HADOOP_PREFIX: " + hadoop_home + "]\n");
 
       // - change working directory
     
@@ -310,13 +319,17 @@
     if (args.length < 6)
     {
-      System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hdfsprefix> <hadoop prefix> <collection> <hdfsin> <hdfsout>\n");
+      System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hadoop home> <collection> <archivesdir> <hdfsprefix> <hdfsin> <hdfsout>\n");
       System.exit(0);
     }
 
     Configuration conf = new Configuration();
-    conf.set("gsdlhome", args[0]);
-    conf.set("hdfsprefix", args[1]); // HDThriftFS or HDFSShell
-    conf.set("hadoopprefix", args[2]);
-    conf.set("collection", args[3]);
+    conf.set("gsdlhome",    args[0]);
+    conf.set("hadoophome",  args[1]);
+    conf.set("collection",  args[2]);
+    conf.set("archivesdir", args[3]);
+    conf.set("hdfsprefix",  args[4]); // "HDThriftFS", "HDFSShell", or ""
+    conf.set("hdfsin",      args[5]);
+    conf.set("hdfsout",     args[6]);
+
     // Set the number of retries to 1 - hopefully one of the following will work
     conf.setInt("mapred.map.max.attempts", 1); // Old Hadoop
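One caveat worth noting: the guard still tests args.length < 6 although args[6] is now consumed, so a launch with exactly six arguments would fail with an ArrayIndexOutOfBoundsException instead of printing the usage. Presumably the check should read:

    if (args.length < 7)
    {
      System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hadoop home> <collection> <archivesdir> <hdfsprefix> <hdfsin> <hdfsout>\n");
      System.exit(0);
    }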
     
@@ -348,8 +361,8 @@
     // - this input path should be to a file (in HDFS) that lists the paths to
     //   the manifest files
-    FileInputFormat.setInputPaths(job, new Path(args[4]));
+    FileInputFormat.setInputPaths(job, new Path(conf.get("hdfsin")));
     // - for now the output isn't that important, but in the future I may use
     //   this mechanism to produce a time based log.
-    FileOutputFormat.setOutputPath(job, new Path(args[5]));
+    FileOutputFormat.setOutputPath(job, new Path(conf.get("hdfsout")));
 
     // Recommended notation despite my hatiness of ?: syntax