Changeset 27413

Show
Ignore:
Timestamp:
24.05.2013 09:23:29 (6 years ago)
Author:
jmt12
Message:

Making the Hadoop processing aware that there are two different possible prefixes for HDFS-based files (each using a different 'driver' under the new FileUtils system)

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest.java

    r27102 r27413  
    164164      Configuration conf = context.getConfiguration(); 
    165165      String gsdlhome = conf.get("gsdlhome"); 
    166       String hdfs_host = conf.get("hdfshost"); 
    167       String hdfs_port = conf.get("hdfsport"); 
     166      String hdfs_prefix = conf.get("hdfsprefix"); 
    168167      String hadoop_prefix = conf.get("hadoopprefix"); 
    169168      String collection = conf.get("collection"); 
     
    199198      // - call Greenstone passing in the path to the manifest 
    200199      ProcessBuilder import_process_builder 
    201         = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-verbosity", "42", "-archivedir", "hdfs://" + hdfs_host + ":" + hdfs_port + "/user/jmt12/gsdl/collect/" + collection + "/archives", collection); 
     200        = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-verbosity", "42", "-archivedir", hdfs_prefix + "/user/jmt12/gsdl/collect/" + collection + "/archives", collection); 
    202201      fw1.write("[Command:" + import_process_builder.command() + "]\n"); 
    203202      // - alter environment 
     
    284283    if (args.length < 6) 
    285284    { 
    286       System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hdfs host> <hdfs port> <hadoop prefix> <collection> <hdfsin> <hdfsout>\n"); 
     285      System.out.println("Usage: bin/hadoop jar hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest <gsdlhome> <hdfsprefix> <hadoop prefix> <collection> <hdfsin> <hdfsout>\n"); 
    287286      System.exit(0); 
    288287    } 
     
    290289    Configuration conf = new Configuration(); 
    291290    conf.set("gsdlhome", args[0]); 
    292     conf.set("hdfshost", args[1]); 
    293     conf.set("hdfsport", args[2]); 
    294     conf.set("hadoopprefix", args[3]); 
    295     conf.set("collection", args[4]); 
     291    conf.set("hdfsprefix", args[1]); // HDThriftFS or HDFSShell 
     292    conf.set("hadoopprefix", args[2]); 
     293    conf.set("collection", args[3]); 
    296294    // Set the number of retries to 1 - hopefully one of the following will work 
    297295    conf.setInt("mapred.map.max.attempts", 1); // Old Hadoop 
     
    323321    // - this input path should be to a file (in HDFS) that lists the paths to 
    324322    //   the manifest files 
    325     FileInputFormat.setInputPaths(job, new Path(args[5])); 
     323    FileInputFormat.setInputPaths(job, new Path(args[4])); 
    326324    // - for now the output isn't that important, but in the future I may use 
    327325    //   this mechanism to produce a time based log. 
    328     FileOutputFormat.setOutputPath(job, new Path(args[6])); 
     326    FileOutputFormat.setOutputPath(job, new Path(args[5])); 
    329327 
    330328    // Recommended notation despite my hatiness of ?: syntax