Changeset 26961


Ignore:
Timestamp:
2013-02-26T11:38:47+13:00 (11 years ago)
Author:
jmt12
Message:

Several file handling functions extended to be aware of and support HDFS files. This means noticing paths starting with hdfs:// protocol (and preserving that protocol through filename_cat() etc) and calling 'hadoop fs <action>' to handle them appropriately. The most tricksie functions are those that open filehandles and get file status

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/perllib/util.pm

    r24626 r26961  
    4646    # and are regular files
    4747    foreach my $file (@files) {
     48
     49      if (&util::isHDFS($file))
     50      {
     51        &util::executeHDFSCommand('rm', $file);
     52      }
     53      else
     54      {
    4855    if (!-e $file) {
    4956        print STDERR "util::rm $file does not exist\n";
     
    5360        push (@filefiles, $file);
    5461    }
    55     }
    56    
     62      }
     63    }
     64
    5765    # remove the files
    5866    my $numremoved = unlink @filefiles;
     
    96104    # recursively remove the files
    97105    foreach my $file (@files_array) {
     106
     107      # HDFS support
     108      if (&util::isHDFS($file))
     109      {
     110        # HDFS doesn't really lend itself to a choosy delete, unless you want
     111        # it to be really, really, unbearably slow.
     112        &util::executeHDFSCommand('rmr', $file);
     113        next;
     114      }
     115
     116
    98117    $file =~ s/[\/\\]+$//; # remove trailing slashes
    99118   
     
    133152
    134153# recursive removal
    135 sub rm_r {
    136     my (@files) = @_;
    137    
    138     # use the more general (but reterospectively written function
    139     # filtered_rm_r function()
    140 
    141     filtered_rm_r(\@files,undef,undef); # no accept or reject expressions
     154sub rm_r
     155{
     156  my (@files) = @_;
     157  # use the more general (but reterospectively written function
     158  # filtered_rm_r function()
     159  filtered_rm_r(\@files,undef,undef); # no accept or reject expressions
    142160}
    143161
     
    149167    my $dest = pop (@_);
    150168    my (@srcfiles) = @_;
     169
     170    # moving a file within or into HDFS
     171    if (&util::isHDFS($dest))
     172    {
     173      foreach my $src (@srcfiles)
     174      {
     175        if (&util::isHDFS($src))
     176        {
     177          &util::executeHDFSCommand('mv', $src, $dest);
     178        }
     179        else
     180        {
     181          &util::executeHDFSCommand('put', $src, $dest);
     182          &util::rm_r($src);
     183        }
     184      }
     185      return;
     186    }
    151187
    152188    # remove trailing slashes from source and destination files
     
    166202    # move the files
    167203    foreach my $file (@srcfiles) {
     204
     205      # moving a file out of HDFS
     206      if (&util::isHDFS($file))
     207      {
     208        &util::executeHDFSCommand('get', $file, $dest);
     209        &util::rm_r($file);
     210        next;
     211      }
     212
    168213    my $tempdest = $dest;
    169214    if (-d $tempdest) {
     
    174219        print STDERR "util::mv $file does not exist\n";
    175220    } else {
    176         rename ($file, $tempdest);
    177     }
    178     }
     221        if(!rename ($file, $tempdest)) {
     222        print STDERR "**** Failed to rename $file to $tempdest\n";
     223        &File::Copy::copy($file, $tempdest);
     224        &rm($file);
     225        }
     226        elsif(-e $file) { # rename (partially) succeeded) but srcfile still exists after rename
     227        #print STDERR "*** srcfile $file still exists after rename to $tempdest\n";
     228        if(!-e $tempdest) {
     229            print STDERR "@@@@ ERROR: $tempdest does not exist\n";
     230        }
     231        # Sometimes the rename operation fails (as does File::Copy::move).
     232        # This turns out to be because the files are hardlinked.
     233        # Need to do a copy-delete in this case, however, the copy step is not necessary:
     234        # the srcfile got renamed into tempdest, but srcfile itself still exists, delete it.
     235        #&File::Copy::copy($file, $tempdest);
     236
     237        &rm($file);     
     238        }
     239    }
     240    }
     241}
     242
     243# Move the contents of source directory into target directory
     244# (as opposed to merely replacing target dir with the src dir)
     245# This can overwrite any files with duplicate names in the target
     246# but other files and folders in the target will continue to exist
     247sub mv_dir_contents {
     248    my ($src_dir, $dest_dir) = @_;
     249   
     250    # Obtain listing of all files within src_dir
     251    # Note that readdir lists relative paths, as well as . and ..
     252    opendir(DIR, "$src_dir");
     253    my @files= readdir(DIR);
     254    close(DIR);
     255   
     256    my @full_path_files = ();
     257    foreach my $file (@files) {
     258        # process all except . and ..
     259        unless($file eq "." || $file eq "..") {
     260           
     261            my $dest_subdir = &filename_cat($dest_dir, $file); # $file is still a relative path
     262       
     263            # construct absolute paths
     264            $file = &filename_cat($src_dir, $file); # $file is now an absolute path
     265           
     266            # Recurse on directories which have an equivalent in target dest_dir
     267            # If $file is a directory that already exists in target $dest_dir,
     268            # then a simple move operation will fail (definitely on Windows).
     269            if(-d $file && -d $dest_subdir) {
     270                #print STDERR "**** $file is a directory also existing in target, its contents to be copied to $dest_subdir\n";
     271                &mv_dir_contents($file, $dest_subdir);
     272               
     273                # now all content is moved across, delete empty dir in source folder
     274                if(&is_dir_empty($file)) {
     275                    if (!rmdir $file) {
     276                        print STDERR "ERROR. util::mv_dir_contents couldn't remove directory $file\n";
     277                    }
     278                } else { # error
     279                    print STDERR "ERROR. util::mv_dir_contents: subfolder $file still non-empty after moving contents to $dest_subdir\n";
     280                }
     281            } else { # process files and any directories that don't already exist with a simple move
     282                push(@full_path_files, $file);
     283            }           
     284        }
     285    }
     286   
     287    if(!&dir_exists($dest_dir)) { # create target toplevel folder or subfolders if they don't exist
     288        &mk_dir($dest_dir);
     289    }
     290
     291    #print STDERR "@@@@@ Copying files |".join(",", @full_path_files)."| to: $dest_dir\n";
     292
     293    if(@full_path_files) { # if non-empty, there's something to copy across
     294        &mv(@full_path_files, $dest_dir);
     295    }
    179296}
    180297
    181298
    182299# copies a file or a group of files
    183 sub cp {
    184     my $dest = pop (@_);
    185     my (@srcfiles) = @_;
    186 
    187     # remove trailing slashes from source and destination files
    188     $dest =~ s/[\\\/]+$//;
    189     map {$_ =~ s/[\\\/]+$//;} @srcfiles;
    190 
    191     # a few sanity checks
    192     if (scalar (@srcfiles) == 0) {
    193     print STDERR "util::cp no destination directory given\n";
    194     return;
    195     } elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) {
    196     print STDERR "util::cp if multiple source files are given the ".
    197         "destination must be a directory\n";
    198     return;
    199     }
    200 
    201     # copy the files
    202     foreach my $file (@srcfiles) {
    203     my $tempdest = $dest;
    204     if (-d $tempdest) {
    205         my ($filename) = $file =~ /([^\\\/]+)$/;
    206         $tempdest .= "/$filename";
    207     }
    208     if (!-e $file) {
    209         print STDERR "util::cp $file does not exist\n";
    210     } elsif (!-f $file) {
    211         print STDERR "util::cp $file is not a plain file\n";
    212     } else {
    213         &File::Copy::copy ($file, $tempdest);
    214     }
    215     }
     300sub cp
     301{
     302  my $dest = pop (@_);
     303  my (@srcfiles) = @_;
     304
     305  # remove trailing slashes from source and destination files
     306  $dest =~ s/[\\\/]+$//;
     307  map {$_ =~ s/[\\\/]+$//;} @srcfiles;
     308
     309  # a few sanity checks
     310  if (scalar (@srcfiles) == 0)
     311  {
     312    print STDERR "util::cp no destination directory given\n";
     313    return 0;
     314  }
     315  elsif ((scalar (@srcfiles) > 1) && (!&util::dir_exists($dest)))
     316  {
     317    print STDERR "util::cp if multiple source files are given the destination must be a directory\n";
     318    return 0;
     319  }
     320
     321  # copying a file into or within HDFS
     322  if (&util::isHDFS($dest))
     323  {
     324    foreach my $src (@srcfiles)
     325    {
     326      &util::executeHDFSCommand('put', $src, $dest);
     327      &util::rm_r($src);
     328    }
     329    return;
     330  }
     331
     332  # copy the files
     333  foreach my $file (@srcfiles)
     334  {
     335    my $tempdest = $dest;
     336    if (&util::dir_exists($tempdest))
     337    {
     338      my ($filename) = $file =~ /([^\\\/]+)$/;
     339      $tempdest .= "/$filename";
     340    }
     341    if (!&util::file_exists($file))
     342    {
     343      if (&util::dir_exists($file))
     344      {
     345        print STDERR "util::cp $file is not a plain file\n";
     346      }
     347      else
     348      {
     349        print STDERR "util::cp $file does not exist\n";
     350      }
     351    }
     352    elsif (&util::isHDFS($file))
     353    {
     354      &util::executeHDFSCommand('get', $file, $dest);
     355    }
     356    else
     357    {
     358      &File::Copy::copy ($file, $tempdest);
     359    }
     360  }
    216361}
    217362
     
    393538}
    394539
    395 sub mk_dir {
    396     my ($dir) = @_;
    397 
     540# /** @function mk_dir()
     541#  *  Extend mkdir to allow it to silently fail in the case where code is
     542#  *  'competing' to create a directory first.
     543#  *  @param   $dir  the full path of the directory to create
     544#  *  @param   $can_fail  1 if the mkdir can fail silently (optional)
     545#  *  @return  1 on success, 0 on failure
     546#  */
     547sub mk_dir
     548{
     549  my ($dir, $can_fail) = @_;
     550  my $mkdir_ok = 0;
     551  if (&util::isHDFS($dir))
     552  {
     553    # unhelpfully HDFS mkdir returns 0 on success, -1 on failure
     554    my $result = &util::executeHDFSCommand('mkdir', $dir);
     555    if ($result == 0)
     556    {
     557      $mkdir_ok = 1;
     558    }
     559  }
     560  else
     561  {
    398562    my $store_umask = umask(0002);
    399563    my $mkdir_ok = mkdir ($dir, 0777);
    400564    umask($store_umask);
    401    
    402     if (!$mkdir_ok)
    403     {
    404     print STDERR "util::mk_dir could not create directory $dir\n";
    405     return;
    406     }
     565  }
     566  # only output an error if this call wasn't marked as can_fail
     567  if (!$mkdir_ok && (!defined $can_fail || !$can_fail))
     568  {
     569    print STDERR "util::mk_dir could not create directory: $dir\n error: $!\n";
     570  }
     571  return $mkdir_ok;
    407572}
    408573
     
    410575# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
    411576# slightly faster (surprisingly) - Stefan.
    412 sub mk_all_dir {
    413     my ($dir) = @_;
    414 
    415     # use / for the directory separator, remove duplicate and
    416     # trailing slashes
    417     $dir=~s/[\\\/]+/\//g;
    418     $dir=~s/[\\\/]+$//;
    419 
    420     # make sure the cache directory exists
    421     my $dirsofar = "";
    422     my $first = 1;
    423     foreach my $dirname (split ("/", $dir)) {
    424     $dirsofar .= "/" unless $first;
    425     $first = 0;
    426 
    427     $dirsofar .= $dirname;
    428 
    429     next if $dirname =~ /^(|[a-z]:)$/i;
    430     if (!-e $dirsofar)
    431         {
    432         my $store_umask = umask(0002);
    433         my $mkdir_ok = mkdir ($dirsofar, 0777);
    434         umask($store_umask);
    435         if (!$mkdir_ok)
    436         {
    437             print STDERR "util::mk_all_dir could not create directory $dirsofar\n";
    438             return;
    439         }
    440         }
    441     }
     577sub mk_all_dir
     578{
     579  my ($dir) = @_;
     580
     581  ###rint "-> util::mk_all_dir($dir)\n";
     582
     583  # support for HDFS
     584  if (&util::isHDFS($dir))
     585  {
     586    # HDFS's version of mkdir does it recursively anyway
     587    my $result = &util::executeHDFSCommand('mkdir', $dir);
     588    return ($result == 0);
     589  }
     590
     591  # use / for the directory separator, remove duplicate and
     592  # trailing slashes
     593  $dir=~s/[\\\/]+/\//g;
     594  $dir=~s/[\\\/]+$//;
     595
     596  # ensure the directory doesn't already exist
     597  if (-e $dir)
     598  {
     599    return 0;
     600  }
     601
     602  # make sure the cache directory exists
     603  my $dirsofar = "";
     604  my $first = 1;
     605  foreach my $dirname (split ("/", $dir))
     606  {
     607    $dirsofar .= "/" unless $first;
     608    $first = 0;
     609
     610    $dirsofar .= $dirname;
     611
     612    next if $dirname =~ /^(|[a-z]:)$/i;
     613    if (!-e $dirsofar)
     614    {
     615      my $store_umask = umask(0002);
     616      my $mkdir_ok = mkdir ($dirsofar, 0777);
     617      umask($store_umask);
     618      if (!$mkdir_ok)
     619      {
     620        print STDERR "util::mk_all_dir could not create directory $dirsofar\n";
     621        return 0;
     622      }
     623    }
     624  }
     625  return (-e $dir);
    442626}
    443627
     
    446630    my ($src, $dest, $verbosity) = @_;
    447631
     632    print "&util::hard_link( $src, $dest, $verbosity)\n";
     633
    448634    # remove trailing slashes from source and destination files
    449635    $src =~ s/[\\\/]+$//;
     
    452638##    print STDERR "**** src = ", unicode::debug_unicode_string($src),"\n";
    453639    # a few sanity checks
    454     if (-e $dest) {
     640    if (&util::file_exists($dest)) {
    455641    # destination file already exists
    456642    return;
    457643    }
    458     elsif (!-e $src) {
     644    elsif (!&util::file_exists($src)) {
    459645    print STDERR "util::hard_link source file \"$src\" does not exist\n";
    460646    return 1;
    461647    }
    462     elsif (-d $src) {
     648    elsif (!&util::file_exists($src) && &util::dir_exists($src)) {
    463649    print STDERR "util::hard_link source \"$src\" is a directory\n";
    464650    return 1;
     
    466652
    467653    my $dest_dir = &File::Basename::dirname($dest);
    468     mk_all_dir($dest_dir) if (!-e $dest_dir);
    469 
     654    if (!&util::dir_exists($dest_dir))
     655    {
     656      mk_all_dir($dest_dir);
     657    }
     658
     659    # HDFS Support - we can't ever link, copy instead
     660    if (&util::isHDFS($src))
     661    {
     662      if (&util::isHDFS($dest))
     663      {
     664        &util::executeHDFSCommand('put', $src, $dest);
     665        return 0;
     666      }
     667      else
     668      {
     669        &util::executeHDFSCommand('get', $src, $dest);
     670        return 0;
     671      }
     672    }
     673    elsif (&util::isHDFS($dest))
     674    {
     675      &util::executeHDFSCommand('put', $src, $dest);
     676      return 0;
     677    }
    470678
    471679    if (!link($src, $dest)) {
     
    553761}
    554762
    555 
    556763sub fd_exists
    557764{
     
    563770
    564771    my $exists = 0;
     772
     773    # Support for HDFS [jmt12]
     774    if (&util::isHDFS($filename_full_path))
     775    {
     776      # very limited test op support for HDFS
     777      if ($test_op ne '-d' && $test_op ne '-e' && $test_op ne '-z')
     778      {
     779        $test_op = '-e';
     780      }
     781      my $result = &util::executeHDFSCommand('test ' . $test_op, $filename_full_path);
     782      return ($result == 0);
     783    }
    565784
    566785    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
     
    587806{
    588807    my ($filename_full_path) = @_;
    589 
     808#    if ($filename_full_path =~ /^hdfs:/)
     809#    {
     810#      print "-> util::file_exists(" . $filename_full_path . ")\n";
     811#    }
    590812    return fd_exists($filename_full_path,"-f");
    591813}
     
    594816{
    595817    my ($filename_full_path) = @_;
    596 
     818#    if ($filename_full_path =~ /^hdfs:/)
     819#    {
     820#      print "-> util::dir_exists(" . $filename_full_path . ")\n";
     821#    }
    597822    return fd_exists($filename_full_path,"-d");
    598823}
     
    8371062    my $filename = shift (@_);
    8381063
    839     # need to put single backslash back to double so that regex works
    840     if ($ENV{'GSDLOS'} =~ /^windows$/i) {
    841     $filename =~ s/\\/\\\\/g;
    842     }
     1064    # need to make single backslashes double so that regex works
     1065    $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);   
     1066   
     1067    # note that the first part of a substitution is a regex, so RE chars need to be escaped,
     1068    # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
     1069    $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
     1070    $filename =~ s@\(@\\(@g; # escape brackets
     1071    $filename =~ s@\)@\\)@g; # escape brackets
     1072    $filename =~ s@\[@\\[@g; # escape brackets
     1073    $filename =~ s@\]@\\]@g; # escape brackets
     1074   
     1075    return $filename;
     1076}
     1077
     1078sub unregex_filename {
     1079    my $filename = shift (@_);
     1080
     1081    # need to put doubled backslashes for regex back to single
     1082    $filename =~ s/\\\./\./g; # remove RE syntax for .
     1083    $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
     1084    $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
     1085    $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
     1086    $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
     1087   
     1088    # \\ goes to \
     1089    # This is the last step in reverse mirroring the order of steps in filename_to_regex()
     1090    $filename =~ s/\\\\/\\/g; # remove RE syntax for \   
    8431091    return $filename;
    8441092}
     
    8691117    $filename =~ s/[\\\/]$//;
    8701118
     1119    # restore protocols if present [jmt12]
     1120    $filename =~ s/(file|hdfs|https?):\/([^\/])/$1:\/\/$2/g;
     1121
    8711122    return $filename;
    8721123}
     
    9021153}
    9031154
    904 my $oid_warned_about_periods = 0;
    9051155
    9061156sub tidy_up_oid {
    907     my ($OID, $verbosity) = @_;
    908     if (!defined $verbosity)
    909     {
    910       $verbosity = 2;
    911     }
     1157    my ($OID) = @_;
    9121158    if ($OID =~ /\./) {
    913       if (!$oid_warned_about_periods)
    914       {
    915         print STDERR "Warning, identifier $OID contains periods (.), removing them for this and future documents\n";
    916         $oid_warned_about_periods = 1;
    917       }
     1159    print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
    9181160    $OID =~ s/\.//g; #remove any periods
    9191161    }
     
    9351177
    9361178    # do not prepend any value/path that's already in the environment variable
    937     if ($ENV{'GSDLOS'} =~ /^windows$/i)
    938     {
    939     my $escaped_val = $val;
    940     $escaped_val =~ s/\\/\\\\/g; # escape any Windows backslashes for upcoming regex
    941     if (!defined($ENV{$var})) {
    942         $ENV{$var} = "$val";
    943     }
    944     elsif($ENV{$var} !~ m/$escaped_val/) {
    945         $ENV{$var} = "$val;".$ENV{$var};
    946     }
    947     }
    948     else {
    949     if (!defined($ENV{$var})) {
    950         $ENV{$var} = "$val";
    951     }
    952     elsif($ENV{$var} !~ m/$val/) {
    953         $ENV{$var} = "$val:".$ENV{$var};
    954     }
     1179   
     1180    my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
     1181    if (!defined($ENV{$var})) {
     1182    $ENV{$var} = "$val";
     1183    }
     1184    elsif($ENV{$var} !~ m/$escaped_val/) {
     1185    $ENV{$var} = "$val;".$ENV{$var};
    9551186    }
    9561187}
     
    9581189sub envvar_append {
    9591190    my ($var,$val) = @_;
    960 
     1191   
    9611192    # do not append any value/path that's already in the environment variable
    962     if ($ENV{'GSDLOS'} =~ /^windows$/i)
    963     {
    964     my $escaped_val = $val;
    965     $escaped_val =~ s/\\/\\\\/g; # escape any Windows backslashes for upcoming regex
    966     if (!defined($ENV{$var})) {
    967         $ENV{$var} = "$val";
    968     }
    969     elsif($ENV{$var} !~ m/$escaped_val/) {
    970         $ENV{$var} .= ";$val";
    971     }
    972     }
    973     else {
    974     if (!defined($ENV{$var})) {
    975         $ENV{$var} = "$val";
    976     }
    977     elsif($ENV{$var} !~ m/$val/) {
    978         $ENV{$var} .= ":$val";
    979     }
    980     }   
     1193   
     1194    my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
     1195    if (!defined($ENV{$var})) {
     1196    $ENV{$var} = "$val";
     1197    }
     1198    elsif($ENV{$var} !~ m/$escaped_val/) {
     1199    $ENV{$var} .= ";$val";
     1200    }
    9811201}
    9821202
     
    10471267}
    10481268
    1049 
     1269# If filename is relative to within_dir, returns the relative path of filename to that directory
     1270# with slashes in the filename returned as they were in the original (absolute) filename.
    10501271sub filename_within_directory
    10511272{
     
    10571278    }
    10581279   
    1059     $within_dir =~ s/\\/\\\\/g; # escape DOS style file separator
    1060 
     1280    $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets   
    10611281    if ($filename =~ m/^$within_dir(.*)$/) {
    10621282    $filename = $1;
     
    10651285    return $filename;
    10661286}
     1287
     1288# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
     1289# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
     1290# The subpath returned will also be a URL type filename.
     1291sub filename_within_directory_url_format
     1292{
     1293    my ($filename,$within_dir) = @_;
     1294   
     1295    # convert parameters only to / slashes if Windows
     1296   
     1297    my $filename_urlformat = &filepath_to_url_format($filename);
     1298    my $within_dir_urlformat = &filepath_to_url_format($within_dir);
     1299
     1300    #if ($within_dir_urlformat !~ m/\/$/) {
     1301        # make sure directory ends with a slash
     1302        #$within_dir_urlformat .= "/";
     1303    #}
     1304   
     1305    my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
     1306   
     1307    #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
     1308   
     1309    # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
     1310    if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
     1311        $filename_urlformat = $1;
     1312    }
     1313   
     1314    return $filename_urlformat;
     1315}
     1316
     1317# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
     1318# since on Linux it doesn't represent a file separator but an escape char).
     1319sub filepath_to_url_format
     1320{
     1321    my ($filepath) = @_;
     1322    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     1323        # Only need to worry about Windows, as Unix style directories already in url-format
     1324        # Convert Windows style \ => /
     1325        $filepath =~ s@\\@/@g;     
     1326    }
     1327    return $filepath;
     1328}
     1329
     1330# regex filepaths on windows may include \\ as path separator. Convert \\ to /
     1331sub filepath_regex_to_url_format
     1332{
     1333    my ($filepath) = @_;
     1334    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     1335    # Only need to worry about Windows, as Unix style directories already in url-format
     1336    # Convert Windows style \\ => /
     1337    $filepath =~ s@\\\\@/@g;       
     1338    }
     1339    return $filepath;
     1340   
     1341}
     1342
     1343# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
     1344# and ignores trailing /
     1345# returns (file, dirs) dirs will be empty if no subdirs
     1346sub url_fileparse
     1347{
     1348    my ($filepath) = @_;
     1349    # remove trailing /
     1350    $filepath =~ s@/$@@;
     1351    if ($filepath !~ m@/@) {
     1352    return ($filepath, "");
     1353    }
     1354    my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
     1355    return ($file, $dirs);
     1356   
     1357}
     1358
    10671359
    10681360sub filename_within_collection
     
    11851477{
    11861478    my ($filename) = @_;
     1479
     1480    # support for explicit protocol prefixes, for example file: or hdfs:,
     1481    # which must be absolute [jmt12]
     1482    if ($filename =~ /^\w+:\/\//)
     1483    {
     1484      return 1;
     1485    }
    11871486
    11881487    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     
    15021801}
    15031802
     1803sub os_dir() {
     1804   
     1805    my $gsdlarch = "";
     1806    if(defined $ENV{'GSDLARCH'}) {
     1807    $gsdlarch = $ENV{'GSDLARCH'};
     1808    }
     1809    return $ENV{'GSDLOS'}.$gsdlarch;
     1810}
    15041811
    15051812# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
     
    15191826    } else {
    15201827    $defaultUrlprefix = "/greenstone";
    1521     $configfile = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "gsdlsite.cfg");
     1828    $configfile = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
    15221829    push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
    15231830    }
     
    16431950}
    16441951
     1952# returns the path to the java command in the JRE included with GS (if any),
     1953# quoted to safeguard any spaces in this path, otherwise a simple java
     1954# command is returned which assumes and will try for a system java.
     1955sub get_java_command {
     1956    my $java = "java";
     1957    if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
     1958                               # after running setup.bat or from GLI which also runs setup.bat
     1959    my $java_bin = &util::filename_cat($ENV{'GSDLHOME'},"packages","jre","bin");
     1960    if(-d $java_bin) {
     1961        $java = &util::filename_cat($java_bin,"java");
     1962        $java = "\"".$java."\""; # quoted to preserve spaces in path
     1963    }
     1964    }
     1965    return $java;
     1966}
     1967
     1968
     1969# Given the qualified collection name (colgroup/collection),
     1970# returns the collection and colgroup parts
     1971sub get_collection_parts {
     1972    # http://perldoc.perl.org/File/Basename.html
     1973    # my($filename, $directories, $suffix) = fileparse($path);
     1974    # "$directories contains everything up to and including the last directory separator in the $path
     1975    # including the volume (if applicable). The remainder of the $path is the $filename."
     1976    #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);   
     1977
     1978    my $qualified_collection = shift(@_);
     1979
     1980    # Since activate.pl can be launched from the command-line, including by a user,
     1981    # best not to assume colgroup uses URL-style slashes as would be the case with GLI
     1982    # Also allow for the accidental inclusion of multiple slashes
     1983    my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
     1984   
     1985    if(!defined $collection) {
     1986        $collection = $colgroup;
     1987        $colgroup = "";
     1988    }
     1989    return ($collection, $colgroup);
     1990}
     1991
     1992# work out the "collectdir/collection" location
     1993sub resolve_collection_dir {
     1994    my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
     1995   
     1996    my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);   
     1997   
     1998    if (defined $collect_dir) {
     1999        return &util::filename_cat($collect_dir,$colgroup, $collection);
     2000    }
     2001    else {
     2002        if (defined $site) {
     2003            return &util::filename_cat($ENV{'GSDL3HOME'},"sites",$site,"collect",$colgroup, $collection);
     2004        }
     2005        else {
     2006            return &util::filename_cat($ENV{'GSDLHOME'},"collect",$colgroup, $collection);
     2007        }
     2008    }
     2009}
     2010
     2011# ====== John's new fuctions to support HDFS ======
     2012
     2013# /**
     2014#  *  Executes a HDFS command without caring about the resulting output while
     2015#  *  still reacting appropriately to failed executions.
     2016#  */
     2017sub executeHDFSCommand
     2018{
     2019  my $action = shift(@_);
     2020  my $command = 'hadoop fs -' . $action . ' "' . join('" "', @_) . '"';
     2021  my $result = `$command 2>&1`;
     2022  my $return_value = $?;
     2023  ###rint STDERR "-> util::executeHDFSCommand('" . $command . "') => " . $return_value . "\n";
     2024  return $return_value;
     2025}
     2026
     2027# /** @function file_canread()
     2028#  */
     2029sub file_canread
     2030{
     2031  my ($filename_full_path) = @_;
     2032  # the HDFS support doesn't have '-r' so it will revert to '-e'
     2033  return fd_exists($filename_full_path,"-r");
     2034}
     2035# /** file_canread() **/
     2036
     2037sub file_openfdcommand
     2038{
     2039  my ($filename_full_path, $mode) = @_;
     2040  ##rint STDERR "-> util::file_openfdcommand('" . $filename_full_path . "', '" . $mode . "')\n";
     2041  # I'll set to read by default, as that is less destructive to precious files
     2042  # on your system...
     2043  if (!defined $mode)
     2044  {
     2045    $mode = '<';
     2046  }
     2047  my $open_fd_command = $mode . $filename_full_path;
     2048  if (&util::isHDFS($filename_full_path))
     2049  {
     2050    # currently don't really support append, but might be able to do something
     2051    # like:
     2052    # hadoop fs -cat /user/username/folder/csv1.csv \
     2053    #   /user/username/folder/csv2.csv | hadoop fs -put - \
     2054    #   /user/username/folder/output.csv
     2055    if ($mode eq '>>' || $mode eq '>')
     2056    {
     2057      # if the file already exists, put won't clobber it like a proper write
     2058      # would, so try to delete it (can fail)
     2059      if (&util::file_exists($filename_full_path))
     2060      {
     2061        &util::rm($filename_full_path);
     2062      }
     2063      # then create the command
     2064      $open_fd_command = '| ' . &util::generateHDFSCommand('put', '-', $filename_full_path);
     2065    }
     2066    else
     2067    {
     2068      $open_fd_command = &util::generateHDFSCommand('cat', $filename_full_path) . ' |';
     2069    }
     2070  }
     2071  return $open_fd_command;
     2072}
     2073
     2074# /** @function file_readdir()
     2075#  *  Provide a function to return the files within a directory that is aware
     2076#  *  of protocols other than file://
     2077#  *  @param $dirname  the full path to the directory
     2078#  *  @param $dir_ref  a reference to an array to populate with files
     2079#  */
     2080sub file_readdir
     2081{
     2082  my ($dirname, $dir_ref) = @_;
     2083  my $dir_read = 0;
     2084  if (&util::isHDFS($dirname))
     2085  {
     2086    my $hdfs_command = &util::generateHDFSCommand('ls', $dirname);
     2087    my $result = `$hdfs_command 2>&1`;
     2088    my @lines = split(/\r?\n/, $result);
     2089    foreach my $line (@lines)
     2090    {
     2091      if ($line =~ /\/([^\/]+)$/)
     2092      {
     2093        my $file = $1;
     2094        push(@{$dir_ref}, $file);
     2095      }
     2096    }
     2097    $dir_read = 1;
     2098  }
     2099  elsif (opendir(DIR, $dirname))
     2100  {
     2101    my @dirs = readdir(DIR);
     2102    push(@{$dir_ref}, @dirs);
     2103    closedir(DIR);
     2104    $dir_read = 1;
     2105  }
     2106  return $dir_read;
     2107}
     2108# /** file_readdir() **/
     2109
     2110# /**
     2111#  */
     2112sub file_lastmodified
     2113{
     2114  my ($filename_full_path) = @_;
     2115  my $last_modified = 0;
     2116  if (&util::isHDFS($filename_full_path))
     2117  {
     2118    my $file_stats = file_stats($filename_full_path);
     2119    my $mod_date = $file_stats->{'modification_date'};
     2120    my $mod_time = $file_stats->{'modification_time'};
     2121    # Last modified should be in number of days (as a float) since last
     2122    # modified - but I'll just return 0 for now
     2123    $last_modified = 0.00;
     2124  }
     2125  else
     2126  {
     2127    $last_modified = -M $filename_full_path;
     2128  }
     2129  return $last_modified;
     2130}
     2131# /** file_lastmodified() **/
     2132
     2133# /** @function file_size
     2134#  *  Replacement for "-s" in Greenstone buildtime, as we need a version that
     2135#  *  HDFS aware
     2136#  */
     2137sub file_size
     2138{
     2139  my ($filename_full_path) = @_;
     2140  my $size = 0;
     2141  if (&util::isHDFS($filename_full_path))
     2142  {
     2143    my $file_stats = file_stats($filename_full_path);
     2144    $size = $file_stats->{'filesize'};
     2145  }
     2146  else
     2147  {
     2148    $size = -s $filename_full_path;
     2149  }
     2150  return $size;
     2151}
     2152# /** file_size() **/
     2153
     2154sub file_stats
     2155{
     2156  my ($filename_full_path) = @_;
     2157  my $stats = {};
     2158  if (&util::isHDFS($filename_full_path))
     2159  {
     2160    # - LS is the only way to get these details from HDFS (-stat doesn't
     2161    #   provide enough information)
     2162    my $hdfs_command = &util::generateHDFSCommand('ls', $filename_full_path);
     2163    my $result = `$hdfs_command 2>&1`;
     2164    # - parse the results
     2165    if ($result =~ /([ds\-][rwx\-]+)\s+(\d+)\s+([^\s]+)\s+([^\s]+)\s+(\d+)\s+(\d\d\d\d-\d\d-\d\d)\s+(\d\d:\d\d)\s+([^\s]+)$/)
     2166    {
     2167      $stats->{'filename'} = $8;
     2168      $stats->{'replicas'} = $2;
     2169      $stats->{'filesize'} = $5;
     2170      $stats->{'modification_date'} = $6;
     2171      $stats->{'modification_time'} = $7;
     2172      $stats->{'permissions'} = $1;
     2173      $stats->{'userid'} = $3;
     2174      $stats->{'groupid'} = $4;
     2175    }
     2176    else
     2177    {
     2178      die("Error! Failed to parse HDFS ls result: " . $result . "\n");
     2179    }
     2180  }
     2181  return $stats;
     2182}
     2183# /** file_stats() **/
     2184
     2185# /**
     2186#  */
     2187sub generateHDFSCommand
     2188{
     2189  my $flags = shift(@_);
     2190  return 'hadoop fs -' . $flags . ' "' . join('" "', @_) . '"';
     2191}
     2192# /** generateHDFSCommand() **/
     2193
     2194# /** @function isHDFS()
     2195#  *  Determine if the given path exists within a HDFS system by checking for
     2196#  *  the expected protocol prefix.
     2197#  *  @param $full_path  the path to check
     2198#  *  @return 1 if within HDFS, 0 otherwise
     2199#  */
     2200sub isHDFS
     2201{
     2202  my ($full_path) = @_;
     2203  return (lc(substr($full_path, 0, 7)) eq 'hdfs://');
     2204}
     2205# /** isHDFS() **/
    16452206
    164622071;
Note: See TracChangeset for help on using the changeset viewer.