Changeset 26961

Show
Ignore:
Timestamp:
26.02.2013 11:38:47 (7 years ago)
Author:
jmt12
Message:

Several file handling functions extended to be aware of and support HDFS files. This means noticing paths starting with  hdfs:// protocol (and preserving that protocol through filename_cat() etc) and calling 'hadoop fs <action>' to handle them appropriately. The most tricksie functions are those that open filehandles and get file status

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/perllib/util.pm

    r24626 r26961  
    4646    # and are regular files 
    4747    foreach my $file (@files) { 
     48 
     49      if (&util::isHDFS($file)) 
     50      { 
     51        &util::executeHDFSCommand('rm', $file); 
     52      } 
     53      else 
     54      { 
    4855    if (!-e $file) { 
    4956        print STDERR "util::rm $file does not exist\n"; 
     
    5360        push (@filefiles, $file); 
    5461    } 
    55     } 
    56      
     62      } 
     63    } 
     64 
    5765    # remove the files 
    5866    my $numremoved = unlink @filefiles; 
     
    96104    # recursively remove the files 
    97105    foreach my $file (@files_array) { 
     106 
     107      # HDFS support 
     108      if (&util::isHDFS($file)) 
     109      { 
     110        # HDFS doesn't really lend itself to a choosy delete, unless you want 
     111        # it to be really, really, unbearably slow. 
     112        &util::executeHDFSCommand('rmr', $file); 
     113        next; 
     114      } 
     115 
     116 
    98117    $file =~ s/[\/\\]+$//; # remove trailing slashes 
    99118     
     
    133152 
    134153# recursive removal 
    135 sub rm_r { 
    136     my (@files) = @_; 
    137      
    138     # use the more general (but reterospectively written function 
    139     # filtered_rm_r function() 
    140  
    141     filtered_rm_r(\@files,undef,undef); # no accept or reject expressions 
     154sub rm_r 
     155{ 
     156  my (@files) = @_; 
     157  # use the more general (but reterospectively written function 
     158  # filtered_rm_r function() 
     159  filtered_rm_r(\@files,undef,undef); # no accept or reject expressions 
    142160} 
    143161 
     
    149167    my $dest = pop (@_); 
    150168    my (@srcfiles) = @_; 
     169 
     170    # moving a file within or into HDFS 
     171    if (&util::isHDFS($dest)) 
     172    { 
     173      foreach my $src (@srcfiles) 
     174      { 
     175        if (&util::isHDFS($src)) 
     176        { 
     177          &util::executeHDFSCommand('mv', $src, $dest); 
     178        } 
     179        else 
     180        { 
     181          &util::executeHDFSCommand('put', $src, $dest); 
     182          &util::rm_r($src); 
     183        } 
     184      } 
     185      return; 
     186    } 
    151187 
    152188    # remove trailing slashes from source and destination files 
     
    166202    # move the files 
    167203    foreach my $file (@srcfiles) { 
     204 
     205      # moving a file out of HDFS 
     206      if (&util::isHDFS($file)) 
     207      { 
     208        &util::executeHDFSCommand('get', $file, $dest); 
     209        &util::rm_r($file); 
     210        next; 
     211      } 
     212 
    168213    my $tempdest = $dest; 
    169214    if (-d $tempdest) { 
     
    174219        print STDERR "util::mv $file does not exist\n"; 
    175220    } else { 
    176         rename ($file, $tempdest); 
    177     } 
    178     } 
     221        if(!rename ($file, $tempdest)) { 
     222        print STDERR "**** Failed to rename $file to $tempdest\n"; 
     223        &File::Copy::copy($file, $tempdest); 
     224        &rm($file); 
     225        }  
     226        elsif(-e $file) { # rename (partially) succeeded) but srcfile still exists after rename 
     227        #print STDERR "*** srcfile $file still exists after rename to $tempdest\n"; 
     228        if(!-e $tempdest) { 
     229            print STDERR "@@@@ ERROR: $tempdest does not exist\n"; 
     230        } 
     231        # Sometimes the rename operation fails (as does File::Copy::move). 
     232        # This turns out to be because the files are hardlinked.  
     233        # Need to do a copy-delete in this case, however, the copy step is not necessary:  
     234        # the srcfile got renamed into tempdest, but srcfile itself still exists, delete it. 
     235        #&File::Copy::copy($file, $tempdest); 
     236 
     237        &rm($file);      
     238        } 
     239    } 
     240    } 
     241} 
     242 
     243# Move the contents of source directory into target directory 
     244# (as opposed to merely replacing target dir with the src dir) 
     245# This can overwrite any files with duplicate names in the target 
     246# but other files and folders in the target will continue to exist 
     247sub mv_dir_contents { 
     248    my ($src_dir, $dest_dir) = @_; 
     249     
     250    # Obtain listing of all files within src_dir 
     251    # Note that readdir lists relative paths, as well as . and .. 
     252    opendir(DIR, "$src_dir"); 
     253    my @files= readdir(DIR);  
     254    close(DIR); 
     255     
     256    my @full_path_files = (); 
     257    foreach my $file (@files) {  
     258        # process all except . and .. 
     259        unless($file eq "." || $file eq "..") { 
     260             
     261            my $dest_subdir = &filename_cat($dest_dir, $file); # $file is still a relative path 
     262         
     263            # construct absolute paths 
     264            $file = &filename_cat($src_dir, $file); # $file is now an absolute path 
     265             
     266            # Recurse on directories which have an equivalent in target dest_dir 
     267            # If $file is a directory that already exists in target $dest_dir,  
     268            # then a simple move operation will fail (definitely on Windows). 
     269            if(-d $file && -d $dest_subdir) {  
     270                #print STDERR "**** $file is a directory also existing in target, its contents to be copied to $dest_subdir\n"; 
     271                &mv_dir_contents($file, $dest_subdir); 
     272                 
     273                # now all content is moved across, delete empty dir in source folder 
     274                if(&is_dir_empty($file)) { 
     275                    if (!rmdir $file) { 
     276                        print STDERR "ERROR. util::mv_dir_contents couldn't remove directory $file\n"; 
     277                    }  
     278                } else { # error 
     279                    print STDERR "ERROR. util::mv_dir_contents: subfolder $file still non-empty after moving contents to $dest_subdir\n"; 
     280                } 
     281            } else { # process files and any directories that don't already exist with a simple move 
     282                push(@full_path_files, $file); 
     283            }            
     284        } 
     285    } 
     286     
     287    if(!&dir_exists($dest_dir)) { # create target toplevel folder or subfolders if they don't exist 
     288        &mk_dir($dest_dir); 
     289    } 
     290 
     291    #print STDERR "@@@@@ Copying files |".join(",", @full_path_files)."| to: $dest_dir\n"; 
     292 
     293    if(@full_path_files) { # if non-empty, there's something to copy across 
     294        &mv(@full_path_files, $dest_dir); 
     295    } 
    179296} 
    180297 
    181298 
    182299# copies a file or a group of files 
    183 sub cp { 
    184     my $dest = pop (@_); 
    185     my (@srcfiles) = @_; 
    186  
    187     # remove trailing slashes from source and destination files 
    188     $dest =~ s/[\\\/]+$//; 
    189     map {$_ =~ s/[\\\/]+$//;} @srcfiles; 
    190  
    191     # a few sanity checks 
    192     if (scalar (@srcfiles) == 0) { 
    193     print STDERR "util::cp no destination directory given\n"; 
    194     return; 
    195     } elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) { 
    196     print STDERR "util::cp if multiple source files are given the ". 
    197         "destination must be a directory\n"; 
    198     return; 
    199     } 
    200  
    201     # copy the files 
    202     foreach my $file (@srcfiles) { 
    203     my $tempdest = $dest; 
    204     if (-d $tempdest) { 
    205         my ($filename) = $file =~ /([^\\\/]+)$/; 
    206         $tempdest .= "/$filename"; 
    207     } 
    208     if (!-e $file) { 
    209         print STDERR "util::cp $file does not exist\n"; 
    210     } elsif (!-f $file) { 
    211         print STDERR "util::cp $file is not a plain file\n"; 
    212     } else { 
    213         &File::Copy::copy ($file, $tempdest); 
    214     } 
    215     } 
     300sub cp 
     301{ 
     302  my $dest = pop (@_); 
     303  my (@srcfiles) = @_; 
     304 
     305  # remove trailing slashes from source and destination files 
     306  $dest =~ s/[\\\/]+$//; 
     307  map {$_ =~ s/[\\\/]+$//;} @srcfiles; 
     308 
     309  # a few sanity checks 
     310  if (scalar (@srcfiles) == 0) 
     311  { 
     312    print STDERR "util::cp no destination directory given\n"; 
     313    return 0; 
     314  } 
     315  elsif ((scalar (@srcfiles) > 1) && (!&util::dir_exists($dest))) 
     316  { 
     317    print STDERR "util::cp if multiple source files are given the destination must be a directory\n"; 
     318    return 0; 
     319  } 
     320 
     321  # copying a file into or within HDFS 
     322  if (&util::isHDFS($dest)) 
     323  { 
     324    foreach my $src (@srcfiles) 
     325    { 
     326      &util::executeHDFSCommand('put', $src, $dest); 
     327      &util::rm_r($src); 
     328    } 
     329    return; 
     330  } 
     331 
     332  # copy the files 
     333  foreach my $file (@srcfiles) 
     334  { 
     335    my $tempdest = $dest; 
     336    if (&util::dir_exists($tempdest)) 
     337    { 
     338      my ($filename) = $file =~ /([^\\\/]+)$/; 
     339      $tempdest .= "/$filename"; 
     340    } 
     341    if (!&util::file_exists($file)) 
     342    { 
     343      if (&util::dir_exists($file)) 
     344      { 
     345        print STDERR "util::cp $file is not a plain file\n"; 
     346      } 
     347      else 
     348      { 
     349        print STDERR "util::cp $file does not exist\n"; 
     350      } 
     351    } 
     352    elsif (&util::isHDFS($file)) 
     353    { 
     354      &util::executeHDFSCommand('get', $file, $dest); 
     355    } 
     356    else 
     357    { 
     358      &File::Copy::copy ($file, $tempdest); 
     359    } 
     360  } 
    216361} 
    217362 
     
    393538} 
    394539 
    395 sub mk_dir { 
    396     my ($dir) = @_; 
    397  
     540# /** @function mk_dir() 
     541#  *  Extend mkdir to allow it to silently fail in the case where code is 
     542#  *  'competing' to create a directory first. 
     543#  *  @param   $dir  the full path of the directory to create 
     544#  *  @param   $can_fail  1 if the mkdir can fail silently (optional) 
     545#  *  @return  1 on success, 0 on failure 
     546#  */ 
     547sub mk_dir 
     548{ 
     549  my ($dir, $can_fail) = @_; 
     550  my $mkdir_ok = 0; 
     551  if (&util::isHDFS($dir)) 
     552  { 
     553    # unhelpfully HDFS mkdir returns 0 on success, -1 on failure 
     554    my $result = &util::executeHDFSCommand('mkdir', $dir); 
     555    if ($result == 0) 
     556    { 
     557      $mkdir_ok = 1; 
     558    } 
     559  } 
     560  else 
     561  { 
    398562    my $store_umask = umask(0002); 
    399563    my $mkdir_ok = mkdir ($dir, 0777); 
    400564    umask($store_umask); 
    401      
    402     if (!$mkdir_ok)  
    403     { 
    404     print STDERR "util::mk_dir could not create directory $dir\n"; 
    405     return; 
    406     } 
     565  } 
     566  # only output an error if this call wasn't marked as can_fail 
     567  if (!$mkdir_ok && (!defined $can_fail || !$can_fail)) 
     568  { 
     569    print STDERR "util::mk_dir could not create directory: $dir\n error: $!\n"; 
     570  } 
     571  return $mkdir_ok; 
    407572} 
    408573 
     
    410575# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently 
    411576# slightly faster (surprisingly) - Stefan. 
    412 sub mk_all_dir { 
    413     my ($dir) = @_; 
    414  
    415     # use / for the directory separator, remove duplicate and 
    416     # trailing slashes 
    417     $dir=~s/[\\\/]+/\//g;  
    418     $dir=~s/[\\\/]+$//; 
    419  
    420     # make sure the cache directory exists 
    421     my $dirsofar = ""; 
    422     my $first = 1; 
    423     foreach my $dirname (split ("/", $dir)) { 
    424     $dirsofar .= "/" unless $first; 
    425     $first = 0; 
    426  
    427     $dirsofar .= $dirname; 
    428  
    429     next if $dirname =~ /^(|[a-z]:)$/i; 
    430     if (!-e $dirsofar) 
    431         { 
    432         my $store_umask = umask(0002); 
    433         my $mkdir_ok = mkdir ($dirsofar, 0777); 
    434         umask($store_umask); 
    435         if (!$mkdir_ok) 
    436         { 
    437             print STDERR "util::mk_all_dir could not create directory $dirsofar\n"; 
    438             return; 
    439         } 
    440         } 
    441     } 
     577sub mk_all_dir 
     578{ 
     579  my ($dir) = @_; 
     580 
     581  ###rint "-> util::mk_all_dir($dir)\n"; 
     582 
     583  # support for HDFS 
     584  if (&util::isHDFS($dir)) 
     585  { 
     586    # HDFS's version of mkdir does it recursively anyway 
     587    my $result = &util::executeHDFSCommand('mkdir', $dir); 
     588    return ($result == 0); 
     589  } 
     590 
     591  # use / for the directory separator, remove duplicate and 
     592  # trailing slashes 
     593  $dir=~s/[\\\/]+/\//g; 
     594  $dir=~s/[\\\/]+$//; 
     595 
     596  # ensure the directory doesn't already exist 
     597  if (-e $dir) 
     598  { 
     599    return 0; 
     600  } 
     601 
     602  # make sure the cache directory exists 
     603  my $dirsofar = ""; 
     604  my $first = 1; 
     605  foreach my $dirname (split ("/", $dir)) 
     606  { 
     607    $dirsofar .= "/" unless $first; 
     608    $first = 0; 
     609 
     610    $dirsofar .= $dirname; 
     611 
     612    next if $dirname =~ /^(|[a-z]:)$/i; 
     613    if (!-e $dirsofar) 
     614    { 
     615      my $store_umask = umask(0002); 
     616      my $mkdir_ok = mkdir ($dirsofar, 0777); 
     617      umask($store_umask); 
     618      if (!$mkdir_ok) 
     619      { 
     620        print STDERR "util::mk_all_dir could not create directory $dirsofar\n"; 
     621        return 0; 
     622      } 
     623    } 
     624  } 
     625  return (-e $dir); 
    442626} 
    443627 
     
    446630    my ($src, $dest, $verbosity) = @_; 
    447631 
     632    print "&util::hard_link( $src, $dest, $verbosity)\n"; 
     633 
    448634    # remove trailing slashes from source and destination files 
    449635    $src =~ s/[\\\/]+$//; 
     
    452638##    print STDERR "**** src = ", unicode::debug_unicode_string($src),"\n"; 
    453639    # a few sanity checks 
    454     if (-e $dest) { 
     640    if (&util::file_exists($dest)) { 
    455641    # destination file already exists 
    456642    return; 
    457643    } 
    458     elsif (!-e $src) { 
     644    elsif (!&util::file_exists($src)) { 
    459645    print STDERR "util::hard_link source file \"$src\" does not exist\n"; 
    460646    return 1; 
    461647    } 
    462     elsif (-d $src) { 
     648    elsif (!&util::file_exists($src) && &util::dir_exists($src)) { 
    463649    print STDERR "util::hard_link source \"$src\" is a directory\n"; 
    464650    return 1; 
     
    466652 
    467653    my $dest_dir = &File::Basename::dirname($dest); 
    468     mk_all_dir($dest_dir) if (!-e $dest_dir); 
    469  
     654    if (!&util::dir_exists($dest_dir)) 
     655    { 
     656      mk_all_dir($dest_dir); 
     657    } 
     658 
     659    # HDFS Support - we can't ever link, copy instead 
     660    if (&util::isHDFS($src)) 
     661    { 
     662      if (&util::isHDFS($dest)) 
     663      { 
     664        &util::executeHDFSCommand('put', $src, $dest); 
     665        return 0; 
     666      } 
     667      else 
     668      { 
     669        &util::executeHDFSCommand('get', $src, $dest); 
     670        return 0; 
     671      } 
     672    } 
     673    elsif (&util::isHDFS($dest)) 
     674    { 
     675      &util::executeHDFSCommand('put', $src, $dest); 
     676      return 0; 
     677    } 
    470678 
    471679    if (!link($src, $dest)) { 
     
    553761} 
    554762 
    555  
    556763sub fd_exists 
    557764{ 
     
    563770 
    564771    my $exists = 0; 
     772 
     773    # Support for HDFS [jmt12] 
     774    if (&util::isHDFS($filename_full_path)) 
     775    { 
     776      # very limited test op support for HDFS 
     777      if ($test_op ne '-d' && $test_op ne '-e' && $test_op ne '-z') 
     778      { 
     779        $test_op = '-e'; 
     780      } 
     781      my $result = &util::executeHDFSCommand('test ' . $test_op, $filename_full_path); 
     782      return ($result == 0); 
     783    } 
    565784 
    566785    if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 
     
    587806{ 
    588807    my ($filename_full_path) = @_; 
    589  
     808#    if ($filename_full_path =~ /^hdfs:/) 
     809#    { 
     810#      print "-> util::file_exists(" . $filename_full_path . ")\n"; 
     811#    } 
    590812    return fd_exists($filename_full_path,"-f"); 
    591813} 
     
    594816{ 
    595817    my ($filename_full_path) = @_; 
    596  
     818#    if ($filename_full_path =~ /^hdfs:/) 
     819#    { 
     820#      print "-> util::dir_exists(" . $filename_full_path . ")\n"; 
     821#    } 
    597822    return fd_exists($filename_full_path,"-d"); 
    598823} 
     
    8371062    my $filename = shift (@_); 
    8381063 
    839     # need to put single backslash back to double so that regex works 
    840     if ($ENV{'GSDLOS'} =~ /^windows$/i) { 
    841     $filename =~ s/\\/\\\\/g; 
    842     } 
     1064    # need to make single backslashes double so that regex works 
     1065    $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);     
     1066     
     1067    # note that the first part of a substitution is a regex, so RE chars need to be escaped, 
     1068    # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally 
     1069    $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them 
     1070    $filename =~ s@\(@\\(@g; # escape brackets 
     1071    $filename =~ s@\)@\\)@g; # escape brackets 
     1072    $filename =~ s@\[@\\[@g; # escape brackets 
     1073    $filename =~ s@\]@\\]@g; # escape brackets 
     1074     
     1075    return $filename; 
     1076} 
     1077 
     1078sub unregex_filename { 
     1079    my $filename = shift (@_); 
     1080 
     1081    # need to put doubled backslashes for regex back to single 
     1082    $filename =~ s/\\\./\./g; # remove RE syntax for . 
     1083    $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "(" 
     1084    $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")" 
     1085    $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "[" 
     1086    $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]" 
     1087     
     1088    # \\ goes to \ 
     1089    # This is the last step in reverse mirroring the order of steps in filename_to_regex() 
     1090    $filename =~ s/\\\\/\\/g; # remove RE syntax for \     
    8431091    return $filename; 
    8441092} 
     
    8691117    $filename =~ s/[\\\/]$//; 
    8701118 
     1119    # restore protocols if present [jmt12] 
     1120    $filename =~ s/(file|hdfs|https?):\/([^\/])/$1:\/\/$2/g; 
     1121 
    8711122    return $filename; 
    8721123} 
     
    9021153} 
    9031154 
    904 my $oid_warned_about_periods = 0; 
    9051155 
    9061156sub tidy_up_oid { 
    907     my ($OID, $verbosity) = @_; 
    908     if (!defined $verbosity) 
    909     { 
    910       $verbosity = 2; 
    911     } 
     1157    my ($OID) = @_; 
    9121158    if ($OID =~ /\./) { 
    913       if (!$oid_warned_about_periods) 
    914       { 
    915         print STDERR "Warning, identifier $OID contains periods (.), removing them for this and future documents\n"; 
    916         $oid_warned_about_periods = 1; 
    917       } 
     1159    print STDERR "Warning, identifier $OID contains periods (.), removing them\n"; 
    9181160    $OID =~ s/\.//g; #remove any periods 
    9191161    } 
     
    9351177 
    9361178    # do not prepend any value/path that's already in the environment variable 
    937     if ($ENV{'GSDLOS'} =~ /^windows$/i)  
    938     { 
    939     my $escaped_val = $val; 
    940     $escaped_val =~ s/\\/\\\\/g; # escape any Windows backslashes for upcoming regex 
    941     if (!defined($ENV{$var})) { 
    942         $ENV{$var} = "$val"; 
    943     } 
    944     elsif($ENV{$var} !~ m/$escaped_val/) {  
    945         $ENV{$var} = "$val;".$ENV{$var}; 
    946     } 
    947     } 
    948     else { 
    949     if (!defined($ENV{$var})) { 
    950         $ENV{$var} = "$val"; 
    951     } 
    952     elsif($ENV{$var} !~ m/$val/) {  
    953         $ENV{$var} = "$val:".$ENV{$var}; 
    954     } 
     1179     
     1180    my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex 
     1181    if (!defined($ENV{$var})) { 
     1182    $ENV{$var} = "$val"; 
     1183    } 
     1184    elsif($ENV{$var} !~ m/$escaped_val/) {  
     1185    $ENV{$var} = "$val;".$ENV{$var}; 
    9551186    } 
    9561187} 
     
    9581189sub envvar_append { 
    9591190    my ($var,$val) = @_; 
    960  
     1191     
    9611192    # do not append any value/path that's already in the environment variable 
    962     if ($ENV{'GSDLOS'} =~ /^windows$/i)  
    963     { 
    964     my $escaped_val = $val; 
    965     $escaped_val =~ s/\\/\\\\/g; # escape any Windows backslashes for upcoming regex 
    966     if (!defined($ENV{$var})) { 
    967         $ENV{$var} = "$val"; 
    968     } 
    969     elsif($ENV{$var} !~ m/$escaped_val/) {  
    970         $ENV{$var} .= ";$val"; 
    971     } 
    972     } 
    973     else { 
    974     if (!defined($ENV{$var})) { 
    975         $ENV{$var} = "$val"; 
    976     } 
    977     elsif($ENV{$var} !~ m/$val/) {  
    978         $ENV{$var} .= ":$val"; 
    979     } 
    980     }     
     1193     
     1194    my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex 
     1195    if (!defined($ENV{$var})) { 
     1196    $ENV{$var} = "$val"; 
     1197    } 
     1198    elsif($ENV{$var} !~ m/$escaped_val/) {  
     1199    $ENV{$var} .= ";$val"; 
     1200    } 
    9811201} 
    9821202 
     
    10471267} 
    10481268 
    1049  
     1269# If filename is relative to within_dir, returns the relative path of filename to that directory 
     1270# with slashes in the filename returned as they were in the original (absolute) filename. 
    10501271sub filename_within_directory 
    10511272{ 
     
    10571278    } 
    10581279     
    1059     $within_dir =~ s/\\/\\\\/g; # escape DOS style file separator 
    1060  
     1280    $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets    
    10611281    if ($filename =~ m/^$within_dir(.*)$/) { 
    10621282    $filename = $1; 
     
    10651285    return $filename; 
    10661286} 
     1287 
     1288# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format. 
     1289# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes). 
     1290# The subpath returned will also be a URL type filename. 
     1291sub filename_within_directory_url_format 
     1292{ 
     1293    my ($filename,$within_dir) = @_; 
     1294     
     1295    # convert parameters only to / slashes if Windows 
     1296     
     1297    my $filename_urlformat = &filepath_to_url_format($filename); 
     1298    my $within_dir_urlformat = &filepath_to_url_format($within_dir); 
     1299 
     1300    #if ($within_dir_urlformat !~ m/\/$/) { 
     1301        # make sure directory ends with a slash 
     1302        #$within_dir_urlformat .= "/"; 
     1303    #} 
     1304     
     1305    my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets 
     1306     
     1307    #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n"; 
     1308     
     1309    # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath) 
     1310    if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) { 
     1311        $filename_urlformat = $1; 
     1312    } 
     1313     
     1314    return $filename_urlformat; 
     1315} 
     1316 
     1317# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is, 
     1318# since on Linux it doesn't represent a file separator but an escape char). 
     1319sub filepath_to_url_format 
     1320{ 
     1321    my ($filepath) = @_; 
     1322    if ($ENV{'GSDLOS'} =~ /^windows$/i) { 
     1323        # Only need to worry about Windows, as Unix style directories already in url-format 
     1324        # Convert Windows style \ => / 
     1325        $filepath =~ s@\\@/@g;       
     1326    } 
     1327    return $filepath; 
     1328} 
     1329 
     1330# regex filepaths on windows may include \\ as path separator. Convert \\ to / 
     1331sub filepath_regex_to_url_format 
     1332{ 
     1333    my ($filepath) = @_; 
     1334    if ($ENV{'GSDLOS'} =~ /^windows$/i) { 
     1335    # Only need to worry about Windows, as Unix style directories already in url-format 
     1336    # Convert Windows style \\ => / 
     1337    $filepath =~ s@\\\\@/@g;         
     1338    } 
     1339    return $filepath; 
     1340     
     1341} 
     1342 
     1343# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep) 
     1344# and ignores trailing / 
     1345# returns (file, dirs) dirs will be empty if no subdirs 
     1346sub url_fileparse 
     1347{ 
     1348    my ($filepath) = @_; 
     1349    # remove trailing / 
     1350    $filepath =~ s@/$@@; 
     1351    if ($filepath !~ m@/@) { 
     1352    return ($filepath, ""); 
     1353    }  
     1354    my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@; 
     1355    return ($file, $dirs); 
     1356     
     1357} 
     1358 
    10671359 
    10681360sub filename_within_collection 
     
    11851477{ 
    11861478    my ($filename) = @_; 
     1479 
     1480    # support for explicit protocol prefixes, for example file: or hdfs:, 
     1481    # which must be absolute [jmt12] 
     1482    if ($filename =~ /^\w+:\/\//) 
     1483    { 
     1484      return 1; 
     1485    } 
    11871486 
    11881487    if ($ENV{'GSDLOS'} =~ /^windows$/i) { 
     
    15021801} 
    15031802 
     1803sub os_dir() { 
     1804     
     1805    my $gsdlarch = ""; 
     1806    if(defined $ENV{'GSDLARCH'}) { 
     1807    $gsdlarch = $ENV{'GSDLARCH'}; 
     1808    } 
     1809    return $ENV{'GSDLOS'}.$gsdlarch; 
     1810} 
    15041811 
    15051812# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.  
     
    15191826    } else { 
    15201827    $defaultUrlprefix = "/greenstone"; 
    1521     $configfile = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", "gsdlsite.cfg"); 
     1828    $configfile = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg"); 
    15221829    push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other  
    15231830    } 
     
    16431950} 
    16441951 
     1952# returns the path to the java command in the JRE included with GS (if any), 
     1953# quoted to safeguard any spaces in this path, otherwise a simple java 
     1954# command is returned which assumes and will try for a system java. 
     1955sub get_java_command { 
     1956    my $java = "java"; 
     1957    if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line  
     1958                               # after running setup.bat or from GLI which also runs setup.bat 
     1959    my $java_bin = &util::filename_cat($ENV{'GSDLHOME'},"packages","jre","bin"); 
     1960    if(-d $java_bin) { 
     1961        $java = &util::filename_cat($java_bin,"java"); 
     1962        $java = "\"".$java."\""; # quoted to preserve spaces in path 
     1963    } 
     1964    } 
     1965    return $java; 
     1966} 
     1967 
     1968 
     1969# Given the qualified collection name (colgroup/collection),  
     1970# returns the collection and colgroup parts 
     1971sub get_collection_parts { 
     1972    # http://perldoc.perl.org/File/Basename.html 
     1973    # my($filename, $directories, $suffix) = fileparse($path); 
     1974    # "$directories contains everything up to and including the last directory separator in the $path  
     1975    # including the volume (if applicable). The remainder of the $path is the $filename." 
     1976    #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);    
     1977 
     1978    my $qualified_collection = shift(@_);  
     1979 
     1980    # Since activate.pl can be launched from the command-line, including by a user,  
     1981    # best not to assume colgroup uses URL-style slashes as would be the case with GLI 
     1982    # Also allow for the accidental inclusion of multiple slashes 
     1983    my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection); 
     1984     
     1985    if(!defined $collection) { 
     1986        $collection = $colgroup; 
     1987        $colgroup = ""; 
     1988    } 
     1989    return ($collection, $colgroup); 
     1990} 
     1991 
     1992# work out the "collectdir/collection" location 
     1993sub resolve_collection_dir { 
     1994    my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode 
     1995     
     1996    my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);    
     1997     
     1998    if (defined $collect_dir) { 
     1999        return &util::filename_cat($collect_dir,$colgroup, $collection); 
     2000    } 
     2001    else { 
     2002        if (defined $site) { 
     2003            return &util::filename_cat($ENV{'GSDL3HOME'},"sites",$site,"collect",$colgroup, $collection); 
     2004        } 
     2005        else { 
     2006            return &util::filename_cat($ENV{'GSDLHOME'},"collect",$colgroup, $collection); 
     2007        } 
     2008    } 
     2009} 
     2010 
     2011# ====== John's new fuctions to support HDFS ====== 
     2012 
     2013# /** 
     2014#  *  Executes a HDFS command without caring about the resulting output while 
     2015#  *  still reacting appropriately to failed executions. 
     2016#  */ 
     2017sub executeHDFSCommand 
     2018{ 
     2019  my $action = shift(@_); 
     2020  my $command = 'hadoop fs -' . $action . ' "' . join('" "', @_) . '"'; 
     2021  my $result = `$command 2>&1`; 
     2022  my $return_value = $?; 
     2023  ###rint STDERR "-> util::executeHDFSCommand('" . $command . "') => " . $return_value . "\n"; 
     2024  return $return_value; 
     2025} 
     2026 
     2027# /** @function file_canread() 
     2028#  */ 
     2029sub file_canread 
     2030{ 
     2031  my ($filename_full_path) = @_; 
     2032  # the HDFS support doesn't have '-r' so it will revert to '-e' 
     2033  return fd_exists($filename_full_path,"-r"); 
     2034} 
     2035# /** file_canread() **/ 
     2036 
     2037sub file_openfdcommand 
     2038{ 
     2039  my ($filename_full_path, $mode) = @_; 
     2040  ##rint STDERR "-> util::file_openfdcommand('" . $filename_full_path . "', '" . $mode . "')\n"; 
     2041  # I'll set to read by default, as that is less destructive to precious files 
     2042  # on your system... 
     2043  if (!defined $mode) 
     2044  { 
     2045    $mode = '<'; 
     2046  } 
     2047  my $open_fd_command = $mode . $filename_full_path; 
     2048  if (&util::isHDFS($filename_full_path)) 
     2049  { 
     2050    # currently don't really support append, but might be able to do something 
     2051    # like: 
     2052    # hadoop fs -cat /user/username/folder/csv1.csv \ 
     2053    #   /user/username/folder/csv2.csv | hadoop fs -put - \ 
     2054    #   /user/username/folder/output.csv 
     2055    if ($mode eq '>>' || $mode eq '>') 
     2056    { 
     2057      # if the file already exists, put won't clobber it like a proper write 
     2058      # would, so try to delete it (can fail) 
     2059      if (&util::file_exists($filename_full_path)) 
     2060      { 
     2061        &util::rm($filename_full_path); 
     2062      } 
     2063      # then create the command 
     2064      $open_fd_command = '| ' . &util::generateHDFSCommand('put', '-', $filename_full_path); 
     2065    } 
     2066    else 
     2067    { 
     2068      $open_fd_command = &util::generateHDFSCommand('cat', $filename_full_path) . ' |'; 
     2069    } 
     2070  } 
     2071  return $open_fd_command; 
     2072} 
     2073 
     2074# /** @function file_readdir() 
     2075#  *  Provide a function to return the files within a directory that is aware 
     2076#  *  of protocols other than file:// 
     2077#  *  @param $dirname  the full path to the directory 
     2078#  *  @param $dir_ref  a reference to an array to populate with files 
     2079#  */ 
     2080sub file_readdir 
     2081{ 
     2082  my ($dirname, $dir_ref) = @_; 
     2083  my $dir_read = 0; 
     2084  if (&util::isHDFS($dirname)) 
     2085  { 
     2086    my $hdfs_command = &util::generateHDFSCommand('ls', $dirname); 
     2087    my $result = `$hdfs_command 2>&1`; 
     2088    my @lines = split(/\r?\n/, $result); 
     2089    foreach my $line (@lines) 
     2090    { 
     2091      if ($line =~ /\/([^\/]+)$/) 
     2092      { 
     2093        my $file = $1; 
     2094        push(@{$dir_ref}, $file); 
     2095      } 
     2096    } 
     2097    $dir_read = 1; 
     2098  } 
     2099  elsif (opendir(DIR, $dirname)) 
     2100  { 
     2101    my @dirs = readdir(DIR); 
     2102    push(@{$dir_ref}, @dirs); 
     2103    closedir(DIR); 
     2104    $dir_read = 1; 
     2105  } 
     2106  return $dir_read; 
     2107} 
     2108# /** file_readdir() **/ 
     2109 
     2110# /** 
     2111#  */ 
     2112sub file_lastmodified 
     2113{ 
     2114  my ($filename_full_path) = @_; 
     2115  my $last_modified = 0; 
     2116  if (&util::isHDFS($filename_full_path)) 
     2117  { 
     2118    my $file_stats = file_stats($filename_full_path); 
     2119    my $mod_date = $file_stats->{'modification_date'}; 
     2120    my $mod_time = $file_stats->{'modification_time'}; 
     2121    # Last modified should be in number of days (as a float) since last 
     2122    # modified - but I'll just return 0 for now 
     2123    $last_modified = 0.00; 
     2124  } 
     2125  else 
     2126  { 
     2127    $last_modified = -M $filename_full_path; 
     2128  } 
     2129  return $last_modified; 
     2130} 
     2131# /** file_lastmodified() **/ 
     2132 
     2133# /** @function file_size 
     2134#  *  Replacement for "-s" in Greenstone buildtime, as we need a version that 
     2135#  *  HDFS aware 
     2136#  */ 
     2137sub file_size 
     2138{ 
     2139  my ($filename_full_path) = @_; 
     2140  my $size = 0; 
     2141  if (&util::isHDFS($filename_full_path)) 
     2142  { 
     2143    my $file_stats = file_stats($filename_full_path); 
     2144    $size = $file_stats->{'filesize'}; 
     2145  } 
     2146  else 
     2147  { 
     2148    $size = -s $filename_full_path; 
     2149  } 
     2150  return $size; 
     2151} 
     2152# /** file_size() **/ 
     2153 
     2154sub file_stats 
     2155{ 
     2156  my ($filename_full_path) = @_; 
     2157  my $stats = {}; 
     2158  if (&util::isHDFS($filename_full_path)) 
     2159  { 
     2160    # - LS is the only way to get these details from HDFS (-stat doesn't 
     2161    #   provide enough information) 
     2162    my $hdfs_command = &util::generateHDFSCommand('ls', $filename_full_path); 
     2163    my $result = `$hdfs_command 2>&1`; 
     2164    # - parse the results 
     2165    if ($result =~ /([ds\-][rwx\-]+)\s+(\d+)\s+([^\s]+)\s+([^\s]+)\s+(\d+)\s+(\d\d\d\d-\d\d-\d\d)\s+(\d\d:\d\d)\s+([^\s]+)$/) 
     2166    { 
     2167      $stats->{'filename'} = $8; 
     2168      $stats->{'replicas'} = $2; 
     2169      $stats->{'filesize'} = $5; 
     2170      $stats->{'modification_date'} = $6; 
     2171      $stats->{'modification_time'} = $7; 
     2172      $stats->{'permissions'} = $1; 
     2173      $stats->{'userid'} = $3; 
     2174      $stats->{'groupid'} = $4; 
     2175    } 
     2176    else 
     2177    { 
     2178      die("Error! Failed to parse HDFS ls result: " . $result . "\n"); 
     2179    } 
     2180  } 
     2181  return $stats; 
     2182} 
     2183# /** file_stats() **/ 
     2184 
     2185# /** 
     2186#  */ 
     2187sub generateHDFSCommand 
     2188{ 
     2189  my $flags = shift(@_); 
     2190  return 'hadoop fs -' . $flags . ' "' . join('" "', @_) . '"'; 
     2191} 
     2192# /** generateHDFSCommand() **/ 
     2193 
     2194# /** @function isHDFS() 
     2195#  *  Determine if the given path exists within a HDFS system by checking for 
     2196#  *  the expected protocol prefix. 
     2197#  *  @param $full_path  the path to check 
     2198#  *  @return 1 if within HDFS, 0 otherwise 
     2199#  */ 
     2200sub isHDFS 
     2201{ 
     2202  my ($full_path) = @_; 
     2203  return (lc(substr($full_path, 0, 7)) eq 'hdfs://'); 
     2204} 
     2205# /** isHDFS() **/ 
    16452206 
    164622071;