root/gs2-extensions/parallel-building/trunk/src/perllib/FileUtils/HDFSShell.pm @ 30354

Revision 30354, 12.3 KB (checked in by jmt12, 5 years ago)

Extending manifest v2 support to allow for directories to be listed in manifest. Matched with changes in Directory plugin to allow paths into systems like HDFS to be listed in manifest.

Line 
1###############################################################################
2#
3# HDFSShell.pm -- file functions acting upon a HDFS via the CLI hadoop
4# application
5#
6# A component of the Greenstone digital library software from the New Zealand
7# Digital Library Project at the University of Waikato, New Zealand.
8#
9# Copyright (C) 2013 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify it under
12# the terms of the GNU General Public License as published by the Free Software
13# Foundation; either version 2 of the License, or (at your option) any later
14# version.
15#
16# This program is distributed in the hope that it will be useful, but WITHOUT
17# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
19# details.
20#
21# You should have received a copy of the GNU General Public License along with
22# this program; if not, write to the Free Software Foundation, Inc., 675 Mass
23# Ave, Cambridge, MA 02139, USA.
24#
25###############################################################################
26
27package FileUtils::HDFSShell;
28
29# Pragma
30use strict;
31
32# Configuration: set $debug to a true value to make _printDebug() write its
# diagnostic messages to STDERR (it prints nothing while $debug is 0)
33my $debug = 0;
34
35################################################################################
36######################### Private Functions & Variables ########################
37################################################################################
38
## @function _executeHDFSCommand()
#
# Builds a 'hadoop fs' command line via _generateHDFSCommand(), runs it through
# backticks with STDERR merged into STDOUT, and hands back either the child
# status or the captured output.
#
# @param  $return_result 0 to receive the value of $? after the command ran,
#                        1 to receive the captured output instead (for
#                        instance when parsing the result of an ls)
# @param  ...            the action followed by its arguments; passed on
#                        unchanged to _generateHDFSCommand()
# @return the raw value of $? (0 when the command succeeded) if $return_result
#         is 0, otherwise the combined STDOUT/STDERR text (never undef)
#
sub _executeHDFSCommand
{
  my $return_result = shift(@_);
  if ($return_result != 0 && $return_result != 1)
  {
    &FileUtils::printError('Unexpected value for return_result argument - should be 0 or 1: ' . $return_result, 1);
  }
  my $command = &_generateHDFSCommand(@_);
  my $result = `$command 2>&1`;
  my $return_value = $?;
  # backticks give undef when the command could not be started at all; fall
  # back to an empty string so the debug message and any caller pattern
  # matching against the output always have a defined value to work with
  if (!defined $result)
  {
    $result = '';
  }
  &_printDebug(' -> _executeHDFSCommand() => |' . $result . '| [' . $return_value . ']');
  # sometimes we may want the actual resulting output returned, for
  # instance when parsing ls
  if ($return_result)
  {
    $return_value = $result;
  }
  return $return_value;
}
## _executeHDFSCommand()
64
65
## @function _generateHDFSCommand()
#
# Assembles the shell command line for a 'hadoop fs' action.
#
# Each argument has a leading 'HDFSShell:' prefix swapped for 'hdfs:' and is
# then wrapped in double quotes. The single argument '-' (standard stream) is
# passed through unquoted.
#
# @param  $action the hadoop fs action without its leading dash, optionally
#                 followed by switches (e.g. 'ls' or 'test -e')
# @param  ...     the path arguments for the action
# @return the complete command line as a string (with a trailing space)
#
sub _generateHDFSCommand
{
  my $action = shift(@_);
  # work on a copy so the caller's variables are never modified
  my @args = @_;
  my $arguments = '';
  foreach my $path (@args)
  {
    # Replace the prefix with one HDFS Shell understands. Anchored so that a
    # path which merely contains the text 'HDFSShell:' is left alone.
    $path =~ s/^HDFSShell:/hdfs:/;
    # special case for standard streams
    if ($path eq '-')
    {
      $arguments .= '- ';
    }
    else
    {
      # Inside double quotes a POSIX shell still interprets backslash, double
      # quote, dollar and backtick - escape them so that unusual filenames can
      # neither break the command nor inject extra commands. Skipped on
      # Windows, where backslash is the path separator and the command
      # interpreter has different quoting rules.
      if ($^O ne 'MSWin32')
      {
        $path =~ s/([\\"\$`])/\\$1/g;
      }
      $arguments .= '"' . $path . '" ';
    }
  }
  my $command = 'hadoop fs -' . $action . ' ' . $arguments;
  &_printDebug(' -> _generateHDFSCommand("' . $action . '", ...) => |' . $command . '|');
  return $command;
}
## _generateHDFSCommand()
92
93
## @function _printDebug()
#
# Writes a diagnostic line to STDERR, but only while the file-level $debug
# flag is true.
#
# @param $message the text to print; it is prefixed with '[DEBUG] ' and
#                 terminated with a newline
#
sub _printDebug
{
  my $message = $_[0];
  print STDERR '[DEBUG] ' . $message . "\n" if ($debug);
}
## _printDebug()
105
106
107################################################################################
108############################### Public Functions ###############################
109################################################################################
110
111
## @function canRead()
#
# Reports whether a path can be read. No real permission check is made: the
# answer is simply whether fileTest() considers the path to be a file.
#
# @param  $path the path to check
# @return whatever fileTest($path, '-f') returns (1 or 0)
#
sub canRead
{
  my ($path) = @_;
  # Working out true readability would mean parsing the permissions, user and
  # group from a listing and comparing them against the current user. On the
  # Hadoop setups this was written against everyone could read everything, so
  # the existence of the file is used as the answer instead.
  my $is_file = fileTest($path, '-f');
  return $is_file;
}
## canRead()
125
126
## @function closeFileHandle()
#
# Closes the file handle held in the referenced scalar.
#
# @param  $fh_ref reference to the scalar containing the file handle
# @return always 1 - the outcome of close() itself is not inspected
#
sub closeFileHandle
{
  my ($fh_ref) = @_;
  close(${$fh_ref});
  return 1;
}
## closeFileHandle()
136
137
## @function fileSize()
#
# Looks up the size of a path by way of fileStats().
#
# @param  $path the path to inspect
# @return the 'filesize' entry of the hash returned by fileStats()
#
sub fileSize
{
  my $path = shift(@_);
  return fileStats($path)->{'filesize'};
}
## fileSize()
147
148
## @function fileStats()
#
# Runs 'hadoop fs -ls' on a path and parses the last entry of the listing into
# a hash of its fields.
#
# @param  $path the path to list
# @return reference to a hash with the keys 'filename', 'replicas', 'filesize',
#         'modification_date' (YYYY-MM-DD), 'modification_time' (HH:MM),
#         'permissions', 'userid' and 'groupid'. When the listing cannot be
#         parsed FileUtils::printError() is called (with 1 as its second
#         argument) and, should that return, the hash is empty.
#
sub fileStats
{
  my ($path) = @_;
  my $stats = {};
  my $result = &_executeHDFSCommand(1, 'ls', $path);
  # - parse the results. Compared with a plain 'rwx' / numeric match this also
  #   accepts: sticky bit (t/T) and a trailing ACL marker (+) in the
  #   permissions, '-' in the replica column (as printed for directories), and
  #   filenames that contain spaces. Trailing whitespace (including a final
  #   CR/LF) is not treated as part of the filename.
  if ($result =~ /([ds\-][rwxtT\-]+\+?)\s+(\d+|-)\s+([^\s]+)\s+([^\s]+)\s+(\d+)\s+(\d\d\d\d-\d\d-\d\d)\s+(\d\d:\d\d)\s+(.*\S)\s*$/)
  {
    $stats->{'filename'} = $8;
    $stats->{'replicas'} = $2;
    $stats->{'filesize'} = $5;
    $stats->{'modification_date'} = $6;
    $stats->{'modification_time'} = $7;
    $stats->{'permissions'} = $1;
    $stats->{'userid'} = $3;
    $stats->{'groupid'} = $4;
  }
  else
  {
    &FileUtils::printError('Failed to parse -ls result: ' . $result, 1);
  }
  return $stats;
}
## fileStats()
175
176
## @function fileTest()
#
# Emulates a small subset of Perl's file test operators by way of
# 'hadoop fs -test'.
#
# @param  $filename_full_path the path to test
# @param  $test_op            one of '-d', '-e', '-f', '-z' or '-l'. A missing
#                             operator and '-l' are both treated as '-e'.
# @return 1 if the test passed, 0 if it failed or the operator is unsupported
#
sub fileTest
{
  my ($filename_full_path, $test_op) = @_;
  # Special case: HDFS doesn't support symlinking - swap for -e instead
  $test_op = '-e' if (!defined $test_op || $test_op eq '-l');

  # Special case: -f is answered with a -e followed by a -d, where the -d is
  # expected to fail for plain files
  if ($test_op eq '-f')
  {
    my $exists_status = _executeHDFSCommand(0, 'test -e', $filename_full_path);
    return 0 unless ($exists_status == 0);
    my $directory_status = _executeHDFSCommand(0, 'test -d', $filename_full_path);
    return ($directory_status > 0 ? 1 : 0);
  }

  # very limited test op support for HDFS
  my $is_supported = ($test_op eq '-d' || $test_op eq '-e' || $test_op eq '-z');
  if (!$is_supported)
  {
    &FileUtils::printError('Unknown or unsupported test mode: ' . $test_op);
    return 0;
  }

  # a status of 0 means the test passed; anything else counts as a failure
  my $status = _executeHDFSCommand(0, 'test ' . $test_op, $filename_full_path);
  return ($status == 0 ? 1 : 0);
}
## fileTest()
215
216
## @function filenameConcatenate()
#
# Joins path components onto a protocol prefix.
#
# @param  $protocol the leading part of the path; it is used verbatim
# @param  ...       the path components to join with '/'
# @return the protocol, a '/', and the joined components with any runs of
#         slashes inside the components collapsed to one and a single trailing
#         slash or backslash removed
#
sub filenameConcatenate
{
  my ($protocol, @parts) = @_;
  my $joined_parts = join('/', @parts);
  # collapse runs of slashes within the joined components
  $joined_parts =~ s/[\/]+/\//g;
  # prepend the protocol - done after the collapse, so a double slash may
  # appear where the protocol meets the components
  my $filename = $protocol . '/' . $joined_parts;
  # drop one trailing slash (or backslash)
  $filename =~ s/[\\\/]$//;
  return $filename;
}
## filenameConcatenate()
232
233
## @function isFilenameAbsolute()
#
# Reports whether a filename is absolute. Any arguments are ignored because
# every path handled by this driver is considered absolute.
#
# @return always 1
#
sub isFilenameAbsolute
{
  # File paths against HDFS must be.
  my $always_absolute = 1;
  return $always_absolute;
}
## isFilenameAbsolute()
242
243
## @function isHDFS()
#
# Identifies this driver as one that acts upon HDFS.
#
# @return always 1
#
sub isHDFS
{
  my $is_hdfs = 1;
  return $is_hdfs;
}
## isHDFS()
251
252
## @function isSpecialDirectory()
#
# Reports whether a path is nothing more than the protocol, host and port of a
# HDFS location (i.e. 'HDFSShell://<host>:<port>' with no path after it).
#
# @param  $path the path to check
# @return true if the path has exactly that form, false otherwise
#
sub isSpecialDirectory
{
  my ($path) = @_;
  # The host may be a plain name, a dotted (fully qualified) name or an IP
  # address, so digits, dots and hyphens are accepted as well as letters.
  return ($path =~ /^HDFSShell:\/\/[a-zA-Z0-9.\-]+:\d+$/);
}
## isSpecialDirectory()
261
262
## @function makeDirectory()
#
# Creates a directory with 'hadoop fs -mkdir'.
#
# @param  $dir the directory to create
# @return 1 if the command reported a status of 0, otherwise 0
#
sub makeDirectory
{
  my $dir = shift(@_);
  my $status = _executeHDFSCommand(0, 'mkdir', $dir);
  # only a status of 0 counts as success
  return 1 if ($status == 0);
  return 0;
}
## makeDirectory()
273
274
## @function modificationTime()
#
# Determines when a path was last modified, using the date and time columns
# that fileStats() extracts from a 'hadoop fs -ls' listing.
#
# @param  $path the path to inspect
# @return the modification time as seconds since the epoch, to the minute (the
#         listing carries no seconds), or 0 - after a warning - when the date
#         and time could not be turned into a timestamp
#
sub modificationTime
{
  my ($path) = @_;
  my $file_stats = &fileStats($path);
  my $mod_date = $file_stats->{'modification_date'};
  my $mod_time = $file_stats->{'modification_time'};
  # only read the capture variables once both matches are known to have
  # succeeded - otherwise they would hold leftovers from an earlier match
  if (defined $mod_date && defined $mod_time)
  {
    my ($mod_year, $mod_month, $mod_day) = $mod_date =~ /^(\d\d\d\d)-(\d\d)-(\d\d)$/;
    my ($mod_hour, $mod_minute) = $mod_time =~ /^(\d\d):(\d\d)$/;
    if (defined $mod_day && defined $mod_minute)
    {
      # loaded on demand as nothing else in this file needs it
      require Time::Local;
      # NOTE(review): assumes the listing shows times in the same local
      # timezone as this process - confirm against the Hadoop client setup.
      # timelocal() dies on out-of-range values, hence the eval.
      my $epoch = eval { Time::Local::timelocal(0, $mod_minute, $mod_hour, $mod_day, $mod_month - 1, $mod_year); };
      if (defined $epoch)
      {
        return $epoch;
      }
    }
  }
  &FileUtils::printWarning('modificationTime() could not determine a timestamp for: ' . $path);
  return 0;
}
## modificationTime()
294
295
## @function openFileHandle()
#
# Opens a file handle onto a path by piping to or from a 'hadoop fs' command:
# writing pipes into 'put' (reading from standard input), reading pipes out of
# 'cat'.
#
# @param  $path   the path to open
# @param  $mode   '>' or 'w' to write; '>>' or 'a' (append) is rejected; any
#                 other value opens the path for reading
# @param  $fh_ref reference to the scalar that is to receive the file handle
# @return 1 once the pipe is open, 0 if the mode is unsupported or the pipe
#         could not be opened (only reached when FileUtils::printError()
#         returns)
#
sub openFileHandle
{
  my ($path, $mode, $fh_ref) = @_;
  if ($mode eq '>>' || $mode eq 'a')
  {
    &FileUtils::printError('Append (>>) mode not supported', 1);
    return 0;
  }
  if ($mode eq '>' || $mode eq 'w')
  {
    # the put command fails if the file already exists
    if (&fileTest($path, '-e'))
    {
      &removeFiles($path);
    }
    # three argument form: the mode is stated explicitly rather than being
    # inferred from a '|' embedded in the command string
    my $command = &_generateHDFSCommand('put', '-', $path);
    if (!open($$fh_ref, '|-', $command))
    {
      &FileUtils::printError('Failed to open pipe to HDFS (put) for writing: ' . $path, 1);
      return 0;
    }
  }
  else
  {
    my $command = &_generateHDFSCommand('cat', $path);
    if (!open($$fh_ref, '-|', $command))
    {
      &FileUtils::printError('Failed to open pipe to HDFS (cat) for reading: ' . $path, 1);
      return 0;
    }
  }
  return 1;
}
## openFileHandle()
321
322
## @function readDirectory()
#
# Lists the entries of a directory using 'hadoop fs -ls'.
#
# @param  $path the directory to list
# @return reference to an array holding the final path component of every
#         listing line that contains a slash, or undef (after a warning) when
#         the listing reports 'No such file or directory'
#
sub readDirectory
{
  my ($path) = @_;
  my @files;
  my $result = &_executeHDFSCommand(1, 'ls', $path);
  if ($result =~ /No such file or directory/)
  {
    # say what actually went wrong, and for which path
    &FileUtils::printWarning('readDirectory() failed - no such file or directory: ' . $path);
    return undef;
  }
  my @lines = split(/\r?\n/, $result);
  foreach my $line (@lines)
  {
    # keep whatever follows the last slash on the line; lines without a slash
    # are skipped
    if ($line =~ /\/([^\/]+)$/)
    {
      my $file = $1;
      push(@files, $file);
    }
  }
  return \@files;
}
## readDirectory()
347
348
## @function removeFiles()
#
# Removes a path with 'hadoop fs -rm', or with 'hadoop fs -rmr' when asked to
# include directories.
#
# @param  $path          the path to remove
# @param  $including_dir true to use 'rmr' rather than 'rm' (optional)
# @return 1 if the command reported a status of 0, otherwise 0
#
sub removeFiles
{
  my ($path, $including_dir) = @_;
  # an undefined flag is simply false, so one truth test covers both cases
  my $action = ($including_dir ? 'rmr' : 'rm');
  my $status = _executeHDFSCommand(0, $action, $path);
  # only a status of 0 counts as success
  return 1 if ($status == 0);
  return 0;
}
## removeFiles()
367
368
## @function removeFilesFiltered()
#
# Performs a depth first, recursive removal of the files and directories that
# match the given accept and reject patterns.
#
# @param  $paths     a single path, or a reference to an array of paths
# @param  $accept_re when defined, only files whose path matches are removed
# @param  $reject_re when defined, files whose path matches are never removed
# @return the number of files and directories removed. Directories themselves
#         are only removed when neither pattern is defined.
#
sub removeFilesFiltered
{
  my ($paths, $accept_re, $reject_re) = @_;
  my @paths_array = (ref $paths eq "ARRAY") ? @$paths : ($paths);
  my $num_removed = 0;
  foreach my $path (@paths_array)
  {
    # remove trailing slashes
    $path =~ s/[\/\\]+$//;
    if (!&fileTest($path, '-e'))
    {
      &FileUtils::printError('path does not exist: ' . $path);
    }
    elsif (&fileTest($path, '-d'))
    {
      # readDirectory() returns undef when the listing failed - treat that as
      # "no children" rather than dying on the dereference
      my $files_ref = &readDirectory($path);
      my @files = (defined $files_ref) ? @{$files_ref} : ();
      foreach my $file (@files)
      {
        my $child_path = $path . '/' . $file;
        $num_removed += &removeFilesFiltered($child_path, $accept_re, $reject_re);
      }
      if (!defined $accept_re && !defined $reject_re)
      {
        # remove this directory
        my $result = &removeFiles($path, 1);
        if ($result != 1)
        {
          &FileUtils::printError('could not remove directory: ' . $path);
        }
        else
        {
          $num_removed++;
        }
      }
    }
    else
    {
      # rejected files are left in place
      if (defined $reject_re && ($path =~ m/$reject_re/))
      {
        next;
      }
      if ((!defined $accept_re) || ($path =~ m/$accept_re/))
      {
        # remove this file
        my $result = &removeFiles($path);
        if ($result != 1)
        {
          &FileUtils::printError('could not remove file: ' . $path);
        }
        else
        {
          $num_removed++;
        }
      }
    }
  }
  return $num_removed;
}
## removeFilesFiltered()
432
433
## @function removeFilesRecursive()
#
# Removes a path and everything beneath it. This is removeFilesFiltered()
# called with neither an accept nor a reject expression.
#
# @param  $path the path (or whatever else removeFilesFiltered() accepts as
#               its first argument) to remove
# @return the count returned by removeFilesFiltered()
#
sub removeFilesRecursive
{
  my $path = shift(@_);
  my ($accept_re, $reject_re) = (undef, undef);
  return removeFilesFiltered($path, $accept_re, $reject_re);
}
## removeFilesRecursive()
444
445
## @function supportsSymbolicLink()
#
# Reports whether this driver supports symbolic links.
#
# @return always 0
#
sub supportsSymbolicLink
{
  my $supported = 0;
  return $supported;
}
## supportsSymbolicLink()
453
454
## @function transferFile()
#
# Copies or moves a path using 'hadoop fs -cp' or 'hadoop fs -mv'.
#
# @param  $mode 'COPY' to copy; any other value moves
# @param  $src  the source path
# @param  $dst  the destination path
# @return 1 if the command reported a status of 0, otherwise 0
#
sub transferFile
{
  my ($mode, $src, $dst) = @_;
  my $action = ($mode eq 'COPY') ? 'cp' : 'mv';
  my $status = _executeHDFSCommand(0, $action, $src, $dst);
  # only a status of 0 counts as success
  return 1 if ($status == 0);
  return 0;
}
## transferFile()
473
474
## @function transferFileFromLocal()
#
# Transfers a local file into HDFS using 'hadoop fs -put', removing the local
# file afterwards when moving.
#
# @param  $mode 'MOVE' to delete the local source after a successful transfer;
#               any other value leaves it in place
# @param  $src  the local source path
# @param  $dst  the destination path; if fileTest() reports it as a directory
#               the source's filename is appended to it
# @return 1 if the put succeeded (and, when moving, the source is gone),
#         otherwise 0
#
sub transferFileFromLocal
{
  my ($mode, $src, $dst) = @_;
  if (!-f $src)
  {
    &FileUtils::printError('Source file (during ' . $mode . ') doesn\'t exists: ' . $src);
  }
  if (&fileTest($dst, '-d'))
  {
    my ($filename) = $src =~ /([^\\\/]+)$/;
    $dst .= '/' . $filename;
  }
  if (&fileTest($dst, '-f'))
  {
    &FileUtils::printError('Destination file (during ' . $mode . ') already exists: ' . $dst);
  }
  my $result = &_executeHDFSCommand(0, 'put', $src, $dst);
  my $remove_result = 1;
  # never delete the local copy unless the put actually succeeded - otherwise
  # a failed move would destroy the only copy of the file
  if ($mode eq 'MOVE' && $result == 0)
  {
    unlink($src);
    # failed to delete somehow
    if (-f $src)
    {
      $remove_result = 0;
    }
  }
  return ($result == 0 && $remove_result ? 1 : 0);
}
## transferFileFromLocal()
507
508
## @function transferFileToLocal()
#
# Transfers a file out of HDFS to the local filesystem using 'hadoop fs -get',
# removing the HDFS file afterwards when moving.
#
# @param  $mode 'MOVE' to remove the HDFS source after a successful transfer;
#               any other value leaves it in place
# @param  $src  the HDFS source path
# @param  $dst  the local destination path; if it is an existing directory the
#               source's filename is appended to it
# @return 1 if the get succeeded (and, when moving, the source was removed),
#         otherwise 0
#
sub transferFileToLocal
{
  my ($mode, $src, $dst) = @_;
  if (!&fileTest($src, '-f'))
  {
    &FileUtils::printError('Source file (during ' . $mode . ') does not exist: ' . $src);
  }
  if (-d $dst)
  {
    my ($filename) = $src =~ /([^\\\/]+)$/;
    $dst .= '/' . $filename;
  }
  if (-e $dst)
  {
    &FileUtils::printError('Destination file (during ' . $mode . ') already exists: ' . $dst);
  }
  my $result = &_executeHDFSCommand(0, 'get', $src, $dst);
  my $remove_result = 1;
  # never remove the HDFS copy unless the get actually succeeded - otherwise
  # a failed move would destroy the only copy of the file
  if ($mode eq 'MOVE' && $result == 0)
  {
    $remove_result = &removeFiles($src);
  }
  return ($result == 0 && $remove_result ? 1 : 0);
}
## transferFileToLocal()
536
537
5381;
Note: See TracBrowser for help on using the browser.