source: gs2-extensions/parallel-building/trunk/src/perllib/FileUtils/HDFSShell.pm@ 30354

Last change on this file since 30354 was 30354, checked in by jmt12, 8 years ago

Extending manifest v2 support to allow directories to be listed in the manifest. Matched with changes in the Directory plugin to allow paths into systems like HDFS to be listed in the manifest.

File size: 12.3 KB
Line 
1###############################################################################
2#
3# HDFSShell.pm -- file functions acting upon a HDFS via the CLI hadoop
4# application
5#
6# A component of the Greenstone digital library software from the New Zealand
7# Digital Library Project at the University of Waikato, New Zealand.
8#
9# Copyright (C) 2013 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify it under
12# the terms of the GNU General Public License as published by the Free Software
13# Foundation; either version 2 of the License, or (at your option) any later
14# version.
15#
16# This program is distributed in the hope that it will be useful, but WITHOUT
17# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
19# details.
20#
21# You should have received a copy of the GNU General Public License along with
22# this program; if not, write to the Free Software Foundation, Inc., 675 Mass
23# Ave, Cambridge, MA 02139, USA.
24#
25###############################################################################
26
27package FileUtils::HDFSShell;
28
29# Pragma
30use strict;
31
# Configuration
# Set to a true value to have _printDebug() write its '[DEBUG] ...' lines to
# STDERR; with the default of 0 those messages are suppressed.
my $debug = 0;
34
35################################################################################
36######################### Private Functions & Variables ########################
37################################################################################
38
## @function _executeHDFSCommand()
#
# Builds a 'hadoop fs' command line via _generateHDFSCommand() and runs it
# through the shell, capturing STDOUT and STDERR together.
#
# @param  return_result  0 to get the command's status back, 1 to get the
#                        captured output back (for instance when parsing ls)
# @param  ...            the HDFS action followed by its path arguments; these
#                        are handed straight to _generateHDFSCommand()
# @return the raw child status ($?, where 0 means success) when return_result
#         is 0, otherwise the combined STDOUT/STDERR text of the command
#
sub _executeHDFSCommand
{
  my $return_result = shift(@_);
  # Validate as a string: a numeric comparison would let undef - or an action
  # name passed in the wrong position, such as 'ls' - masquerade as 0
  if (!defined $return_result || $return_result !~ /\A[01]\z/)
  {
    &FileUtils::printError('Unexpected value for return_result argument - should be 0 or 1: ' . (defined $return_result ? $return_result : 'undef'), 1);
  }
  my $command = &_generateHDFSCommand(@_);
  my $result = `$command 2>&1`;
  # Grab the status immediately, before anything else can overwrite $?
  my $return_value = $?;
  &_printDebug(' -> _executeHDFSCommand() => |' . $result . '| [' . $return_value . ']');
  # sometimes we may want the actual resulting output returned, for
  # instance when parsing ls
  if ($return_result)
  {
    $return_value = $result;
  }
  return $return_value;
}
## _executeHDFSCommand()
64
65
## @function _generateHDFSCommand()
#
# Assembles the shell command line for a single 'hadoop fs' invocation.
#
# @param  action  the HDFS shell action without its leading dash, optionally
#                 followed by flags (e.g. 'ls', 'test -e', 'put')
# @param  ...     the path arguments; a lone '-' denotes a standard stream and
#                 is passed through unquoted, every other path is
#                 double-quoted
# @return the command string (note that it ends with a trailing space)
#
sub _generateHDFSCommand
{
  my $action = shift(@_);
  # work on a copy so the caller's variables (aliased by @_) are untouched
  my @args = @_;
  my $arguments = '';
  foreach my $path (@args)
  {
    # special case for standard streams
    if ($path eq '-')
    {
      $arguments .= '- ';
      next;
    }
    # Replace the prefix with one HDFS Shell understands. Anchored so that a
    # local path which merely contains 'HDFSShell:' is left alone
    $path =~ s/\AHDFSShell:/hdfs:/;
    # Backslash, double quote, dollar and backtick remain special inside
    # double quotes for a POSIX shell; escape them so a path can neither
    # break out of the quoting nor trigger expansion
    $path =~ s/([\\"\$`])/\\$1/g;
    $arguments .= '"' . $path . '" ';
  }
  my $command = 'hadoop fs -' . $action . ' ' . $arguments;
  &_printDebug(' -> _generateHDFSCommand("' . $action . '", ...) => |' . $command . '|');
  return $command;
}
## _generateHDFSCommand()
92
93
## @function _printDebug()
#
# Writes a '[DEBUG] '-prefixed, newline-terminated message to STDERR, but only
# while the file-level $debug flag is true.
#
# @param  message  the text to report
#
sub _printDebug
{
  my $message = shift(@_);
  print STDERR '[DEBUG] ' . $message . "\n" if ($debug);
}
## _printDebug()
105
106
107################################################################################
108############################### Public Functions ###############################
109################################################################################
110
111
## @function canRead()
#
# Reports whether the given HDFS path can be read. No permission check is
# performed: working out readability would mean parsing the permissions, user
# and group and comparing them with the current user, so the answer is simply
# whether fileTest() regards the path as an existing file ('-f').
#
# @param  path  the HDFS path to check
# @return 1 if the path passes the '-f' test, 0 otherwise
#
sub canRead
{
  my ($path) = @_;
  my $is_file = fileTest($path, '-f');
  return $is_file;
}
## canRead()
125
126
## @function closeFileHandle()
#
# Closes the file handle held in the referenced scalar.
#
# @param  fh_ref  reference to the scalar holding the handle
# @return always 1; the outcome of close() itself is not reported
#
sub closeFileHandle
{
  my ($fh_ref) = @_;
  close(${$fh_ref});
  return 1;
}
## closeFileHandle()
136
137
## @function fileSize()
#
# Looks up the size of a path via fileStats().
#
# @param  path  the HDFS path to inspect
# @return the 'filesize' field of the statistics gathered by fileStats()
#
sub fileSize
{
  my $path = shift(@_);
  return fileStats($path)->{'filesize'};
}
## fileSize()
147
148
## @function fileStats()
#
# Runs 'hadoop fs -ls' on a path and parses the listing entry found on the
# last non-blank line of the output.
#
# NOTE(review): listing a directory presumably prints its children rather than
# the directory itself, in which case the statistics describe the last child -
# confirm callers only pass file paths.
#
# @param  path  the HDFS path to inspect
# @return a hash reference with the keys 'filename', 'replicas', 'filesize',
#         'modification_date' (YYYY-MM-DD), 'modification_time' (HH:MM),
#         'permissions', 'userid' and 'groupid'. When the output cannot be
#         parsed, FileUtils::printError() is called with its second argument
#         set to 1 and the hash is left empty.
#
sub fileStats
{
  my ($path) = @_;
  my $stats = {};
  my $result = &_executeHDFSCommand(1, 'ls', $path);
  # - parse the results. The pattern is anchored to the end of the output and
  #   no field can span a newline, so only the final entry can match.
  #   Compared with a plain digits-and-non-space pattern this also accepts
  #   sticky/ACL permission flags, the '-' shown instead of a replication
  #   count, and paths that contain spaces.
  if ($result =~ /([ds\-][rwxsStT\-]+[+.]?)\s+(\d+|-)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d\d\d\d-\d\d-\d\d)\s+(\d\d:\d\d)\s+(\S.*?)\s*$/)
  {
    # copy the captures at once, before any other match can clobber them
    @{$stats}{'permissions', 'replicas', 'userid', 'groupid', 'filesize', 'modification_date', 'modification_time', 'filename'} = ($1, $2, $3, $4, $5, $6, $7, $8);
  }
  else
  {
    &FileUtils::printError('Failed to parse -ls result: ' . $result, 1);
  }
  return $stats;
}
## fileStats()
175
176
## @function fileTest()
#
# Emulates a small subset of Perl's file test operators against HDFS using
# 'hadoop fs -test'.
#
# @param  filename_full_path  the HDFS path to test
# @param  test_op             one of '-d', '-e', '-f', '-l' or '-z'; defaults
#                             to '-e' when undefined. HDFS doesn't support
#                             symlinking, so '-l' is treated as '-e'. Any
#                             other operator is reported through
#                             FileUtils::printError() and counts as a failed
#                             test.
# @return 1 if the test passed, 0 otherwise
#
sub fileTest
{
  my ($filename_full_path, $test_op) = @_;
  # Special case: no operator, or the unsupported symlink test, means -e
  $test_op = '-e' if (!defined $test_op || $test_op eq '-l');

  # Special case: the easiest way to support -f is to run a -e followed by a
  # -d (which should fail for files)
  if ($test_op eq '-f')
  {
    my $exists_status = _executeHDFSCommand(0, 'test -e', $filename_full_path);
    return 0 unless ($exists_status == 0);
    my $is_dir_status = _executeHDFSCommand(0, 'test -d', $filename_full_path);
    return ($is_dir_status > 0 ? 1 : 0);
  }

  # very limited test op support for HDFS
  my %is_supported = map { $_ => 1 } ('-d', '-e', '-z');
  unless ($is_supported{$test_op})
  {
    FileUtils::printError('Unknown or unsupported test mode: ' . $test_op);
    return 0;
  }

  # a zero status from the command means the test held
  my $status = _executeHDFSCommand(0, 'test ' . $test_op, $filename_full_path);
  return ($status == 0 ? 1 : 0);
}
## fileTest()
215
216
## @function filenameConcatenate()
#
# Joins path components beneath a protocol prefix.
#
# @param  protocol  the prefix placed in front of the path; it is attached
#                   with an extra '/', which may leave multiple slashes at
#                   the join
# @param  ...       the path components, joined with '/' with runs of slashes
#                   between and within them collapsed to one
# @return the combined path with a single trailing slash or backslash removed
#
sub filenameConcatenate
{
  my ($protocol, @components) = @_;
  my $joined = join('/', @components);
  # squeeze every run of slashes down to a single slash
  $joined =~ tr{/}{/}s;
  # the protocol goes on only after squeezing, so its own slashes survive
  my $filename = $protocol . '/' . $joined;
  # drop one trailing separator, if present
  $filename =~ s/[\\\/]$//;
  return $filename;
}
## filenameConcatenate()
232
233
## @function isFilenameAbsolute()
#
# Every file path used against HDFS must be absolute, so no inspection is
# done and any arguments are ignored.
#
# @return always 1
#
sub isFilenameAbsolute
{
  return 1;
}
## isFilenameAbsolute()
242
243
## @function isHDFS()
#
# Identifies this driver as one that operates on HDFS.
#
# @return always 1
#
sub isHDFS
{
  return 1;
}
## isHDFS()
251
252
## @function isSpecialDirectory()
#
# Recognises a path that consists of nothing but the protocol, host and port,
# i.e. 'HDFSShell://<host>:<port>' with no path component after it.
#
# The host may contain letters, digits, dots and hyphens, so names such as
# 'node-01.example.org' and dotted IP addresses are accepted as well as
# purely alphabetic host names.
#
# @param  path  the path to examine
# @return 1 if the path has that form, 0 otherwise (a plain scalar whatever
#         the calling context)
#
sub isSpecialDirectory
{
  my ($path) = @_;
  # the explicit 1/0 avoids returning an empty list when called in list
  # context (e.g. inside an argument list) and the match fails
  return ($path =~ /^HDFSShell:\/\/[a-zA-Z0-9.\-]+:\d+$/ ? 1 : 0);
}
## isSpecialDirectory()
261
262
## @function makeDirectory()
#
# Creates a directory on HDFS with 'hadoop fs -mkdir'.
#
# @param  dir  the HDFS path of the directory to create
# @return 1 if the command reported a zero status, 0 otherwise
#
sub makeDirectory
{
  my $dir = shift(@_);
  my $status = _executeHDFSCommand(0, 'mkdir', $dir);
  # a zero status means the mkdir succeeded
  return 1 if ($status == 0);
  return 0;
}
## makeDirectory()
273
274
## @function modificationTime()
#
# Placeholder: modification times are not supported by this driver. A warning
# is issued through FileUtils::printWarning() and 0 is returned for every
# path.
#
# No HDFS command is run here. Listing the path only to throw the parsed date
# and time away would cost a whole 'hadoop fs' invocation, and would let a
# listing that cannot be parsed abort a call whose answer is always 0.
#
# @param  path  the HDFS path (currently unused)
# @return always 0
#
sub modificationTime
{
  my ($path) = @_;
  &FileUtils::printWarning("modificationTime() not supported");
  return 0;
}
## modificationTime()
294
295
## @function openFileHandle()
#
# Opens a pipe to a 'hadoop fs' process so that a file on HDFS can be read or
# written through an ordinary file handle.
#
# @param  path    the HDFS path to open
# @param  mode    '>' or 'w' to write (any existing file is removed first,
#                 as the put command fails if the file already exists);
#                 '>>' or 'a' is rejected through FileUtils::printError();
#                 anything else opens the file for reading
# @param  fh_ref  reference to the scalar that will receive the handle
# @return always 1; failures are reported through FileUtils::printError()
#         with its second argument set to 1
#
sub openFileHandle
{
  my ($path, $mode, $fh_ref) = @_;
  if ($mode eq '>>' || $mode eq 'a')
  {
    &FileUtils::printError('Append (>>) mode not supported', 1);
  }
  elsif ($mode eq '>' || $mode eq 'w')
  {
    # the put command fails if the file already exists
    if (&fileTest($path, '-e'))
    {
      &removeFiles($path);
    }
    # three-argument open: the pipe direction is given explicitly instead of
    # being read from characters glued onto the command string
    my $put_command = &_generateHDFSCommand('put', '-', $path);
    open($$fh_ref, '|-', $put_command) or &FileUtils::printError('Failed to open pipe to HDFS (put) for writing: ' . $path, 1);
  }
  else
  {
    my $cat_command = &_generateHDFSCommand('cat', $path);
    open($$fh_ref, '-|', $cat_command) or &FileUtils::printError('Failed to open pipe to HDFS (cat) for reading: ' . $path, 1);
  }
  return 1;
}
## openFileHandle()
321
322
## @function readDirectory()
#
# Lists the entries of a HDFS directory using 'hadoop fs -ls'.
#
# @param  path  the HDFS directory to list
# @return a reference to an array holding the final path component of every
#         listed entry, or undef (after a warning through
#         FileUtils::printWarning()) when the output reports
#         'No such file or directory'
#
sub readDirectory
{
  my ($path) = @_;
  my @files;
  my $result = &_executeHDFSCommand(1, 'ls', $path);
  if ($result =~ /No such file or directory/)
  {
    &FileUtils::printWarning('Failed to read directory - no such file or directory: ' . $path);
    # explicit undef kept so scalar callers can test the result with defined
    return undef;
  }
  foreach my $line (split(/\r?\n/, $result))
  {
    # STDERR is captured along with STDOUT, so the output may carry log or
    # summary lines; only lines that open with a ten character permission
    # string are listing entries
    next unless ($line =~ /^[a-z\-][rwxsStT\-]{9}/);
    # the entry name is whatever follows the last slash
    if ($line =~ /\/([^\/]+)$/)
    {
      push(@files, $1);
    }
  }
  return \@files;
}
## readDirectory()
347
348
## @function removeFiles()
#
# Deletes a path from HDFS.
#
# @param  path           the HDFS path to delete
# @param  including_dir  when true the 'rmr' action is used, otherwise the
#                        plain 'rm' action
# @return 1 if the command reported a zero status, 0 otherwise
#
sub removeFiles
{
  my ($path, $including_dir) = @_;
  my $action = ($including_dir ? 'rmr' : 'rm');
  my $status = _executeHDFSCommand(0, $action, $path);
  # a zero status means the removal succeeded
  return ($status == 0 ? 1 : 0);
}
## removeFiles()
367
368
## @function removeFilesFiltered()
#
# Performs a depth first, recursive, removal of files and directories that
# match the given accept and reject patterns.
#
# @param  paths      a single path, or a reference to an array of paths
# @param  accept_re  optional pattern; when defined only files whose path
#                    matches it are removed
# @param  reject_re  optional pattern; files whose path matches it are never
#                    removed
# @return the number of files and directories removed. Directories are only
#         removed when neither pattern is defined.
#
sub removeFilesFiltered
{
  my ($paths, $accept_re, $reject_re) = @_;
  my @paths_array = (ref $paths eq "ARRAY") ? @$paths : ($paths);
  my $num_removed = 0;
  foreach my $path (@paths_array)
  {
    # remove trailing slashes
    $path =~ s/[\/\\]+$//;
    if (!&fileTest($path, '-e'))
    {
      &FileUtils::printError('path does not exist: ' . $path);
    }
    elsif (&fileTest($path, '-d'))
    {
      # readDirectory() hands back undef when the listing fails; treat that
      # as an empty directory rather than dereferencing undef
      my $listing = &readDirectory($path);
      my @files = (defined $listing ? @{$listing} : ());
      foreach my $file (@files)
      {
        my $child_path = $path . '/' . $file;
        $num_removed += &removeFilesFiltered($child_path, $accept_re, $reject_re);
      }
      if (!defined $accept_re && !defined $reject_re)
      {
        # remove this directory
        my $result = &removeFiles($path, 1);
        if ($result != 1)
        {
          &FileUtils::printError('could not remove directory: ' . $path);
        }
        else
        {
          $num_removed++;
        }
      }
    }
    else
    {
      # a rejected file is skipped outright, even if it would also be accepted
      if (defined $reject_re && ($path =~ m/$reject_re/))
      {
        next;
      }
      if ((!defined $accept_re) || ($path =~ m/$accept_re/))
      {
        # remove this file
        my $result = &removeFiles($path);
        if ($result != 1)
        {
          &FileUtils::printError('could not remove file: ' . $path);
        }
        else
        {
          $num_removed++;
        }
      }
    }
  }
  return $num_removed;
}
## removeFilesFiltered()
432
433
## @function removeFilesRecursive()
#
# Removes a path and everything beneath it by delegating to the more general
# removeFilesFiltered() with neither an accept nor a reject expression.
#
# @param  path  the HDFS path to remove
# @return whatever removeFilesFiltered() reports (its count of removals)
#
sub removeFilesRecursive
{
  my $path = shift(@_);
  my $num_removed = removeFilesFiltered($path, undef, undef);
  return $num_removed;
}
## removeFilesRecursive()
444
445
## @function supportsSymbolicLink()
#
# Reports that this driver cannot create symbolic links.
#
# @return always 0
#
sub supportsSymbolicLink
{
  return 0;
}
## supportsSymbolicLink()
453
454
## @function transferFile()
#
# Copies or moves a path from one HDFS location to another.
#
# @param  mode  'COPY' to use the 'cp' action; any other value uses 'mv'
# @param  src   the HDFS source path
# @param  dst   the HDFS destination path
# @return 1 if the command reported a zero status, 0 otherwise
#
sub transferFile
{
  my ($mode, $src, $dst) = @_;
  my $action = ($mode eq 'COPY' ? 'cp' : 'mv');
  my $status = _executeHDFSCommand(0, $action, $src, $dst);
  # a zero status means the transfer succeeded
  return ($status == 0 ? 1 : 0);
}
## transferFile()
473
474
## @function transferFileFromLocal()
#
# Uploads a local file to HDFS with 'hadoop fs -put', optionally deleting the
# local copy afterwards.
#
# @param  mode  'MOVE' to delete the local source once it has been uploaded;
#               any other value leaves the source in place
# @param  src   the local source file
# @param  dst   the HDFS destination; when it is an existing directory the
#               source's file name is appended to it
# @return 1 if the upload succeeded and, for 'MOVE', the source was deleted;
#         0 otherwise. A missing source or an existing destination is
#         reported through FileUtils::printError() before the upload is
#         attempted.
#
sub transferFileFromLocal
{
  my ($mode, $src, $dst) = @_;
  if (!-f $src)
  {
    &FileUtils::printError('Source file (during ' . $mode . ') doesn\'t exists: ' . $src);
  }
  if (&fileTest($dst, '-d'))
  {
    my ($filename) = $src =~ /([^\\\/]+)$/;
    $dst .= '/' . $filename;
  }
  if (&fileTest($dst, '-f'))
  {
    &FileUtils::printError('Destination file (during ' . $mode . ') already exists: ' . $dst);
  }
  my $result = &_executeHDFSCommand(0, 'put', $src, $dst);
  my $remove_result = 1;
  # Only delete the local source once the upload is known to have worked;
  # deleting it after a failed put would lose the only copy of the data
  if ($mode eq 'MOVE' && $result == 0)
  {
    unlink($src);
    # failed to delete somehow
    if (-f $src)
    {
      $remove_result = 0;
    }
  }
  return ($result == 0 && $remove_result ? 1 : 0);
}
## transferFileFromLocal()
507
508
## @function transferFileToLocal()
#
# Downloads a file from HDFS to the local file system with 'hadoop fs -get',
# optionally removing the HDFS copy afterwards.
#
# @param  mode  'MOVE' to remove the HDFS source once it has been downloaded;
#               any other value leaves the source in place
# @param  src   the HDFS source file
# @param  dst   the local destination; when it is an existing directory the
#               source's file name is appended to it
# @return 1 if the download succeeded and, for 'MOVE', the source was
#         removed; 0 otherwise. A missing source or an existing destination
#         is reported through FileUtils::printError() before the download is
#         attempted.
#
sub transferFileToLocal
{
  my ($mode, $src, $dst) = @_;
  if (!&fileTest($src, '-f'))
  {
    &FileUtils::printError('Source file (during ' . $mode . ') does not exist: ' . $src);
  }
  if (-d $dst)
  {
    my ($filename) = $src =~ /([^\\\/]+)$/;
    $dst .= '/' . $filename;
  }
  if (-e $dst)
  {
    &FileUtils::printError('Destination file (during ' . $mode . ') already exists: ' . $dst);
  }
  my $result = &_executeHDFSCommand(0, 'get', $src, $dst);
  my $remove_result = 1;
  # Only remove the HDFS source once the download is known to have worked;
  # removing it after a failed get would lose the only copy of the data
  if ($mode eq 'MOVE' && $result == 0)
  {
    $remove_result = &removeFiles($src);
  }
  return ($result == 0 && $remove_result ? 1 : 0);
}
## transferFileToLocal()
536
537
5381;
Note: See TracBrowser for help on using the repository browser.