source: gs2-extensions/parallel-building/trunk/src/perllib/FileUtils/HDFSShell.pm@ 27479

Last change on this file since 27479 was 27479, checked in by jmt12, 11 years ago

Remove time parsing as DateTime is a fricking nightmare to install without the CPAN module manager

File size: 12.0 KB
Line 
1###############################################################################
2#
3# HDFSShell.pm -- file functions acting upon a HDFS via the CLI hadoop
4# application
5#
6# A component of the Greenstone digital library software from the New Zealand
7# Digital Library Project at the University of Waikato, New Zealand.
8#
9# Copyright (C) 2013 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify it under
12# the terms of the GNU General Public License as published by the Free Software
13# Foundation; either version 2 of the License, or (at your option) any later
14# version.
15#
16# This program is distributed in the hope that it will be useful, but WITHOUT
17# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
19# details.
20#
21# You should have received a copy of the GNU General Public License along with
22# this program; if not, write to the Free Software Foundation, Inc., 675 Mass
23# Ave, Cambridge, MA 02139, USA.
24#
25###############################################################################
26
package FileUtils::HDFSShell;

# Pragma
use strict;
use warnings;

# Configuration
# Set to a true value to have _printDebug() write its messages (the generated
# hadoop commands and their captured output) to STDERR.
my $debug = 0;
34
35################################################################################
36######################### Private Functions & Variables ########################
37################################################################################
38
## @function _executeHDFSCommand()
#
# Runs one 'hadoop fs' command through the shell, capturing STDOUT and STDERR
# together, and hands back either its status or its output.
#
# @param  return_result 0 to receive the command's status, 1 to receive the
#                       captured output instead (for instance when parsing
#                       the result of -ls). An undefined value is treated as
#                       0; anything else is reported via printError().
# @param  ...           the action followed by its path arguments; passed
#                       unchanged to _generateHDFSCommand()
# @return when return_result is 0 the raw value of $? after the command ran
#         (0 on success); when return_result is 1 the captured output (an
#         empty string if the command could not be run at all)
#
sub _executeHDFSCommand
{
  my $return_result = shift(@_);
  # historically an undefined flag behaved like 0 - keep that, but without
  # comparing (and later concatenating) an undefined value
  if (!defined $return_result)
  {
    $return_result = 0;
  }
  if ($return_result != 0 && $return_result != 1)
  {
    FileUtils::printError('Unexpected value for return_result argument - should be 0 or 1: ' . $return_result, 1);
  }
  my $command = _generateHDFSCommand(@_);
  my $result = `$command 2>&1`;
  # grab the status immediately, before anything else can overwrite $?
  my $return_value = $?;
  # backticks yield undef when the shell/command could not be started
  if (!defined $result)
  {
    $result = '';
  }
  _printDebug(' -> _executeHDFSCommand() => |' . $result . '| [' . $return_value . ']');
  # sometimes we may want the actual resulting output returned, for
  # instance when parsing ls
  return ($return_result ? $result : $return_value);
}
## _executeHDFSCommand()
64
65
## @function _generateHDFSCommand()
#
# Builds the shell command line for a 'hadoop fs' action.
#
# Each path has a leading 'HDFSShell:' protocol rewritten to 'hdfs:' and is
# wrapped in double quotes. Characters that stay special inside double quotes
# in a POSIX shell (backslash, double quote, dollar, backtick) are backslash
# escaped so that a path can neither break out of its quotes nor trigger
# variable/command substitution. A lone '-' (standard stream) is passed
# through unquoted.
#
# @param  action the hadoop fs action without its leading dash; may carry its
#                own flags (e.g. 'test -e')
# @param  ...    zero or more path arguments
# @return the command string; every argument is followed by a single space
#
sub _generateHDFSCommand
{
  my ($action, @paths) = @_;
  my $arguments = '';
  foreach my $path (@paths)
  {
    # special case for standard streams
    if ($path eq '-')
    {
      $arguments .= '- ';
      next;
    }
    # Replace the prefix with one HDFS Shell understands - anchored so that
    # only a genuine leading protocol is rewritten
    (my $hdfs_path = $path) =~ s/^HDFSShell:/hdfs:/;
    # neutralise the characters the shell still interprets inside "..."
    $hdfs_path =~ s/([\\"\$`])/\\$1/g;
    $arguments .= '"' . $hdfs_path . '" ';
  }
  my $command = 'hadoop fs -' . $action . ' ' . $arguments;
  _printDebug(' -> _generateHDFSCommand("' . $action . '", ...) => |' . $command . '|');
  return $command;
}
## _generateHDFSCommand()
92
93
## @function _printDebug()
#
# Writes a '[DEBUG] ' prefixed line to STDERR, but only while the file level
# $debug switch is true.
#
# @param message the text to report
#
sub _printDebug
{
  my $message = $_[0];
  print STDERR '[DEBUG] ' . $message . "\n" if ($debug);
}
## _printDebug()
105
106
107################################################################################
108############################### Public Functions ###############################
109################################################################################
110
111
## @function canRead()
#
# Reports whether a path can be read. No real permission check is made: the
# answer is simply whether fileTest() considers the path a file.
#
# @param  path the HDFS path to check
# @return whatever fileTest($path, '-f') returns (1 or 0)
#
sub canRead
{
  my ($path) = @_;
  # On my Hadoop setups it appears everyone can read everything... pretty sure
  # this won't always be the case, but determining readability properly would
  # mean parsing the permissions, user and group and comparing them against
  # the current user. So existence as a file stands in for readability.
  my $is_file = fileTest($path, '-f');
  return $is_file;
}
## canRead()
125
126
## @function closeFileHandle()
#
# Closes a handle previously opened by openFileHandle().
#
# @param  fh_ref reference to the scalar holding the file handle
# @return always 1 - the outcome of close() itself is not inspected
#
sub closeFileHandle
{
  my ($fh_ref) = @_;
  close(${$fh_ref});
  return 1;
}
## closeFileHandle()
136
137
## @function fileSize()
#
# Looks up the size of a path via fileStats().
#
# @param  path the HDFS path to inspect
# @return the 'filesize' entry of the hash returned by fileStats()
#
sub fileSize
{
  my $path = shift(@_);
  return fileStats($path)->{'filesize'};
}
## fileSize()
147
148
## @function fileStats()
#
# Runs 'hadoop fs -ls' on a path and parses the final entry of the listing.
#
# The pattern accepts what the original parser rejected: the '-' that -ls
# prints in the replica column for directories, sticky bit (t/T) and ACL (+)
# markers in the permission string, and names containing spaces.
#
# NOTE(review): when path is a directory the listing presumably contains its
# children, so the values describe the last child rather than the directory
# itself - this matches the previous behaviour; confirm callers expect it.
#
# @param  path the HDFS path to inspect
# @return reference to a hash with the keys filename, replicas (a number, or
#         '-' for directories), filesize, modification_date (YYYY-MM-DD),
#         modification_time (HH:MM), permissions, userid and groupid. The
#         hash is empty if the output could not be parsed, in which case
#         printError() is called with a second argument of 1.
#
sub fileStats
{
  my ($path) = @_;
  my $stats = {};
  my $result = _executeHDFSCommand(1, 'ls', $path);
  if (!defined $result)
  {
    $result = '';
  }
  # - parse the results. Without /m the trailing $ pins the match to the last
  #   line of output, and as '.' never matches a newline the name cannot
  #   spill over from an earlier line
  if ($result =~ /([ds\-][rwxtT\-]+\+?)\s+(\d+|-)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d\d\d\d-\d\d-\d\d)\s+(\d\d:\d\d)\s+(\S(?:.*\S)?)\s*$/)
  {
    $stats->{'filename'} = $8;
    $stats->{'replicas'} = $2;
    $stats->{'filesize'} = $5;
    $stats->{'modification_date'} = $6;
    $stats->{'modification_time'} = $7;
    $stats->{'permissions'} = $1;
    $stats->{'userid'} = $3;
    $stats->{'groupid'} = $4;
  }
  else
  {
    FileUtils::printError('Failed to parse -ls result: ' . $result, 1);
  }
  return $stats;
}
## fileStats()
175
176
## @function fileTest()
#
# Emulates a small subset of Perl's file test operators using
# 'hadoop fs -test'.
#
# @param  filename_full_path the HDFS path to test
# @param  test_op            one of '-d', '-e', '-f', '-z'; '-l' or an
#                            undefined value are treated as '-e'
# @return 1 if the test holds, 0 otherwise (including for an unsupported
#         operator, which is also reported through printError())
#
sub fileTest
{
  my ($filename_full_path, $test_op) = @_;
  # Special case: HDFS doesn't support symlinking - swap for -e instead
  $test_op = '-e' if (!defined $test_op || $test_op eq '-l');

  # Special case: the easiest way to support -f is to run a -e followed by a
  # -d (which should fail for files)
  if ($test_op eq '-f')
  {
    my $exists_status = _executeHDFSCommand(0, 'test -e', $filename_full_path);
    return 0 unless ($exists_status == 0);
    my $directory_status = _executeHDFSCommand(0, 'test -d', $filename_full_path);
    # only a positive status counts as "not a directory"
    return ($directory_status > 0 ? 1 : 0);
  }

  # very limited test op support for HDFS
  my %supported_ops = map { $_ => 1 } ('-d', '-e', '-z');
  unless ($supported_ops{$test_op})
  {
    FileUtils::printError('Unknown or unsupported test mode: ' . $test_op);
    return 0;
  }

  my $status = _executeHDFSCommand(0, 'test ' . $test_op, $filename_full_path);
  return ($status == 0 ? 1 : 0);
}
## fileTest()
215
216
## @function filenameConcatenate()
#
# Joins path parts with '/' and prefixes the result with the protocol.
#
# @param  protocol the protocol prefix, placed in front of the path and
#                  separated from it by one extra '/'
# @param  ...      the path parts to join
# @return the combined path; runs of '/' within the joined parts are
#         collapsed and a single trailing '/' or '\' is removed
#
sub filenameConcatenate
{
  my ($protocol, @parts) = @_;
  my $joined_parts = join('/', @parts);
  # remove repeated slashes
  $joined_parts =~ s/[\/]+/\//g;
  # prepend protocol (which may cause multiple slashes)
  my $full_path = $protocol . '/' . $joined_parts;
  # strip any trailing slashes
  $full_path =~ s/[\\\/]$//;
  return $full_path;
}
## filenameConcatenate()
232
233
## @function isFilenameAbsolute()
#
# Reports whether a filename is absolute. Any arguments are ignored.
#
# @return always 1
#
sub isFilenameAbsolute
{
  # File paths against HDFS must be.
  my $always_absolute = 1;
  return $always_absolute;
}
## isFilenameAbsolute()
242
243
## @function makeDirectory()
#
# Creates a directory using 'hadoop fs -mkdir'.
#
# @param  dir the HDFS directory to create
# @return 1 if the command reported a status of 0, otherwise 0
#
sub makeDirectory
{
  my $dir = shift(@_);
  my $status = _executeHDFSCommand(0, 'mkdir', $dir);
  # a zero status from the mkdir command means success
  return 1 if ($status == 0);
  return 0;
}
## makeDirectory()
254
255
## @function modificationTime()
#
# Determines the modification time of a path from the date and time columns
# that fileStats() extracts from 'hadoop fs -ls'.
#
# The conversion uses Time::Local, which ships with Perl, so no CPAN install
# is needed. The listing only carries minutes, so seconds are always zero.
# NOTE(review): assumes the -ls timestamp is expressed in the local timezone
# of this process - TODO confirm against the Hadoop client in use.
#
# @param  path the HDFS path to inspect
# @return seconds since the epoch, or 0 (after a printWarning()) when no
#         valid timestamp could be derived
#
sub modificationTime
{
  my ($path) = @_;
  my $file_stats = fileStats($path);
  my $mod_date = $file_stats->{'modification_date'};
  my $mod_time = $file_stats->{'modification_time'};
  $mod_date = '' unless defined $mod_date;
  $mod_time = '' unless defined $mod_time;
  my ($mod_year, $mod_month, $mod_day) = $mod_date =~ /^(\d\d\d\d)-(\d\d)-(\d\d)$/;
  my ($mod_hour, $mod_minute) = $mod_time =~ /^(\d\d):(\d\d)$/;
  my $epoch;
  if (defined $mod_year && defined $mod_hour)
  {
    # loaded on demand so that the rest of the module has no dependency on it
    require Time::Local;
    # timelocal() dies on impossible dates; treat that as "unknown". Months
    # are zero based.
    $epoch = eval { Time::Local::timelocal(0, $mod_minute, $mod_hour, $mod_day, $mod_month - 1, $mod_year) };
  }
  if (!defined $epoch)
  {
    FileUtils::printWarning('modificationTime() could not determine a timestamp for: ' . (defined $path ? $path : ''));
    return 0;
  }
  return $epoch;
}
## modificationTime()
275
276
## @function openFileHandle()
#
# Opens a pipe to a 'hadoop fs' process so that a HDFS file can be read or
# written like an ordinary file handle.
#
# Writing pipes into 'hadoop fs -put - <path>' (any existing file is removed
# first, as put refuses to overwrite); everything else reads from
# 'hadoop fs -cat <path>'. Appending is not supported.
#
# The pipes are opened with the three argument form of open() so that the
# command string is never subject to two argument open()'s mode parsing.
#
# @param  path   the HDFS path to open
# @param  mode   '>' or 'w' to write, '>>' or 'a' to append (rejected); any
#                other value, including undef, means read
# @param  fh_ref reference to the scalar that receives the file handle
# @return 1 if the pipe was opened; 0 if the mode was rejected or open()
#         failed and printError() (called with a second argument of 1)
#         returned control
#
sub openFileHandle
{
  my ($path, $mode, $fh_ref) = @_;
  # an undefined mode has always meant "read"
  $mode = '<' unless defined $mode;
  if ($mode eq '>>' || $mode eq 'a')
  {
    FileUtils::printError('Append (>>) mode not supported', 1);
    return 0;
  }
  if ($mode eq '>' || $mode eq 'w')
  {
    # the put command fails if the file already exists
    if (fileTest($path, '-e'))
    {
      removeFiles($path);
    }
    my $put_command = _generateHDFSCommand('put', '-', $path);
    if (!open($$fh_ref, '|-', $put_command))
    {
      FileUtils::printError('Failed to open pipe to HDFS (put) for writing: ' . $path, 1);
      return 0;
    }
    return 1;
  }
  my $cat_command = _generateHDFSCommand('cat', $path);
  if (!open($$fh_ref, '-|', $cat_command))
  {
    FileUtils::printError('Failed to open pipe to HDFS (cat) for reading: ' . $path, 1);
    return 0;
  }
  return 1;
}
## openFileHandle()
302
303
## @function readDirectory()
#
# Lists the entries reported by 'hadoop fs -ls' for a path.
#
# The command's STDERR is captured along with its STDOUT, so only lines that
# begin with a permission string (e.g. 'drwxr-xr-x' or '-rw-r--r--') are
# treated as entries. This stops error and warning lines that happen to
# contain a '/' being returned as if they were files.
#
# @param  path the HDFS path to list
# @return reference to an array holding the final path component of every
#         listed entry (empty when nothing could be listed)
#
sub readDirectory
{
  my ($path) = @_;
  my @files;
  my $result = _executeHDFSCommand(1, 'ls', $path);
  if (!defined $result)
  {
    $result = '';
  }
  foreach my $line (split(/\r?\n/, $result))
  {
    # skip the 'Found N items' header and anything that is not a listing row
    next unless ($line =~ /^[ds\-][rwxtT\-]+\+?\s/);
    # the entry name is whatever follows the last slash on the line
    if ($line =~ /\/([^\/]+)$/)
    {
      push(@files, $1);
    }
  }
  return \@files;
}
## readDirectory()
323
324
## @function removeFiles()
#
# Removes a path from HDFS.
#
# @param  path          the HDFS path to remove
# @param  including_dir when true 'hadoop fs -rmr' is used, otherwise
#                       'hadoop fs -rm'
# @return 1 if the command reported a status of 0, otherwise 0
#
sub removeFiles
{
  my ($path, $including_dir) = @_;
  my $action = ($including_dir ? 'rmr' : 'rm');
  my $status = _executeHDFSCommand(0, $action, $path);
  # a zero status from the remove command means success
  return ($status == 0 ? 1 : 0);
}
## removeFiles()
343
344
## @function removeFilesFiltered()
#
# Depth first, recursive removal of the files (and possibly directories)
# below one or more paths.
#
# A file is removed unless it matches reject_re, and - when accept_re is
# given - only if it also matches accept_re. Directories are always descended
# into, but are themselves removed only when neither pattern is defined.
# Paths that do not exist and removals that fail are reported through
# printError() and otherwise skipped.
#
# @param  paths     a single path, or a reference to an array of paths
# @param  accept_re optional pattern a file path must match to be removed
# @param  reject_re optional pattern that protects matching file paths
# @return the number of files and directories removed
#
sub removeFilesFiltered
{
  my ($paths, $accept_re, $reject_re) = @_;
  my @targets = (ref $paths eq "ARRAY") ? @$paths : ($paths);
  my $remove_directories = (!defined $accept_re && !defined $reject_re);
  my $num_removed = 0;
  foreach my $target (@targets)
  {
    # remove trailing slashes
    $target =~ s/[\/\\]+$//;

    unless (fileTest($target, '-e'))
    {
      FileUtils::printError('path does not exist: ' . $target);
      next;
    }

    if (fileTest($target, '-d'))
    {
      # children first...
      foreach my $child (@{readDirectory($target)})
      {
        $num_removed += removeFilesFiltered($target . '/' . $child, $accept_re, $reject_re);
      }
      # ...then the directory itself, but only for an unfiltered removal
      if ($remove_directories)
      {
        if (removeFiles($target, 1) == 1)
        {
          $num_removed++;
        }
        else
        {
          FileUtils::printError('could not remove directory: ' . $target);
        }
      }
      next;
    }

    # a plain file: apply the reject filter, then the accept filter
    next if (defined $reject_re && ($target =~ m/$reject_re/));
    next if (defined $accept_re && !($target =~ m/$accept_re/));
    if (removeFiles($target) == 1)
    {
      $num_removed++;
    }
    else
    {
      FileUtils::printError('could not remove file: ' . $target);
    }
  }
  return $num_removed;
}
## removeFilesFiltered()
408
409
## @function removeFilesRecursive()
#
# Removes a path and everything below it by delegating to
# removeFilesFiltered() with neither an accept nor a reject pattern.
#
# @param  path the HDFS path to remove
# @return the count returned by removeFilesFiltered()
#
sub removeFilesRecursive
{
  my $path = shift(@_);
  my $num_removed = removeFilesFiltered($path, undef, undef);
  return $num_removed;
}
## removeFilesRecursive()
420
421
## @function supportsSymbolicLink()
#
# Reports whether this driver can create symbolic links.
#
# @return always 0
#
sub supportsSymbolicLink
{
  my $symlinks_supported = 0;
  return $symlinks_supported;
}
## supportsSymbolicLink()
429
430
## @function transferFile()
#
# Copies or moves a path within HDFS.
#
# @param  mode 'COPY' to use 'hadoop fs -cp'; any other value uses
#              'hadoop fs -mv'
# @param  src  the HDFS source path
# @param  dst  the HDFS destination path
# @return 1 if the command reported a status of 0, otherwise 0
#
sub transferFile
{
  my ($mode, $src, $dst) = @_;
  my $action = ($mode eq 'COPY') ? 'cp' : 'mv';
  my $status = _executeHDFSCommand(0, $action, $src, $dst);
  # a zero status from the cp/mv command means success
  return 1 if ($status == 0);
  return 0;
}
## transferFile()
449
450
## @function transferFileFromLocal()
#
# Copies or moves a file from the local file system into HDFS using
# 'hadoop fs -put'.
#
# In MOVE mode the local source is deleted only after the put has succeeded;
# a failed upload therefore never costs the caller their only copy.
#
# @param  mode 'MOVE' to delete the local source after a successful upload;
#              any other value leaves the source in place
# @param  src  the local source file
# @param  dst  the HDFS destination; if it is an existing directory the
#              source's file name is appended to it
# @return 1 if the put succeeded and (in MOVE mode) the source is gone,
#         otherwise 0
#
sub transferFileFromLocal
{
  my ($mode, $src, $dst) = @_;
  if (!-f $src)
  {
    FileUtils::printError('Source file (during ' . $mode . ') doesn\'t exists: ' . $src);
  }
  if (fileTest($dst, '-d'))
  {
    my ($filename) = $src =~ /([^\\\/]+)$/;
    $dst .= '/' . $filename;
  }
  if (fileTest($dst, '-f'))
  {
    FileUtils::printError('Destination file (during ' . $mode . ') already exists: ' . $dst);
  }
  my $result = _executeHDFSCommand(0, 'put', $src, $dst);
  my $remove_result = 1;
  # never delete the source unless the upload actually worked
  if ($result == 0 && $mode eq 'MOVE')
  {
    unlink($src);
    # failed to delete somehow
    if (-f $src)
    {
      $remove_result = 0;
    }
  }
  return ($result == 0 && $remove_result ? 1 : 0);
}
## transferFileFromLocal()
483
484
## @function transferFileToLocal()
#
# Copies or moves a file from HDFS to the local file system using
# 'hadoop fs -get'.
#
# In MOVE mode the HDFS source is removed only after the get has succeeded;
# a failed download therefore never destroys the original.
#
# @param  mode 'MOVE' to remove the HDFS source after a successful download;
#              any other value leaves the source in place
# @param  src  the HDFS source file
# @param  dst  the local destination; if it is an existing directory the
#              source's file name is appended to it
# @return 1 if the get succeeded and (in MOVE mode) removeFiles() reported
#         success, otherwise 0
#
sub transferFileToLocal
{
  my ($mode, $src, $dst) = @_;
  if (!fileTest($src, '-f'))
  {
    FileUtils::printError('Source file (during ' . $mode . ') does not exist: ' . $src);
  }
  if (-d $dst)
  {
    my ($filename) = $src =~ /([^\\\/]+)$/;
    $dst .= '/' . $filename;
  }
  if (-e $dst)
  {
    FileUtils::printError('Destination file (during ' . $mode . ') already exists: ' . $dst);
  }
  my $result = _executeHDFSCommand(0, 'get', $src, $dst);
  my $remove_result = 1;
  # never remove the source unless the download actually worked
  if ($result == 0 && $mode eq 'MOVE')
  {
    $remove_result = removeFiles($src);
  }
  return ($result == 0 && $remove_result ? 1 : 0);
}
## transferFileToLocal()
512
513
5141;
Note: See TracBrowser for help on using the repository browser.