source: gs2-extensions/parallel-building/trunk/src/perllib/FileUtils/HDFSShell.pm@ 27423

Last change on this file since 27423 was 27423, checked in by jmt12, 11 years ago

Adding canRead() and isAbsolute() functions

File size: 12.3 KB
Line 
1###############################################################################
2#
3# HDFSShell.pm -- file functions acting upon a HDFS via the CLI hadoop
4# application
5#
6# A component of the Greenstone digital library software from the New Zealand
7# Digital Library Project at the University of Waikato, New Zealand.
8#
9# Copyright (C) 2013 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify it under
12# the terms of the GNU General Public License as published by the Free Software
13# Foundation; either version 2 of the License, or (at your option) any later
14# version.
15#
16# This program is distributed in the hope that it will be useful, but WITHOUT
17# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
19# details.
20#
21# You should have received a copy of the GNU General Public License along with
22# this program; if not, write to the Free Software Foundation, Inc., 675 Mass
23# Ave, Cambridge, MA 02139, USA.
24#
25###############################################################################
26
27package FileUtils::HDFSShell;
28
29# Pragma
30use strict;
31
32# Modules
33use DateTime;
34
35# Configuration
# Debug switch: when set to a true value, _printDebug() writes its messages
# to STDERR prefixed with '[DEBUG] '; when false those messages are dropped.
36my $debug = 0;
37
38################################################################################
39######################### Private Functions & Variables ########################
40################################################################################
41
## @function _executeHDFSCommand()
#
# Runs one 'hadoop fs' action through the shell, with STDERR folded into
# STDOUT, and hands back either the command's status or the text it printed.
#
# @param  return_result 0 to get the status ($?) back, 1 to get the captured
#                       output back instead; anything else is reported
#                       through FileUtils::printError()
# @param  ...           the action followed by its arguments, passed on
#                       untouched to _generateHDFSCommand()
# @return the value of $? after the command ran, or the captured output,
#         depending on return_result
#
sub _executeHDFSCommand
{
  my ($return_result, @command_parts) = @_;
  unless ($return_result == 0 || $return_result == 1)
  {
    &FileUtils::printError('Unexpected value for return_result argument - should be 0 or 1: ' . $return_result, 1);
  }
  my $command = &_generateHDFSCommand(@command_parts);
  my $output = `$command 2>&1`;
  # grab the status straight away, before anything else can overwrite $?
  my $status = $?;
  &_printDebug(' -> util::executeHDFSCommand() => |' . $output . '| [' . $status . ']');
  # callers that need to parse the output (ls, for instance) ask for the text
  # rather than the status
  return ($return_result ? $output : $status);
}
## _executeHDFSCommand()
67
68
## @function _generateHDFSCommand()
#
# Builds the shell command line for a 'hadoop fs' action.
#
# Each argument has a leading 'HDFSShell:' scheme swapped for 'hdfs:' and is
# then wrapped in double quotes. Characters that the shell still interprets
# inside double quotes (backslash, double quote, dollar and backtick) are
# backslash-escaped so that a path containing them reaches hadoop verbatim
# instead of being expanded by, or breaking out of, the command line.
#
# @param  action the hadoop fs action without its leading dash; it may carry
#                its own flags (e.g. 'test -e') and is inserted unquoted
# @param  ...    the paths the action operates on; a lone '-' stands for a
#                standard stream and is passed through unquoted
# @return the command as a single string (each argument is followed by a
#         space, so the string ends with one)
#
sub _generateHDFSCommand
{
  my ($action, @args) = @_;
  my $arguments = '';
  foreach my $path (@args)
  {
    # Replace the prefix with one HDFS Shell understands (anchored, so only a
    # genuine leading scheme is rewritten)
    $path =~ s/^HDFSShell:/hdfs:/;
    # special case for standard streams
    if ($path eq '-')
    {
      $arguments .= '- ';
      next;
    }
    # neutralise the characters that stay special between double quotes
    $path =~ s/([\$\\"`])/\\$1/g;
    $arguments .= '"' . $path . '" ';
  }
  my $command = 'hadoop fs -' . $action . ' ' . $arguments;
  &_printDebug(' -> _generateHDFSCommand("' . $action . '", ...) => |' . $command . '|');
  return $command;
}
## _generateHDFSCommand()
95
96
## @function _printDebug()
#
# Writes a diagnostic line to STDERR, but only while the file-level $debug
# flag is true.
#
# @param message the text to print; '[DEBUG] ' is put in front of it and a
#                newline after it
#
sub _printDebug
{
  my $message = shift(@_);
  return unless ($debug);
  print STDERR '[DEBUG] ' . $message . "\n";
}
## _printDebug()
108
109
110################################################################################
111############################### Public Functions ###############################
112################################################################################
113
114
## @function canRead()
#
# Reports whether a path can be read. No permission check is made: working
# out readability would mean parsing the permissions, user and group of the
# entry and comparing them with the current user, so this simply answers
# whether the path exists as a file.
#
# @param  path the HDFS path to check
# @return whatever fileTest() returns for its '-f' test on the path
#
sub canRead
{
  my ($path) = @_;
  my $is_file = &fileTest($path, '-f');
  return $is_file;
}
## canRead()
128
129
## @function closeFileHandle()
#
# Closes a handle previously opened by openFileHandle().
#
# The handles opened by this module are pipes to 'hadoop fs' commands, and
# for a pipe close() is where a failing command is reported. The result of
# close() is therefore passed back to the caller rather than being thrown
# away.
#
# @param  fh_ref reference to the scalar holding the file handle
# @return 1 if close() succeeded, 0 if it reported a failure
#
sub closeFileHandle
{
  my ($fh_ref) = @_;
  return (close($$fh_ref) ? 1 : 0);
}
## closeFileHandle()
139
140
## @function fileSize()
#
# Looks up the size of an entry in HDFS.
#
# @param  path the HDFS path to inspect
# @return the 'filesize' field of the hash produced by fileStats()
#
sub fileSize
{
  my $path = shift(@_);
  my $stats_ref = &fileStats($path);
  return ${$stats_ref}{'filesize'};
}
## fileSize()
150
151
## @function fileStats()
#
# Runs 'hadoop fs -ls' on a path and breaks the listing line found at the end
# of the output into its individual columns.
#
# @param  path the HDFS path to list
# @return reference to a hash with the keys permissions, replicas, userid,
#         groupid, filesize, modification_date, modification_time and
#         filename; the hash is empty if the output could not be parsed (a
#         failure that is also reported through FileUtils::printError())
#
sub fileStats
{
  my $path = shift(@_);
  my %stats;
  my $listing = &_executeHDFSCommand(1, 'ls', $path);
  # - pull all eight columns out of the listing in one go; the list is empty
  #   when the pattern does not match
  my @columns = ($listing =~ /([ds\-][rwx\-]+)\s+(\d+)\s+([^\s]+)\s+([^\s]+)\s+(\d+)\s+(\d\d\d\d-\d\d-\d\d)\s+(\d\d:\d\d)\s+([^\s]+)$/);
  if (!@columns)
  {
    &FileUtils::printError('Failed to parse -ls result: ' . $listing, 1);
    return \%stats;
  }
  # the keys are listed in the same order as the columns appear in the output
  @stats{('permissions', 'replicas', 'userid', 'groupid', 'filesize', 'modification_date', 'modification_time', 'filename')} = @columns;
  return \%stats;
}
## fileStats()
178
179
## @function fileTest()
#
# Emulates a small subset of Perl's file test operators against HDFS by way
# of 'hadoop fs -test'.
#
# @param  filename_full_path the HDFS path to test
# @param  test_op            one of '-d', '-e', '-f', '-l' or '-z'; when
#                            missing, or when '-l' (HDFS has no symlinks),
#                            '-e' is used instead
# @return 1 if the test passed, 0 if it failed or the operator is unsupported
#
sub fileTest
{
  my ($filename_full_path, $test_op) = @_;
  # HDFS doesn't support symlinking, so '-l' (and no operator at all) falls
  # back to a plain existence test
  $test_op = '-e' if (!defined $test_op || $test_op eq '-l');

  # '-f' is emulated: the path has to exist (-e succeeds) and must not be a
  # directory (-d fails with a status above zero)
  if ($test_op eq '-f')
  {
    my $exists_status = &_executeHDFSCommand(0, 'test -e', $filename_full_path);
    return 0 if ($exists_status != 0);
    my $directory_status = &_executeHDFSCommand(0, 'test -d', $filename_full_path);
    return ($directory_status > 0 ? 1 : 0);
  }

  # very limited test op support for HDFS
  my %supported_ops = ('-d' => 1, '-e' => 1, '-z' => 1);
  if (!$supported_ops{$test_op})
  {
    &FileUtils::printError('Unknown or unsupported test mode: ' . $test_op);
    return 0;
  }

  # the command's status is zero when the test passes
  my $status = &_executeHDFSCommand(0, 'test ' . $test_op, $filename_full_path);
  return ($status == 0 ? 1 : 0);
}
## fileTest()
218
219
## @function filenameConcatenate()
#
# Joins path components onto a leading protocol part using forward slashes.
#
# @param  protocol the first argument; it is left exactly as given
# @param  ...      the remaining components; runs of slashes within and
#                  between them are squeezed down to a single slash
# @return protocol, a slash, then the joined components, with one trailing
#         slash or backslash removed if present
#
sub filenameConcatenate
{
  my ($protocol, @components) = @_;
  my $joined = join('/', @components);
  # squeeze repeated slashes in the component part only
  $joined =~ s{/+}{/}g;
  # put the protocol back in front (this may produce a double slash at the
  # seam, which is left alone)
  my $full_path = join('/', $protocol, $joined);
  # drop a single trailing separator
  $full_path =~ s{[\\/]$}{};
  return $full_path;
}
## filenameConcatenate()
235
236
## @function isFilenameAbsolute()
#
# Reports whether a path is absolute. Any arguments are ignored: paths used
# against HDFS must be absolute, so the answer is always yes.
#
# @return 1, always
#
sub isFilenameAbsolute
{
  # nothing to inspect - every HDFS path is taken to be absolute
  return 1;
}
## isFilenameAbsolute()
245
246
## @function makeDirectory()
#
# Creates a directory in HDFS with 'hadoop fs -mkdir'.
#
# @param  dir the HDFS path of the directory to create
# @return 1 if the command finished with a zero status, 0 otherwise
#
sub makeDirectory
{
  my $dir = shift(@_);
  my $status = &_executeHDFSCommand(0, 'mkdir', $dir);
  # a zero status means the mkdir succeeded
  return 1 if ($status == 0);
  return 0;
}
## makeDirectory()
257
258
## @function modificationTime()
#
# Returns the modification time of an HDFS entry as seconds since the epoch.
#
# The date (YYYY-MM-DD) and time (HH:MM) come from fileStats() and are
# interpreted in the 'local' time zone. Both fields are checked before they
# are used: previously a missing or malformed field left the capture
# variables holding whatever an earlier match had set, and those stale
# values were handed to DateTime.
#
# @param  path the HDFS path to inspect
# @return the epoch time, or nothing if the date/time could not be parsed
#         (that failure is also reported through FileUtils::printError())
#
sub modificationTime
{
  my ($path) = @_;
  my $file_stats = &fileStats($path);
  my $mod_date = $file_stats->{'modification_date'};
  my $mod_time = $file_stats->{'modification_time'};
  my ($mod_year, $mod_month, $mod_day);
  my ($mod_hour, $mod_minute);
  if (defined $mod_date)
  {
    ($mod_year, $mod_month, $mod_day) = $mod_date =~ /(\d\d\d\d)-(\d\d)-(\d\d)/;
  }
  if (defined $mod_time)
  {
    ($mod_hour, $mod_minute) = $mod_time =~ /(\d\d):(\d\d)/;
  }
  # a failed match leaves its variables undefined, so testing the last field
  # of each group is enough
  if (!defined $mod_day || !defined $mod_minute)
  {
    &FileUtils::printError('Failed to parse modification date and time for: ' . $path, 1);
    return;
  }
  my $mod_datetime = DateTime->new(year      => $mod_year,
                                   month     => $mod_month,
                                   day       => $mod_day,
                                   hour      => $mod_hour,
                                   minute    => $mod_minute,
                                   time_zone => 'local');
  return $mod_datetime->epoch();
}
## modificationTime()
283
284
## @function openFileHandle()
#
# Opens a pipe to a 'hadoop fs' command so that an HDFS file can be written
# or read through an ordinary Perl file handle.
#
# The pipes are opened with the three-argument form of open() so the
# direction is given explicitly instead of being encoded in the command
# string. The return value now reflects whether a handle was actually
# opened, in case FileUtils::printError() returns instead of aborting.
#
# @param  path   the HDFS path to open
# @param  mode   '>' or 'w' to write (any existing file is removed first,
#                because put fails if the file already exists); '>>' or 'a'
#                (append) is not supported; anything else, including no mode
#                at all, opens the file for reading
# @param  fh_ref reference to the scalar that receives the file handle
# @return 1 if the pipe was opened, 0 otherwise
#
sub openFileHandle
{
  my ($path, $mode, $fh_ref) = @_;
  # no mode at all has always meant "read"
  if (!defined $mode)
  {
    $mode = '<';
  }
  my $opened = 0;
  if ($mode eq '>>' || $mode eq 'a')
  {
    &FileUtils::printError('Append (>>) mode not supported', 1);
  }
  elsif ($mode eq '>' || $mode eq 'w')
  {
    # the put command fails if the file already exists
    if (&fileTest($path, '-e'))
    {
      &removeFiles($path);
    }
    # '-' makes put read the file content from the pipe
    my $command = &_generateHDFSCommand('put', '-', $path);
    $opened = open($$fh_ref, '|-', $command);
    if (!$opened)
    {
      &FileUtils::printError('Failed to open pipe to HDFS (put) for writing: ' . $path, 1);
    }
  }
  else
  {
    my $command = &_generateHDFSCommand('cat', $path);
    $opened = open($$fh_ref, '-|', $command);
    if (!$opened)
    {
      &FileUtils::printError('Failed to open pipe to HDFS (cat) for reading: ' . $path, 1);
    }
  }
  return ($opened ? 1 : 0);
}
## openFileHandle()
310
311
## @function readDirectory()
#
# Lists the names of the entries reported by 'hadoop fs -ls' for a path.
#
# The command's STDERR is merged into the captured output, so the result can
# contain warnings and error messages as well as listing rows. Only lines
# that begin with a permissions string (the first column of a listing row)
# are treated as entries; previously any line with a slash in it, such as a
# diagnostic quoting a file path, was turned into a bogus file name.
#
# @param  path the HDFS directory to list
# @return reference to an array holding the final path component of each
#         listed entry
#
sub readDirectory
{
  my ($path) = @_;
  my @files;
  my $result = &_executeHDFSCommand(1, 'ls', $path);
  my @lines = split(/\r?\n/, $result);
  foreach my $line (@lines)
  {
    # skip the "Found N items" header and anything else that is not a
    # listing row: a row starts with the entry type followed by the
    # permission flags
    next unless ($line =~ /^[ds\-][rwx\-]{8}\S*\s/);
    # the name is whatever follows the last slash on the line
    if ($line =~ /\/([^\/]+)$/)
    {
      push(@files, $1);
    }
  }
  return \@files;
}
## readDirectory()
331
332
## @function removeFiles()
#
# Removes an entry from HDFS.
#
# @param  path          the HDFS path to remove
# @param  including_dir when true 'hadoop fs -rmr' is used, otherwise
#                       'hadoop fs -rm'
# @return 1 if the command finished with a zero status, 0 otherwise
#
sub removeFiles
{
  my ($path, $including_dir) = @_;
  # an undefined flag counts as false, i.e. a plain rm
  my $action = ($including_dir ? 'rmr' : 'rm');
  my $status = &_executeHDFSCommand(0, $action, $path);
  # a zero status means the removal succeeded
  return ($status == 0 ? 1 : 0);
}
## removeFiles()
351
352
## @function removeFilesFiltered()
#
# Depth-first, recursive removal of files and directories, optionally
# restricted by accept and reject patterns.
#
# @param  paths     a single path or a reference to an array of paths
# @param  accept_re when defined, only files whose path matches are removed
# @param  reject_re when defined, files whose path matches are never removed
# @return the number of files and directories removed
#
# Directories are always descended into, but a directory itself is only
# removed when neither pattern is defined.
#
sub removeFilesFiltered
{
  my ($paths, $accept_re, $reject_re) = @_;
  my @targets = (ref $paths eq "ARRAY") ? @$paths : ($paths);
  my $removed_count = 0;
  foreach my $target (@targets)
  {
    # remove trailing slashes
    $target =~ s/[\/\\]+$//;

    # nothing to do for a path that isn't there
    if (!&fileTest($target, '-e'))
    {
      &FileUtils::printError('path does not exist: ' . $target);
      next;
    }

    # directories: deal with the children first, then maybe the directory
    if (&fileTest($target, '-d'))
    {
      foreach my $entry (@{&readDirectory($target)})
      {
        $removed_count += &removeFilesFiltered($target . '/' . $entry, $accept_re, $reject_re);
      }
      # a filtered run never removes the directory itself
      next if (defined $accept_re || defined $reject_re);
      if (&removeFiles($target, 1) == 1)
      {
        $removed_count++;
      }
      else
      {
        &FileUtils::printError('could not remove directory: ' . $target);
      }
      next;
    }

    # plain files: the reject pattern wins over the accept pattern
    next if (defined $reject_re && ($target =~ m/$reject_re/));
    next unless ((!defined $accept_re) || ($target =~ m/$accept_re/));
    if (&removeFiles($target) == 1)
    {
      $removed_count++;
    }
    else
    {
      &FileUtils::printError('could not remove file: ' . $target);
    }
  }
  return $removed_count;
}
## removeFilesFiltered()
416
417
## @function removeFilesRecursive()
#
# Removes a path and everything beneath it. This is removeFilesFiltered()
# called with neither an accept nor a reject pattern.
#
# @param  path a single path or a reference to an array of paths
# @return the number of files and directories removed, as counted by
#         removeFilesFiltered()
#
sub removeFilesRecursive
{
  my $path = shift(@_);
  my ($no_accept_re, $no_reject_re) = (undef, undef);
  return &removeFilesFiltered($path, $no_accept_re, $no_reject_re);
}
## removeFilesRecursive()
428
429
## @function supportsSymbolicLink()
#
# Reports whether symbolic links are supported. For this module the answer
# is always no (fileTest() likewise swaps the '-l' test for '-e').
#
# @return 0, always
#
sub supportsSymbolicLink
{
  # no symbolic link support here
  return 0;
}
## supportsSymbolicLink()
437
438
## @function transferFile()
#
# Copies or moves an entry from one HDFS location to another.
#
# @param  mode 'COPY' runs 'hadoop fs -cp'; any other value runs
#              'hadoop fs -mv'
# @param  src  the HDFS source path
# @param  dst  the HDFS destination path
# @return 1 if the command finished with a zero status, 0 otherwise
#
sub transferFile
{
  my ($mode, $src, $dst) = @_;
  # everything that isn't an explicit COPY is treated as a move
  my $action = ($mode eq 'COPY') ? 'cp' : 'mv';
  my $status = &_executeHDFSCommand(0, $action, $src, $dst);
  # a zero status means the transfer succeeded
  return 1 if ($status == 0);
  return 0;
}
## transferFile()
457
458
## @function transferFileFromLocal()
#
# Uploads a local file into HDFS with 'hadoop fs -put', deleting the local
# copy afterwards when moving.
#
# The local source is only deleted once the upload has succeeded. Previously
# a MOVE unlinked the source even when the put had failed (for instance
# because the destination already existed), losing the only copy.
#
# @param  mode 'MOVE' to delete the local source after a successful upload;
#              any other value leaves the source in place
# @param  src  path of the local source file
# @param  dst  HDFS destination; if it is an existing directory the source's
#              file name is appended to it
# @return 1 if the upload (and, for MOVE, the local delete) succeeded,
#         0 otherwise
#
sub transferFileFromLocal
{
  my ($mode, $src, $dst) = @_;
  if (!-f $src)
  {
    &FileUtils::printError('Source file (during ' . $mode . ') doesn\'t exists: ' . $src);
  }
  # copying into a directory keeps the source's file name
  if (&fileTest($dst, '-d'))
  {
    my ($filename) = $src =~ /([^\\\/]+)$/;
    $dst .= '/' . $filename;
  }
  if (&fileTest($dst, '-f'))
  {
    &FileUtils::printError('Destination file (during ' . $mode . ') already exists: ' . $dst);
  }
  my $result = &_executeHDFSCommand(0, 'put', $src, $dst);
  # never touch the source unless the upload actually worked
  if ($result != 0)
  {
    return 0;
  }
  my $remove_result = 1;
  if ($mode eq 'MOVE')
  {
    unlink($src);
    # failed to delete somehow
    if (-f $src)
    {
      $remove_result = 0;
    }
  }
  return ($remove_result ? 1 : 0);
}
## transferFileFromLocal()
491
492
## @function transferFileToLocal()
#
# Downloads a file from HDFS to the local file system with 'hadoop fs -get',
# removing the HDFS copy afterwards when moving.
#
# The HDFS source is only removed once the download has succeeded.
# Previously a MOVE removed the source even when the get had failed, losing
# the only copy.
#
# @param  mode 'MOVE' to remove the HDFS source after a successful download;
#              any other value leaves the source in place
# @param  src  HDFS path of the source file
# @param  dst  local destination; if it is an existing directory the
#              source's file name is appended to it
# @return 1 if the download (and, for MOVE, the removal of the source)
#         succeeded, 0 otherwise
#
sub transferFileToLocal
{
  my ($mode, $src, $dst) = @_;
  if (!&fileTest($src, '-f'))
  {
    &FileUtils::printError('Source file (during ' . $mode . ') does not exist: ' . $src);
  }
  # copying into a directory keeps the source's file name
  if (-d $dst)
  {
    my ($filename) = $src =~ /([^\\\/]+)$/;
    $dst .= '/' . $filename;
  }
  if (-e $dst)
  {
    &FileUtils::printError('Destination file (during ' . $mode . ') already exists: ' . $dst);
  }
  my $result = &_executeHDFSCommand(0, 'get', $src, $dst);
  # never touch the source unless the download actually worked
  if ($result != 0)
  {
    return 0;
  }
  my $remove_result = 1;
  if ($mode eq 'MOVE')
  {
    $remove_result = &removeFiles($src);
  }
  return ($remove_result ? 1 : 0);
}
## transferFileToLocal()
520
521
5221;
Note: See TracBrowser for help on using the repository browser.