[27386] | 1 | ################################################################################
|
---|
| 2 | #
|
---|
| 3 | # HDThriftFS.pm -- file functions acting upon a HDFS via thrift
|
---|
| 4 | #
|
---|
| 5 | # A component of the Greenstone digital library software from the New Zealand
|
---|
| 6 | # Digital Library Project at the University of Waikato, New Zealand.
|
---|
| 7 | #
|
---|
| 8 | # Copyright (C) 2013 New Zealand Digital Library Project
|
---|
| 9 | #
|
---|
| 10 | # This program is free software; you can redistribute it and/or modify it under
|
---|
| 11 | # the terms of the GNU General Public License as published by the Free Software
|
---|
| 12 | # Foundation; either version 2 of the License, or (at your option) any later
|
---|
| 13 | # version.
|
---|
| 14 | #
|
---|
| 15 | # This program is distributed in the hope that it will be useful, but WITHOUT
|
---|
| 16 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
---|
| 17 | # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
|
---|
| 18 | # details.
|
---|
| 19 | #
|
---|
| 20 | # You should have received a copy of the GNU General Public License along with
|
---|
| 21 | # this program; if not, write to the Free Software Foundation, Inc., 675 Mass
|
---|
| 22 | # Ave, Cambridge, MA 02139, USA.
|
---|
| 23 | #
|
---|
| 24 | ###############################################################################
|
---|
| 25 |
|
---|
| 26 | # Thrift acts as client-server 'relay' between the Perl code and the HDFS. It
|
---|
| 27 | # allows for persistent connections and so is significantly faster than
|
---|
| 28 | # repeatedly starting Hadoop's Java application over and over. In order to
|
---|
| 29 | # connect to the Thrift server this code needs to know the host and port the
|
---|
| 30 | # server may be found on - information currently hard-coded near the top of the
|
---|
| 31 | # script. There are also a number of Perl module API 'bindings' generated by
|
---|
| 32 | # the Thrift compilation process... currently located within the packages of
|
---|
| 33 | # the Parallel Processing extension. Note that I make use of some tie() magic
|
---|
| 34 | # so as to provide calling code with 'file handle'-like objects to interact
|
---|
| 35 | # with (print, readline etc), so that is pretty cool.
|
---|
| 36 |
|
---|
| 37 | package FileUtils::HDThriftFS;
|
---|
| 38 |
|
---|
| 39 | # Pragma
|
---|
| 40 | use strict;
|
---|
| 41 |
|
---|
| 42 | # Setup Environment
|
---|
BEGIN
{
  # The extension root, the operating system name and the collection directory
  # are all needed to locate libraries and configuration - refuse to load
  # without them.
  foreach my $required ('GEXTPARALLELBUILDING', 'GSDLOS', 'GSDLCOLLECTDIR')
  {
    die $required . " not set\n" unless defined $ENV{$required};
  }
  # We need the Perl version before continuing. If the environment does not
  # already provide it, ask the helper script shipped with the extension.
  if (!defined $ENV{'PERL_VERSION'})
  {
    my $version = `perl -S $ENV{'GEXTPARALLELBUILDING'}/bin/script/perl-version.pl`;
    if (defined $version)
    {
      # - the value becomes a path component below, so any surrounding
      #   whitespace (such as a trailing newline) must go
      $version =~ s/^\s+|\s+$//g;
      # - an empty answer means the helper failed; leave the variable unset
      #   so the check below reports the problem rather than silently adding
      #   a useless directory to @INC
      if ($version ne '')
      {
        $ENV{'PERL_VERSION'} = $version;
      }
    }
  }
  die "PERL_VERSION not set\n" unless defined $ENV{'PERL_VERSION'};
  # Bit::Vector and Thrift modules
  unshift (@INC, $ENV{'GEXTPARALLELBUILDING'} . '/' . $ENV{'GSDLOS'} . '/lib/perl/' . $ENV{'PERL_VERSION'});
  # ThriftFS Perl API
  unshift (@INC, $ENV{'GEXTPARALLELBUILDING'} . '/packages/ThriftFS-0.9.0/gen-perl');
}
|
---|
| 59 |
|
---|
| 60 | # Modules - Core
|
---|
| 61 | use Devel::Peek;
|
---|
| 62 | use MIME::Base64;
|
---|
| 63 | use POSIX qw(floor);
|
---|
| 64 | use Symbol;
|
---|
| 65 | use Thrift::Socket;
|
---|
| 66 | use Thrift::BufferedTransport;
|
---|
| 67 | use Thrift::BinaryProtocol;
|
---|
| 68 | # Modules - Thrift
|
---|
| 69 | use HadoopFS::FileSystem;
|
---|
| 70 | # Modules - Greenstone
|
---|
| 71 | use FileUtils::HDThriftFS::ThriftFH;
|
---|
| 72 | use MIME::Base91;
|
---|
| 73 |
|
---|
| 74 | # Configuration
|
---|
# Thrift server location. The host defaults to this machine's own name (the
# short name is tried first, then the alternatives) and only falls back to
# 'localhost' when nothing else yields a value. Both values may later be
# overridden from the collection's etc/thrift.conf by _establishClient().
my $host = `hostname -s` || `hostname -a` || `hostname` || $ENV{'HOSTNAME'} || 'localhost';
chomp($host);
my $port = 58660;

# Debugging switches: general call tracing (see _printDebug()) and dumping of
# the raw/encoded data moved by the transfer functions.
my $debug = 0;
my $debug_encoding = 0;

# Number of bytes requested per read/write during transfers.
# Testing shows 128k is pretty optimal. Other candidate sizes previously listed
# here ranged from 4k to 8M; the 4M and 8M settings cause "OUT OF MEMORY"
# errors on Medusa.
my $buffer_length = 128 * 1024; # 128k blocks

# Globals - the shared transport and client, created on demand
my $transport;
my $thrift_client;
|
---|
| 98 |
|
---|
| 99 |
|
---|
## @function END()
#
# Interpreter shutdown hook. If a transport object was ever created (the
# shared $transport variable is defined) it is closed; otherwise nothing
# happens.
#
END
{
  $transport->close() if (defined $transport);
}
|
---|
| 111 | ## END()
|
---|
| 112 |
|
---|
| 113 |
|
---|
## @function _establishClient()
#
# Create the shared Thrift client the first time it is needed; later calls
# return immediately because $thrift_client is already defined.
#
# The host and port default to the values configured at the top of this file,
# but are overridden by the first "host:port" entry found at the start of
# <GSDLCOLLECTDIR>/etc/thrift.conf when that file exists. An empty host or
# 'localhost' in the file leaves the default host untouched (only the port is
# taken).
#
# Takes no arguments; the return value is not meaningful.
#
sub _establishClient
{
  # Already connected (or at least already attempted) - nothing to do
  return if (defined $thrift_client);

  # Look for a configuration file to override the default host and port
  # settings
  my $conf_file_path = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'etc', 'thrift.conf');
  if (&FileUtils::fileExists($conf_file_path))
  {
    print " * Found Thrift configuration file:\n";
    my $conf_raw = &FileUtils::fileGetContents($conf_file_path);
    if (defined $conf_raw && $conf_raw =~ /^([^:]*):(\d+)/)
    {
      my $new_host = $1;
      $port = $2;
      if ($new_host ne '' && $new_host ne 'localhost')
      {
        $host = $new_host;
        print " - Host: " . $host . "\n";
      }
      print " - Port: " . $port . "\n";
    }
  }

  print " * Creating Thrift client connected to: $host:$port\n";
  my $socket = Thrift::Socket->new($host, $port);
  $socket->setSendTimeout(10000);
  $socket->setRecvTimeout(20000);

  $transport = Thrift::BufferedTransport->new($socket);
  my $protocol = Thrift::BinaryProtocol->new($transport);
  $thrift_client = HadoopFS::FileSystemClient->new($protocol);

  eval { $transport->open(); };
  if ($@)
  {
    # Copy $@ straight away - it is a global that any later eval may clobber
    my $error = $@;
    # The error may be an exception object carrying a 'message' field or just
    # a plain string from die(); dereferencing a plain string as a hash would
    # itself die under 'use strict', hiding the real problem.
    my $message = (UNIVERSAL::isa($error, 'HASH') && defined $error->{message}) ? $error->{message} : "$error";
    # NOTE(review): the second argument presumably marks the error as fatal -
    # confirm against FileUtils::printError()
    &FileUtils::printError('Unable to connect: ' . $message, 1);
  }
}
|
---|
| 156 | ## _establishClient()
|
---|
| 157 |
|
---|
| 158 |
|
---|
## @function _generateHDFSPath()
#
# Turn a 'HDThriftFS://...' URI into a HadoopFS::Pathname object.
#
# @param  $path either a HadoopFS::Pathname (returned untouched) or a string
# @return the HadoopFS::Pathname for the path with the protocol and whatever
#         follows it up to the next slash (host and port information)
#         stripped; if the string does not contain 'HDThriftFS://' an error
#         is reported through FileUtils::printError() and the original
#         string is returned as given
#
sub _generateHDFSPath
{
  my ($path) = @_;
  # Already converted by an earlier call - hand it straight back
  return $path if (ref($path) eq 'HadoopFS::Pathname');

  if ($path =~ /HDThriftFS:\/\//)
  {
    # Remove protocol and any host and port information
    $path =~ s/HDThriftFS:\/\/[^\/]*//;
    return HadoopFS::Pathname->new( { pathname => $path } );
  }

  &FileUtils::printError('Not a valid thrift URI: ' . $path);
  return $path;
}
|
---|
| 179 | ## _generateHDFSPath()
|
---|
| 180 |
|
---|
| 181 |
|
---|
## @function _printDebug()
#
# When the file-level $debug switch is on, write a '[DEBUG]' line to STDERR
# made up of the name of the calling function and the given message. Does
# nothing when debugging is off.
#
# @param $msg the text to report
#
sub _printDebug
{
  my ($msg) = @_;
  if ($debug)
  {
    # element 3 of caller(1) is the name of the function that called us
    my $function = (caller(1))[3];
    print STDERR '[DEBUG] ' . $function . ': ' . $msg . "\n";
  }
}
|
---|
| 193 | ## _printDebug()
|
---|
| 194 |
|
---|
[27386] | 195 | ################################################################################
|
---|
| 196 | ################################################################################
|
---|
| 197 | ################################################################################
|
---|
| 198 |
|
---|
| 199 |
|
---|
## @function canRead()
#
# Convenience wrapper asking checkPermission() about 'r' access.
#
# @param  $path the path to check
# @param  ...   any further arguments (username, usergroup) are passed
#               through to checkPermission() unchanged
# @return whatever checkPermission() returns
#
sub canRead
{
  my ($path, @user_details) = @_;
  return &checkPermission($path, 'r', @user_details);
}
|
---|
| 207 | ## canRead()
|
---|
| 208 |
|
---|
| 209 |
|
---|
## @function checkPermission()
#
# Decide whether a user may access a path in the given mode, based on the
# owner, group and permission string reported by the Thrift server's stat().
#
# @param  $path      a 'HDThriftFS://' URI or HadoopFS::Pathname
# @param  $mode      one of 'r', 'w' or 'x'
# @param  $username  (optional) user to test; defaults to the current login
# @param  $usergroup (optional) a group name, or a hash reference keyed by
#                    group name; defaults to the output of the 'groups'
#                    command (i.e. the groups of the user running this
#                    process) on non-Windows systems
# @return 1 if access is permitted, 0 otherwise (including for an unknown
#         mode, which is also reported on STDERR)
#
sub checkPermission
{
  my ($path, $mode, $username, $usergroup) = @_;
  my $offsets = {'r' => 0, 'w' => 1, 'x' => 2};
  # - an unknown mode has no offset into the permission string; looking it up
  #   anyway would silently test the first character of each triplet and
  #   could even report success (e.g. mode '-' against a '-' permission)
  if (!defined $mode || !defined $offsets->{$mode})
  {
    print STDERR "HDThriftFS::checkPermission() unknown mode: " . (defined $mode ? $mode : 'undef') . "\n";
    return 0;
  }
  # - ensure we have a connection to the thrift server
  &_establishClient();
  # - convert the path into a proper thrift path object
  $path = &_generateHDFSPath($path);
  # - determine the user (defaults to current user)
  if (!defined $username)
  {
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
      require Win32;
      $username = Win32::LoginName();
    }
    else
    {
      $username = getlogin || getpwuid($<);
    }
  }
  # - determine the group(s)
  my $usergroups = {};
  if (defined $usergroup)
  {
    $usergroups = (ref $usergroup eq "HASH") ? $usergroup : {$usergroup => 1};
  }
  elsif ($ENV{'GSDLOS'} !~ /^windows$/i)
  {
    # no equivalent lookup is implemented for Windows, where the set of
    # groups is simply left empty
    my $raw_groups = `groups`;
    if (defined $raw_groups)
    {
      # split(' ') skips leading whitespace and collapses runs of it
      foreach my $group (split(' ', $raw_groups))
      {
        $usergroups->{$group} = 1;
      }
    }
  }
  # Retrieve details from the file
  my $file_stat = $thrift_client->stat($path);
  my $owner = $file_stat->{'owner'};
  my $group = $file_stat->{'group'};
  my $permissions = $file_stat->{'permission'};
  # - without a permission string there is nothing that could grant access
  return 0 unless (defined $permissions);
  # Work out which permission triplets apply, as offsets into the permission
  # string: [u]ser at 0 when the user owns the file, [g]roup at 3 when the
  # user is in the file's group, and [o]ther at 6 always.
  # NOTE(review): HDFS itself appears to test only the first matching class
  # (owner, else group, else other); this cascade falls through to the later
  # classes and so is more lenient - confirm that is intended.
  my @class_offsets = ();
  if (defined $owner && defined $username && $username eq $owner)
  {
    push(@class_offsets, 0);
  }
  if (defined $group && defined $usergroups->{$group})
  {
    push(@class_offsets, 3);
  }
  push(@class_offsets, 6);
  # Access is granted as soon as any applicable triplet has the mode's letter
  # in the mode's position
  foreach my $class_offset (@class_offsets)
  {
    my $target_char = substr($permissions, $class_offset + $offsets->{$mode}, 1);
    if ($mode eq $target_char)
    {
      return 1;
    }
  }
  return 0;
}
|
---|
| 291 | ## checkPermission
|
---|
| 292 |
|
---|
| 293 |
|
---|
## @function closeFileHandle()
#
# Close a handle previously produced by openFileHandle() and remove the tie
# between the handle and its implementing object.
#
# @param  $fh_ref reference to the scalar holding the file handle
# @return always 1
#
sub closeFileHandle
{
  my $fh_ref = shift(@_);
  &_printDebug('');
  # - nothing was ever opened, so there is nothing to close
  if (!defined $fh_ref || !defined $$fh_ref)
  {
    return 1;
  }
  close($$fh_ref);
  # - openFileHandle() ties the glob itself (tie(*$fh, ...)), so it is the
  #   glob that must be untied; untie($$fh_ref) would only look for a tie on
  #   the scalar that holds the glob reference and leave the handle tied
  if (ref($$fh_ref))
  {
    untie(*{$$fh_ref});
  }
  return 1;
}
|
---|
| 304 | ## closeFileHandle()
|
---|
| 305 |
|
---|
| 306 |
|
---|
## @function fileSize()
#
# Report the 'length' field of the Thrift server's stat() result for a path.
#
# @param  $path a 'HDThriftFS://' URI or HadoopFS::Pathname
# @return the length recorded in the stat result
#
sub fileSize
{
  my ($path) = @_;
  # a connection to the thrift server is needed before anything else
  &_establishClient();
  # stat() wants a proper thrift path object
  my $hdfs_path = &_generateHDFSPath($path);
  my $details = $thrift_client->stat($hdfs_path);
  return $details->{length};
}
|
---|
| 319 | ## fileSize()
|
---|
| 320 |
|
---|
| 321 |
|
---|
## @function fileTest()
#
# Emulate Perl's -d, -e and -f file tests against the Thrift server.
#
# @param  $raw_path a 'HDThriftFS://' URI or HadoopFS::Pathname
# @param  $test_op  '-d' (is a directory), '-e' (exists) or '-f' (exists and
#                   is not a directory); an undefined operator or '-l' is
#                   treated as '-e'
# @return 1 if the test passes, 0 otherwise; any other operator is reported
#         through FileUtils::printError() and also yields 0
#
sub fileTest
{
  my ($raw_path, $test_op) = @_;
  # ensure we have a connection to the thrift server
  &_establishClient();
  # - convert the path into a proper thrift path object
  my $path = &_generateHDFSPath($raw_path);
  # note: symbolic linking not supported within HDFS, so a link test can
  # only sensibly ask whether the path exists
  $test_op = '-e' if (!defined $test_op || $test_op eq '-l');

  # - anything other than the three supported tests is an error
  if ($test_op ne '-d' && $test_op ne '-e' && $test_op ne '-f')
  {
    &FileUtils::printError('Unknown or unsupported test mode: ' . $test_op);
    return 0;
  }
  # - every supported test fails for a path that does not exist
  return 0 unless ($thrift_client->exists($path));
  # - and existence is all that '-e' asks for
  return 1 if ($test_op eq '-e');
  # - '-d' and '-f' are opposites, decided by the stat result's isdir flag
  my $file = $thrift_client->stat($path);
  my $is_directory = $file->{'isdir'} ? 1 : 0;
  return ($test_op eq '-d') ? $is_directory : 1 - $is_directory;
}
|
---|
| 372 | ## fileTest()
|
---|
| 373 |
|
---|
| 374 |
|
---|
## @function filenameConcatenate()
#
# Build a path from a protocol prefix and any number of path parts.
#
# The parts are joined with '/', runs of '/' within the joined parts are
# reduced to a single '/', the result is appended to the protocol with one
# more '/' in between, and finally a single trailing slash or backslash (if
# present) is removed.
#
# @param  $protocol the prefix placed in front of the path
# @param  ...       the path parts
# @return the assembled path
#
sub filenameConcatenate
{
  my ($protocol, @parts) = @_;
  my $joined = join('/', @parts);
  # squeeze repeated slashes down to one
  $joined =~ tr{/}{}s;
  # prefix the protocol (which may legitimately reintroduce multiple slashes)
  my $filename = $protocol . '/' . $joined;
  # drop one trailing slash or backslash
  $filename =~ s/[\\\/]$//;
  return $filename;
}
|
---|
| 389 | ## filenameConcatenate()
|
---|
| 390 |
|
---|
| 391 |
|
---|
## @function isFilenameAbsolute()
#
# File paths against HDFS must be absolute, so the answer is the same for
# every path; any arguments are ignored.
#
# @return always 1
#
sub isFilenameAbsolute { return 1; }
|
---|
| 399 | # isFilenameAbsolute()
|
---|
| 400 |
|
---|
| 401 |
|
---|
## @function isHDFS()
#
# Identify this driver as one operating on HDFS; any arguments are ignored.
#
# @return always 1
#
sub isHDFS { return 1; }
|
---|
| 408 | ## isHDFS()
|
---|
| 409 |
|
---|
| 410 |
|
---|
## @function makeDirectory()
#
# Make sure a directory exists on the Thrift server, creating it (via the
# client's mkdirs()) when it is not already there.
#
# @param  $raw_path a 'HDThriftFS://' URI or HadoopFS::Pathname
# @return the result of a final '-d' test on the path: 1 if the directory
#         exists afterwards, 0 otherwise
#
sub makeDirectory
{
  my ($raw_path) = @_;
  # ensure we have a connection to the thrift server
  &_establishClient();
  # - convert the path into a proper thrift path object
  my $path = &_generateHDFSPath($raw_path);
  # - only ask for creation when the directory is missing
  $thrift_client->mkdirs($path) unless (&fileTest($path, '-d'));
  # - success is judged by looking again, not by what mkdirs() returned
  return (&fileTest($path, '-d'));
}
|
---|
| 429 | ## makeDirectory()
|
---|
| 430 |
|
---|
| 431 |
|
---|
## @function modificationTime()
#
# Report when a path was last modified, derived from the 'modificationTime'
# field of the Thrift server's stat() result.
#
# @param  $path a 'HDThriftFS://' URI or HadoopFS::Pathname
# @return the stat value divided by 1000 and rounded down (presumably this
#         converts milliseconds to whole seconds - confirm against the
#         server's definition of the field)
#
sub modificationTime
{
  my ($path) = @_;
  # a connection to the thrift server is needed before anything else
  &_establishClient();
  # stat() wants a proper thrift path object
  my $hdfs_path = &_generateHDFSPath($path);
  my $details = $thrift_client->stat($hdfs_path);
  my $raw_time = $details->{modificationTime};
  return floor($raw_time / 1000);
}
|
---|
| 444 | ## modificationTime()
|
---|
| 445 |
|
---|
| 446 |
|
---|
## @function openFileHandle()
#
# Provide calling code with a 'file handle'-like object for a path on the
# Thrift server. A fresh glob is tied to FileUtils::HDThriftFS::ThriftFH
# (which is handed the shared Thrift client) and then opened on the path.
#
# @param  $raw_path a 'HDThriftFS://' URI or HadoopFS::Pathname
# @param  $mode     the mode passed on to open() on the tied handle
# @param  $fh_ref   reference to the scalar that receives the new handle
# @return 1 on success; dies if the open fails
#
sub openFileHandle
{
  my ($raw_path, $mode, $fh_ref) = @_;
  &_printDebug('path: ' . $raw_path . ', mode: ' . $mode . ', fh_ref');
  # ensure we have a connection to the thrift server
  &_establishClient();
  my $path = &_generateHDFSPath($raw_path);
  # an anonymous glob to hang the tie on
  my $fh = gensym();
  tie(*$fh, "FileUtils::HDThriftFS::ThriftFH", $thrift_client);
  if (!open($fh, $path, $mode))
  {
    die("Failed to open thriftfs");
  }
  # hand the handle back through the caller's reference
  ${$fh_ref} = $fh;
  return 1;
}
|
---|
| 463 | ## openFileHandle()
|
---|
| 464 |
|
---|
| 465 |
|
---|
## @function readDirectory()
#
# List the names of the entries in a directory on the Thrift server.
#
# @param  $raw_path a 'HDThriftFS://' URI or HadoopFS::Pathname
# @return reference to an array holding, for each entry reported by the
#         client's listStatus(), the final component of the entry's 'path'
#         (everything after the last slash or backslash); the array is empty
#         when nothing is reported
#
sub readDirectory
{
  my ($raw_path) = @_;
  # ensure we have a connection to the thrift server
  &_establishClient();
  my $path = &_generateHDFSPath($raw_path);
  my $listing = $thrift_client->listStatus($path);
  my @names;
  if ($listing && @{$listing} > 0)
  {
    for my $entry (@{$listing})
    {
      # keep only the last path component
      my ($name) = $entry->{'path'} =~ /([^\\\/]+)$/;
      push(@names, $name);
    }
  }
  return \@names;
}
|
---|
| 487 | ## readDirectory()
|
---|
| 488 |
|
---|
| 489 |
|
---|
## @function removeFiles()
#
# Remove a single path from the Thrift server.
#
# @param  $path      a 'HDThriftFS://' URI or HadoopFS::Pathname
# @param  $recursive (optional, default 0) when true the path is removed
#                    whatever it is and the flag is passed on to the
#                    client's rm(); when false only plain files are removed
# @return true if a removal was attempted and the path no longer exists
#         afterwards; 0 if the path did not exist or was not eligible
#
sub removeFiles
{
  my ($path, $recursive) = @_;
  $recursive = 0 unless (defined $recursive);
  # ensure we have a connection to the thrift server
  &_establishClient();
  # - convert the path into a proper thrift path object as necessary
  $path = &_generateHDFSPath($path);
  # - nothing to remove
  return 0 unless ($thrift_client->exists($path));
  # - without the recursive flag only plain files may be removed
  return 0 unless ($recursive || &fileTest($path, '-f'));
  $thrift_client->rm($path, $recursive);
  # - success means the path has gone
  return !$thrift_client->exists($path);
}
|
---|
| 511 | ## removeFiles()
|
---|
| 512 |
|
---|
| 513 |
|
---|
## @function removeFilesFiltered()
#
# Perform a depth first, recursive, removal of files and directories that
# match the given accept and reject patterns.
#
# @param  $paths     a single path or a reference to an array of paths (each
#                    a 'HDThriftFS://' URI)
# @param  $accept_re (optional) only files whose path matches are removed
# @param  $reject_re (optional) files whose path matches are never removed
# @return the number of files and directories removed
#
# Directories are always descended into, but a directory itself is only
# removed when neither pattern is given. Paths that do not exist, and
# removals that leave the path in place, are reported on STDERR.
#
sub removeFilesFiltered
{
  my ($paths, $accept_re, $reject_re) = @_;
  # ensure we have a connection to the thrift server
  &_establishClient();
  my @targets = (ref $paths eq "ARRAY") ? @$paths : ($paths);
  my $removed_count = 0;
  foreach my $raw_path (@targets)
  {
    # remove trailing slashes
    $raw_path =~ s/[\/\\]+$//;
    my $path = &_generateHDFSPath($raw_path);

    # Case 1: nothing there
    if (!$thrift_client->exists($path))
    {
      print STDERR "HDThriftFS::removeFilesFiltered() path does not exist: " . $raw_path . "\n";
      next;
    }

    # Case 2: a directory - deal with the contents first
    if (&fileTest($path, '-d'))
    {
      foreach my $file (@{&readDirectory($path)})
      {
        $removed_count += &removeFilesFiltered($raw_path . '/' . $file, $accept_re, $reject_re);
      }
      # the directory itself only goes when no filtering was requested
      next if (defined $accept_re || defined $reject_re);
      # remove this directory - non-recursively so that the command fails
      # if there are (somehow) still files contained within
      $thrift_client->rm($path, 0);
      if ($thrift_client->exists($path))
      {
        print STDERR "HDThriftFS::removeFilesFiltered() couldn't remove directory: " . $raw_path . "\n";
      }
      else
      {
        $removed_count++;
      }
      next;
    }

    # Case 3: a file - apply the reject filter, then the accept filter
    next if (defined $reject_re && ($raw_path =~ m/$reject_re/));
    next unless ((!defined $accept_re) || ($raw_path =~ m/$accept_re/));
    $thrift_client->rm($path, 0);
    if ($thrift_client->exists($path))
    {
      print STDERR "HDThriftFS::removeFilesFiltered() couldn't remove file: " . $raw_path . "\n";
    }
    else
    {
      $removed_count++;
    }
  }
  return $removed_count;
}
|
---|
| 580 | ## removeFilesFiltered()
|
---|
| 581 |
|
---|
| 582 |
|
---|
## @function removeFilesRecursive()
#
# Remove a path and everything beneath it by calling the more general
# removeFilesFiltered() with neither an accept nor a reject expression.
#
# @param  $path the path (or reference to an array of paths) to remove
# @return the count reported by removeFilesFiltered()
#
sub removeFilesRecursive
{
  my ($path) = @_;
  my $num_removed = &removeFilesFiltered($path, undef, undef);
  return $num_removed;
}
|
---|
| 592 | ## removeFilesRecursive()
|
---|
| 593 |
|
---|
| 594 |
|
---|
## @function supportsSymbolicLink()
#
# Report that this driver cannot create symbolic links; any arguments are
# ignored.
#
# @return always 0
#
sub supportsSymbolicLink { return 0; }
|
---|
| 601 | ## supportsSymbolicLink()
|
---|
| 602 |
|
---|
| 603 |
|
---|
## @function transferFile()
#
# Move or copy a file from one location on the Thrift server to another.
#
# @param  $mode 'MOVE' (a rename on the server) or 'COPY' (the source is read
#               and written back out block by block); any other mode
#               transfers nothing
# @param  $src  'HDThriftFS://' URI of the source file
# @param  $dst  'HDThriftFS://' URI of the destination; if it is an existing
#               directory the source's file name is appended to it
# @return true if the destination exists afterwards; 0 (after reporting
#         through FileUtils::printError()) if the source is missing or the
#         destination already exists
#
sub transferFile
{
  my ($mode, $src, $dst) = @_;
  # ensure we have a connection to the thrift server
  &_establishClient();
  my $src_path = &_generateHDFSPath($src);
  my $dst_path = &_generateHDFSPath($dst);
  # - transferring "into" a directory keeps the source's file name
  if (&fileTest($dst_path, '-d'))
  {
    my ($filename) = $src =~ /([^\\\/]+)$/;
    $dst .= '/' . $filename;
    $dst_path = &_generateHDFSPath($dst);
  }
  if (!$thrift_client->exists($src_path))
  {
    &FileUtils::printError('Source file (during ' . $mode . ') does not exist: ' . $src);
    return 0;
  }
  # - existing destinations are never overwritten
  if ($thrift_client->exists($dst_path))
  {
    &FileUtils::printError('Destination file (during ' . $mode . ') already exists: ' . $dst);
    return 0;
  }
  # what happens next depends on the mode, and is either very easy or really
  # hard
  if ($mode eq 'MOVE')
  {
    $thrift_client->rename($src_path, $dst_path);
  }
  elsif ($mode eq 'COPY')
  {
    # Open the src file for reading
    my $fhin = $thrift_client->open($src_path);
    # Create the dst file for writing
    my $fhout = $thrift_client->create($dst_path);
    # Read all of src file writing to dst file, $buffer_length at a time.
    # - the data is passed from read() to write() exactly as received
    # - NOTE(review): the original author was unsure whether thrift can carry
    #   arbitrary bytes (only strings), so binary safety of this path should
    #   be confirmed; it will work fine for text files
    my $offset = 0;
    while (1)
    {
      my $data = $thrift_client->read($fhin, $offset, $buffer_length);
      # - stop only when nothing came back; testing the data for truth would
      #   also stop on a block consisting of the single character "0"
      last if (!defined $data || length($data) == 0);
      $thrift_client->write($fhout, $data);
      $offset += $buffer_length;
      # - a short block means the end of the file has been reached
      last if (length($data) < $buffer_length);
    }
    # Close files
    $thrift_client->close($fhout);
    $thrift_client->close($fhin);
  }
  # the transfer is judged by whether the destination now exists
  my $result = ($thrift_client->exists($dst_path));
  return $result;
}
|
---|
| 670 | ## transferFile()
|
---|
| 671 |
|
---|
| 672 |
|
---|
## @function transferFileFromLocal()
#
# Copy or move a file from the local filesystem onto the Thrift server. The
# local file is read in raw mode $buffer_length bytes at a time, and each
# block is Base91 encoded before being written through the Thrift client.
#
# @param  $mode 'COPY', or 'MOVE' to also delete the local source afterwards
# @param  $src  path of the local source file
# @param  $dst  'HDThriftFS://' URI of the destination; if it is an existing
#               directory the source's file name is appended to it
# @return true if the destination exists afterwards and the source was read
#         without error (and, for a MOVE, the local source has gone); false
#         otherwise. Dies if the local source cannot be opened.
#
sub transferFileFromLocal
{
  my ($mode, $src, $dst) = @_;
  # ensure we have a connection to the thrift server
  &_establishClient();
  # destination is remote
  my $dst_path = &_generateHDFSPath($dst);
  if (&fileTest($dst_path, '-d'))
  {
    my ($filename) = $src =~ /([^\\\/]+)$/;
    $dst .= '/' . $filename;
    $dst_path = &_generateHDFSPath($dst);
  }
  # can't replace - if the file already exists
  if (&fileTest($dst_path, '-f'))
  {
    &FileUtils::printError('Destination file (during ' . $mode . ') already exists: ' . $dst);
    return 0;
  }
  # copy the file
  my $fhin;
  open($fhin, '<:raw', $src) or die("Failed to open file for reading: " . $src . " (" . $! . ")");
  my $decoded = '';
  my $fhout = $thrift_client->create($dst_path);
  my $bytes_read;
  while ($bytes_read = read($fhin, $decoded, $buffer_length))
  {
    if ($debug_encoding)
    {
      print STDERR "Writing Data: \n=== START ===\n"; Dump($decoded); print STDERR "\n=== END ===\n\n";
    }
    # Base91 encode to protect binary
    # - NOTE(review): the original comment says a Byte Order Marker is added
    #   so the Thrift Server can detect the need to decode the string sent;
    #   nothing in this function adds one, so presumably the encoder or the
    #   server takes care of it - confirm
    my $encoded = MIME::Base91::encode($decoded);
    if ($debug_encoding)
    {
      print STDERR "Encoded: \n=== START ===\n"; Dump($encoded); print STDERR "\n=== END ===\n\n";
    }
    $thrift_client->write($fhout, $encoded);
  }
  # read() returns 0 at end of file but undef on error - remember which it
  # was (and why) before anything else can overwrite $!
  my $read_ok = defined $bytes_read;
  my $read_error = $!;
  close($fhin);
  $thrift_client->close($fhout);
  if (!$read_ok)
  {
    &FileUtils::printError('Failed while reading source file (during ' . $mode . '): ' . $src . ' (' . $read_error . ')');
  }
  # in general, the transfer has worked if the source was read completely and
  # the destination file exists
  my $result = $read_ok && $thrift_client->exists($dst_path);
  # if moving, remove the source file from the local filesystem - but only
  # once the transfer is known to have worked, otherwise a failed upload
  # would destroy the only copy of the data
  if ($mode eq 'MOVE' && $result)
  {
    unlink($src);
    # update result to reflect if we successfully removed the src file
    $result = (!-f $src);
  }
  return $result;
}
|
---|
| 729 | ## transferFileFromLocal()
|
---|
| 730 |
|
---|
| 731 |
|
---|
## @function transferFileToLocal()
#
# Copy or move a file from the Thrift server to the local filesystem. Data is
# requested $buffer_length bytes at a time, Base91 decoded, and written to
# the local file in raw mode.
#
# @param  $mode 'COPY', or 'MOVE' to also remove the source from the server
#               afterwards
# @param  $src  'HDThriftFS://' URI of the source file
# @param  $dst  local destination path; if it is an existing directory the
#               source's file name is appended to it
# @return true if the local file was written completely (and, for a MOVE,
#         the source has gone from the server); false otherwise, including
#         (after reporting through FileUtils::printError()) when the source
#         is missing or the destination already exists. Dies if the local
#         destination cannot be opened.
#
sub transferFileToLocal
{
  my ($mode, $src, $dst) = @_;
  # ensure we have a connection to the thrift server
  &_establishClient();
  # source is remote
  my $src_path = &_generateHDFSPath($src);
  if (!$thrift_client->exists($src_path))
  {
    &FileUtils::printError('Source file (during ' . $mode . ') does not exist: ' . $src);
    return 0;
  }
  if (-d $dst)
  {
    my ($filename) = $src =~ /([^\\\/]+)$/;
    $dst .= '/' . $filename;
  }
  # existing local files are never overwritten
  if (-e $dst)
  {
    &FileUtils::printError('Destination file (during ' . $mode . ') already exists: ' . $dst);
    return 0;
  }
  # open local file
  my $fhout;
  my $offset = 0;
  open($fhout, '>:raw', $dst) or die("Failed to open file for writing: " . $dst);
  my $fhin = $thrift_client->open($src_path);
  my $write_ok = 1;
  # Read buffer_length *decoded* bytes - which means there may be a larger
  # number of *encoded* bytes returned
  while (1)
  {
    my $encoded = $thrift_client->read($fhin, $offset, $buffer_length);
    # - stop only when nothing came back; testing the data for truth would
    #   also stop on a block consisting of the single character "0"
    last if (!defined $encoded || length($encoded) == 0);
    if ($debug_encoding)
    {
      print STDERR "Reading Data: \n=== START ===\n"; Dump($encoded); print STDERR "\n=== END ===\n\n";
    }
    my $decoded = MIME::Base91::decode($encoded);
    if ($debug_encoding)
    {
      print STDERR "Decoded: \n=== START ===\n"; Dump($decoded); print STDERR "\n=== END ===\n\n";
    }
    # - a failed write (disk full, for instance) must not go unnoticed
    unless (print {$fhout} $decoded)
    {
      $write_ok = 0;
      last;
    }
    # - a short block means the end of the file has been reached
    last if (length($decoded) < $buffer_length);
    $offset += $buffer_length;
  }
  # - buffered write errors only surface when the handle is closed
  if (!close($fhout))
  {
    $write_ok = 0;
  }
  $thrift_client->close($fhin);
  if (!$write_ok)
  {
    &FileUtils::printError('Failed while writing destination file (during ' . $mode . '): ' . $dst);
  }
  # in general, the transfer has worked if everything was written and the
  # destination file exists
  my $result = $write_ok && (-f $dst);
  # if moving, remove the source file from the HDFS filesystem - but only
  # once the local copy is known to be complete, otherwise a failed download
  # would destroy the only copy of the data
  if ($mode eq 'MOVE' && $result)
  {
    $thrift_client->rm($src_path, 0);
    # update result to reflect if we successfully removed the src file
    $result = !$thrift_client->exists($src_path);
  }
  return $result;
}
|
---|
| 798 | ## transferFileToLocal()
|
---|
| 799 |
|
---|
| 800 |
|
---|
| 801 | 1;
|
---|