source: gs2-extensions/parallel-building/trunk/src/perllib/util.pm@ 26961

Last change on this file since 26961 was 26961, checked in by jmt12, 11 years ago

Several file handling functions extended to be aware of and support HDFS files. This means noticing paths starting with hdfs:// protocol (and preserving that protocol through filename_cat() etc) and calling 'hadoop fs <action>' to handle them appropriately. The most tricksie functions are those that open filehandles and get file status

File size: 61.5 KB
Line 
1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package util;
27
28use strict;
29
30use Encode;
31use File::Copy;
32use File::Basename;
33# Config for getting the perlpath in the recommended way, though it uses paths that are
34# hard-coded into the Config file that's generated upon configuring and compiling perl.
35# $^X works better in some cases to return the path to perl used to launch the script,
36# but if launched with plain "perl" (no full-path), that will be just what it returns.
37use Config;
38
39# removes files (but not directories)
40sub rm {
41 my (@files) = @_;
42
43 my @filefiles = ();
44
45 # make sure the files we want to delete exist
46 # and are regular files
47 foreach my $file (@files) {
48
49 if (&util::isHDFS($file))
50 {
51 &util::executeHDFSCommand('rm', $file);
52 }
53 else
54 {
55 if (!-e $file) {
56 print STDERR "util::rm $file does not exist\n";
57 } elsif ((!-f $file) && (!-l $file)) {
58 print STDERR "util::rm $file is not a regular (or symbolic) file\n";
59 } else {
60 push (@filefiles, $file);
61 }
62 }
63 }
64
65 # remove the files
66 my $numremoved = unlink @filefiles;
67
68 # check to make sure all of them were removed
69 if ($numremoved != scalar(@filefiles)) {
70 print STDERR "util::rm Not all files were removed\n";
71 }
72}
73
74# removes files (but not directories) - can rename this to the default
75# "rm" subroutine when debugging the deletion of individual files.
76sub rm_debug {
77 my (@files) = @_;
78 my @filefiles = ();
79
80 # make sure the files we want to delete exist
81 # and are regular files
82 foreach my $file (@files) {
83 if (!-e $file) {
84 print STDERR "util::rm $file does not exist\n";
85 } elsif ((!-f $file) && (!-l $file)) {
86 print STDERR "util::rm $file is not a regular (or symbolic) file\n";
87 } else { # debug message
88 unlink($file) or warn "Could not delete file $file: $!\n";
89 }
90 }
91}
92
93
94# recursive removal
95sub filtered_rm_r {
96 my ($files,$file_accept_re,$file_reject_re) = @_;
97
98# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(2);
99# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
100# print STDERR "** Calling method (2): $lcfilename:$cline $cpackage->$csubr\n";
101
102 my @files_array = (ref $files eq "ARRAY") ? @$files : ($files);
103
104 # recursively remove the files
105 foreach my $file (@files_array) {
106
107 # HDFS support
108 if (&util::isHDFS($file))
109 {
110 # HDFS doesn't really lend itself to a choosy delete, unless you want
111 # it to be really, really, unbearably slow.
112 &util::executeHDFSCommand('rmr', $file);
113 next;
114 }
115
116
117 $file =~ s/[\/\\]+$//; # remove trailing slashes
118
119 if (!-e $file) {
120 print STDERR "util::filtered_rm_r $file does not exist\n";
121
122 } elsif ((-d $file) && (!-l $file)) { # don't recurse down symbolic link
123 # get the contents of this directory
124 if (!opendir (INDIR, $file)) {
125 print STDERR "util::filtered_rm_r could not open directory $file\n";
126 } else {
127 my @filedir = grep (!/^\.\.?$/, readdir (INDIR));
128 closedir (INDIR);
129
130 # remove all the files in this directory
131 map {$_="$file/$_";} @filedir;
132 &filtered_rm_r (\@filedir,$file_accept_re,$file_reject_re);
133
134 if (!defined $file_accept_re && !defined $file_reject_re) {
135 # remove this directory
136 if (!rmdir $file) {
137 print STDERR "util::filtered_rm_r couldn't remove directory $file\n";
138 }
139 }
140 }
141 } else {
142 next if (defined $file_reject_re && ($file =~ m/$file_reject_re/));
143
144 if ((!defined $file_accept_re) || ($file =~ m/$file_accept_re/)) {
145 # remove this file
146 &rm ($file);
147 }
148 }
149 }
150}
151
152
153# recursive removal
154sub rm_r
155{
156 my (@files) = @_;
157 # use the more general (but reterospectively written function
158 # filtered_rm_r function()
159 filtered_rm_r(\@files,undef,undef); # no accept or reject expressions
160}
161
162
163
164
165# moves a file or a group of files
166sub mv {
167 my $dest = pop (@_);
168 my (@srcfiles) = @_;
169
170 # moving a file within or into HDFS
171 if (&util::isHDFS($dest))
172 {
173 foreach my $src (@srcfiles)
174 {
175 if (&util::isHDFS($src))
176 {
177 &util::executeHDFSCommand('mv', $src, $dest);
178 }
179 else
180 {
181 &util::executeHDFSCommand('put', $src, $dest);
182 &util::rm_r($src);
183 }
184 }
185 return;
186 }
187
188 # remove trailing slashes from source and destination files
189 $dest =~ s/[\\\/]+$//;
190 map {$_ =~ s/[\\\/]+$//;} @srcfiles;
191
192 # a few sanity checks
193 if (scalar (@srcfiles) == 0) {
194 print STDERR "util::mv no destination directory given\n";
195 return;
196 } elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) {
197 print STDERR "util::mv if multiple source files are given the ".
198 "destination must be a directory\n";
199 return;
200 }
201
202 # move the files
203 foreach my $file (@srcfiles) {
204
205 # moving a file out of HDFS
206 if (&util::isHDFS($file))
207 {
208 &util::executeHDFSCommand('get', $file, $dest);
209 &util::rm_r($file);
210 next;
211 }
212
213 my $tempdest = $dest;
214 if (-d $tempdest) {
215 my ($filename) = $file =~ /([^\\\/]+)$/;
216 $tempdest .= "/$filename";
217 }
218 if (!-e $file) {
219 print STDERR "util::mv $file does not exist\n";
220 } else {
221 if(!rename ($file, $tempdest)) {
222 print STDERR "**** Failed to rename $file to $tempdest\n";
223 &File::Copy::copy($file, $tempdest);
224 &rm($file);
225 }
226 elsif(-e $file) { # rename (partially) succeeded) but srcfile still exists after rename
227 #print STDERR "*** srcfile $file still exists after rename to $tempdest\n";
228 if(!-e $tempdest) {
229 print STDERR "@@@@ ERROR: $tempdest does not exist\n";
230 }
231 # Sometimes the rename operation fails (as does File::Copy::move).
232 # This turns out to be because the files are hardlinked.
233 # Need to do a copy-delete in this case, however, the copy step is not necessary:
234 # the srcfile got renamed into tempdest, but srcfile itself still exists, delete it.
235 #&File::Copy::copy($file, $tempdest);
236
237 &rm($file);
238 }
239 }
240 }
241}
242
243# Move the contents of source directory into target directory
244# (as opposed to merely replacing target dir with the src dir)
245# This can overwrite any files with duplicate names in the target
246# but other files and folders in the target will continue to exist
247sub mv_dir_contents {
248 my ($src_dir, $dest_dir) = @_;
249
250 # Obtain listing of all files within src_dir
251 # Note that readdir lists relative paths, as well as . and ..
252 opendir(DIR, "$src_dir");
253 my @files= readdir(DIR);
254 close(DIR);
255
256 my @full_path_files = ();
257 foreach my $file (@files) {
258 # process all except . and ..
259 unless($file eq "." || $file eq "..") {
260
261 my $dest_subdir = &filename_cat($dest_dir, $file); # $file is still a relative path
262
263 # construct absolute paths
264 $file = &filename_cat($src_dir, $file); # $file is now an absolute path
265
266 # Recurse on directories which have an equivalent in target dest_dir
267 # If $file is a directory that already exists in target $dest_dir,
268 # then a simple move operation will fail (definitely on Windows).
269 if(-d $file && -d $dest_subdir) {
270 #print STDERR "**** $file is a directory also existing in target, its contents to be copied to $dest_subdir\n";
271 &mv_dir_contents($file, $dest_subdir);
272
273 # now all content is moved across, delete empty dir in source folder
274 if(&is_dir_empty($file)) {
275 if (!rmdir $file) {
276 print STDERR "ERROR. util::mv_dir_contents couldn't remove directory $file\n";
277 }
278 } else { # error
279 print STDERR "ERROR. util::mv_dir_contents: subfolder $file still non-empty after moving contents to $dest_subdir\n";
280 }
281 } else { # process files and any directories that don't already exist with a simple move
282 push(@full_path_files, $file);
283 }
284 }
285 }
286
287 if(!&dir_exists($dest_dir)) { # create target toplevel folder or subfolders if they don't exist
288 &mk_dir($dest_dir);
289 }
290
291 #print STDERR "@@@@@ Copying files |".join(",", @full_path_files)."| to: $dest_dir\n";
292
293 if(@full_path_files) { # if non-empty, there's something to copy across
294 &mv(@full_path_files, $dest_dir);
295 }
296}
297
298
299# copies a file or a group of files
300sub cp
301{
302 my $dest = pop (@_);
303 my (@srcfiles) = @_;
304
305 # remove trailing slashes from source and destination files
306 $dest =~ s/[\\\/]+$//;
307 map {$_ =~ s/[\\\/]+$//;} @srcfiles;
308
309 # a few sanity checks
310 if (scalar (@srcfiles) == 0)
311 {
312 print STDERR "util::cp no destination directory given\n";
313 return 0;
314 }
315 elsif ((scalar (@srcfiles) > 1) && (!&util::dir_exists($dest)))
316 {
317 print STDERR "util::cp if multiple source files are given the destination must be a directory\n";
318 return 0;
319 }
320
321 # copying a file into or within HDFS
322 if (&util::isHDFS($dest))
323 {
324 foreach my $src (@srcfiles)
325 {
326 &util::executeHDFSCommand('put', $src, $dest);
327 &util::rm_r($src);
328 }
329 return;
330 }
331
332 # copy the files
333 foreach my $file (@srcfiles)
334 {
335 my $tempdest = $dest;
336 if (&util::dir_exists($tempdest))
337 {
338 my ($filename) = $file =~ /([^\\\/]+)$/;
339 $tempdest .= "/$filename";
340 }
341 if (!&util::file_exists($file))
342 {
343 if (&util::dir_exists($file))
344 {
345 print STDERR "util::cp $file is not a plain file\n";
346 }
347 else
348 {
349 print STDERR "util::cp $file does not exist\n";
350 }
351 }
352 elsif (&util::isHDFS($file))
353 {
354 &util::executeHDFSCommand('get', $file, $dest);
355 }
356 else
357 {
358 &File::Copy::copy ($file, $tempdest);
359 }
360 }
361}
362
363
364
365# recursively copies a file or group of files
366# syntax: cp_r (sourcefiles, destination directory)
367# destination must be a directory - to copy one file to
368# another use cp instead
369sub cp_r {
370 my $dest = pop (@_);
371 my (@srcfiles) = @_;
372
373 # a few sanity checks
374 if (scalar (@srcfiles) == 0) {
375 print STDERR "util::cp_r no destination directory given\n";
376 return;
377 } elsif (-f $dest) {
378 print STDERR "util::cp_r destination must be a directory\n";
379 return;
380 }
381
382 # create destination directory if it doesn't exist already
383 if (! -d $dest) {
384 my $store_umask = umask(0002);
385 mkdir ($dest, 0777);
386 umask($store_umask);
387 }
388
389 # copy the files
390 foreach my $file (@srcfiles) {
391
392 if (!-e $file) {
393 print STDERR "util::cp_r $file does not exist\n";
394
395 } elsif (-d $file) {
396 # make the new directory
397 my ($filename) = $file =~ /([^\\\/]*)$/;
398 $dest = &util::filename_cat ($dest, $filename);
399 my $store_umask = umask(0002);
400 mkdir ($dest, 0777);
401 umask($store_umask);
402
403 # get the contents of this directory
404 if (!opendir (INDIR, $file)) {
405 print STDERR "util::cp_r could not open directory $file\n";
406 } else {
407 my @filedir = readdir (INDIR);
408 closedir (INDIR);
409 foreach my $f (@filedir) {
410 next if $f =~ /^\.\.?$/;
411 # copy all the files in this directory
412 my $ff = &util::filename_cat ($file, $f);
413 &cp_r ($ff, $dest);
414 }
415 }
416
417 } else {
418 &cp($file, $dest);
419 }
420 }
421}
422# recursively copies a file or group of files
423# syntax: cp_r (sourcefiles, destination directory)
424# destination must be a directory - to copy one file to
425# another use cp instead
426sub cp_r_nosvn {
427 my $dest = pop (@_);
428 my (@srcfiles) = @_;
429
430 # a few sanity checks
431 if (scalar (@srcfiles) == 0) {
432 print STDERR "util::cp_r no destination directory given\n";
433 return;
434 } elsif (-f $dest) {
435 print STDERR "util::cp_r destination must be a directory\n";
436 return;
437 }
438
439 # create destination directory if it doesn't exist already
440 if (! -d $dest) {
441 my $store_umask = umask(0002);
442 mkdir ($dest, 0777);
443 umask($store_umask);
444 }
445
446 # copy the files
447 foreach my $file (@srcfiles) {
448
449 if (!-e $file) {
450 print STDERR "util::cp_r $file does not exist\n";
451
452 } elsif (-d $file) {
453 # make the new directory
454 my ($filename) = $file =~ /([^\\\/]*)$/;
455 $dest = &util::filename_cat ($dest, $filename);
456 my $store_umask = umask(0002);
457 mkdir ($dest, 0777);
458 umask($store_umask);
459
460 # get the contents of this directory
461 if (!opendir (INDIR, $file)) {
462 print STDERR "util::cp_r could not open directory $file\n";
463 } else {
464 my @filedir = readdir (INDIR);
465 closedir (INDIR);
466 foreach my $f (@filedir) {
467 next if $f =~ /^\.\.?$/;
468 next if $f =~ /^\.svn$/;
469 # copy all the files in this directory
470 my $ff = &util::filename_cat ($file, $f);
471 &cp_r ($ff, $dest);
472 }
473 }
474
475 } else {
476 &cp($file, $dest);
477 }
478 }
479}
480
481# copies a directory and its contents, excluding subdirectories, into a new directory
482sub cp_r_toplevel {
483 my $dest = pop (@_);
484 my (@srcfiles) = @_;
485
486 # a few sanity checks
487 if (scalar (@srcfiles) == 0) {
488 print STDERR "util::cp_r no destination directory given\n";
489 return;
490 } elsif (-f $dest) {
491 print STDERR "util::cp_r destination must be a directory\n";
492 return;
493 }
494
495 # create destination directory if it doesn't exist already
496 if (! -d $dest) {
497 my $store_umask = umask(0002);
498 mkdir ($dest, 0777);
499 umask($store_umask);
500 }
501
502 # copy the files
503 foreach my $file (@srcfiles) {
504
505 if (!-e $file) {
506 print STDERR "util::cp_r $file does not exist\n";
507
508 } elsif (-d $file) {
509 # make the new directory
510 my ($filename) = $file =~ /([^\\\/]*)$/;
511 $dest = &util::filename_cat ($dest, $filename);
512 my $store_umask = umask(0002);
513 mkdir ($dest, 0777);
514 umask($store_umask);
515
516 # get the contents of this directory
517 if (!opendir (INDIR, $file)) {
518 print STDERR "util::cp_r could not open directory $file\n";
519 } else {
520 my @filedir = readdir (INDIR);
521 closedir (INDIR);
522 foreach my $f (@filedir) {
523 next if $f =~ /^\.\.?$/;
524
525 # copy all the files in this directory, but not directories
526 my $ff = &util::filename_cat ($file, $f);
527 if (-f $ff) {
528 &cp($ff, $dest);
529 #&cp_r ($ff, $dest);
530 }
531 }
532 }
533
534 } else {
535 &cp($file, $dest);
536 }
537 }
538}
539
540# /** @function mk_dir()
541# * Extend mkdir to allow it to silently fail in the case where code is
542# * 'competing' to create a directory first.
543# * @param $dir the full path of the directory to create
544# * @param $can_fail 1 if the mkdir can fail silently (optional)
545# * @return 1 on success, 0 on failure
546# */
547sub mk_dir
548{
549 my ($dir, $can_fail) = @_;
550 my $mkdir_ok = 0;
551 if (&util::isHDFS($dir))
552 {
553 # unhelpfully HDFS mkdir returns 0 on success, -1 on failure
554 my $result = &util::executeHDFSCommand('mkdir', $dir);
555 if ($result == 0)
556 {
557 $mkdir_ok = 1;
558 }
559 }
560 else
561 {
562 my $store_umask = umask(0002);
563 my $mkdir_ok = mkdir ($dir, 0777);
564 umask($store_umask);
565 }
566 # only output an error if this call wasn't marked as can_fail
567 if (!$mkdir_ok && (!defined $can_fail || !$can_fail))
568 {
569 print STDERR "util::mk_dir could not create directory: $dir\n error: $!\n";
570 }
571 return $mkdir_ok;
572}
573
574# in case anyone cares - I did some testing (using perls Benchmark module)
575# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
576# slightly faster (surprisingly) - Stefan.
577sub mk_all_dir
578{
579 my ($dir) = @_;
580
581 ###rint "-> util::mk_all_dir($dir)\n";
582
583 # support for HDFS
584 if (&util::isHDFS($dir))
585 {
586 # HDFS's version of mkdir does it recursively anyway
587 my $result = &util::executeHDFSCommand('mkdir', $dir);
588 return ($result == 0);
589 }
590
591 # use / for the directory separator, remove duplicate and
592 # trailing slashes
593 $dir=~s/[\\\/]+/\//g;
594 $dir=~s/[\\\/]+$//;
595
596 # ensure the directory doesn't already exist
597 if (-e $dir)
598 {
599 return 0;
600 }
601
602 # make sure the cache directory exists
603 my $dirsofar = "";
604 my $first = 1;
605 foreach my $dirname (split ("/", $dir))
606 {
607 $dirsofar .= "/" unless $first;
608 $first = 0;
609
610 $dirsofar .= $dirname;
611
612 next if $dirname =~ /^(|[a-z]:)$/i;
613 if (!-e $dirsofar)
614 {
615 my $store_umask = umask(0002);
616 my $mkdir_ok = mkdir ($dirsofar, 0777);
617 umask($store_umask);
618 if (!$mkdir_ok)
619 {
620 print STDERR "util::mk_all_dir could not create directory $dirsofar\n";
621 return 0;
622 }
623 }
624 }
625 return (-e $dir);
626}
627
628# make hard link to file if supported by OS, otherwise copy the file
629sub hard_link {
630 my ($src, $dest, $verbosity) = @_;
631
632 print "&util::hard_link( $src, $dest, $verbosity)\n";
633
634 # remove trailing slashes from source and destination files
635 $src =~ s/[\\\/]+$//;
636 $dest =~ s/[\\\/]+$//;
637
638## print STDERR "**** src = ", unicode::debug_unicode_string($src),"\n";
639 # a few sanity checks
640 if (&util::file_exists($dest)) {
641 # destination file already exists
642 return;
643 }
644 elsif (!&util::file_exists($src)) {
645 print STDERR "util::hard_link source file \"$src\" does not exist\n";
646 return 1;
647 }
648 elsif (!&util::file_exists($src) && &util::dir_exists($src)) {
649 print STDERR "util::hard_link source \"$src\" is a directory\n";
650 return 1;
651 }
652
653 my $dest_dir = &File::Basename::dirname($dest);
654 if (!&util::dir_exists($dest_dir))
655 {
656 mk_all_dir($dest_dir);
657 }
658
659 # HDFS Support - we can't ever link, copy instead
660 if (&util::isHDFS($src))
661 {
662 if (&util::isHDFS($dest))
663 {
664 &util::executeHDFSCommand('put', $src, $dest);
665 return 0;
666 }
667 else
668 {
669 &util::executeHDFSCommand('get', $src, $dest);
670 return 0;
671 }
672 }
673 elsif (&util::isHDFS($dest))
674 {
675 &util::executeHDFSCommand('put', $src, $dest);
676 return 0;
677 }
678
679 if (!link($src, $dest)) {
680 if ((!defined $verbosity) || ($verbosity>2)) {
681 print STDERR "util::hard_link: unable to create hard link. ";
682 print STDERR " Copying file: $src -> $dest\n";
683 }
684 &File::Copy::copy ($src, $dest);
685 }
686 return 0;
687}
688
689# make soft link to file if supported by OS, otherwise copy file
690sub soft_link {
691 my ($src, $dest, $ensure_paths_absolute) = @_;
692
693 # remove trailing slashes from source and destination files
694 $src =~ s/[\\\/]+$//;
695 $dest =~ s/[\\\/]+$//;
696
697 # Ensure file paths are absolute IF requested to do so
698 # Soft_linking didn't work for relative paths
699 if(defined $ensure_paths_absolute && $ensure_paths_absolute) {
700 # We need to ensure that the src file is the absolute path
701 # See http://perldoc.perl.org/File/Spec.html
702 if(!File::Spec->file_name_is_absolute( $src )) { # it's relative
703 $src = File::Spec->rel2abs($src); # make absolute
704 }
705 # Might as well ensure that the destination file's absolute path is used
706 if(!File::Spec->file_name_is_absolute( $dest )) {
707 $dest = File::Spec->rel2abs($dest); # make absolute
708 }
709 }
710
711 # a few sanity checks
712 if (!-e $src) {
713 print STDERR "util::soft_link source file $src does not exist\n";
714 return 0;
715 }
716
717 my $dest_dir = &File::Basename::dirname($dest);
718 mk_all_dir($dest_dir) if (!-e $dest_dir);
719
720 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
721
722 # symlink not supported on windows
723 &File::Copy::copy ($src, $dest);
724
725 } elsif (!eval {symlink($src, $dest)}) {
726 print STDERR "util::soft_link: unable to create soft link.\n";
727 return 0;
728 }
729
730 return 1;
731}
732
733# Primarily for filenames generated by processing
734# content of HTML files (which are mapped to UTF-8 internally)
735#
736# To turn this into an octet string that really exists on the file
737# system:
738# 1. don't need to do anything special for Unix-based systems
739# (as underlying file system is byte-code)
740# 2. need to map to short DOS filenames for Windows
741
742sub utf8_to_real_filename
743{
744 my ($utf8_filename) = @_;
745
746 my $real_filename;
747
748 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
749 require Win32;
750
751 print STDERR "***** utf8 filename = $utf8_filename\n\n\n";
752
753 my $unicode_filename = decode("utf8",$utf8_filename);
754 $real_filename = Win32::GetShortPathName($unicode_filename);
755 }
756 else {
757 $real_filename = $utf8_filename;
758 }
759
760 return $real_filename;
761}
762
763sub fd_exists
764{
765 my $filename_full_path = shift @_;
766 my $test_op = shift @_ || "-e";
767
768 # By default tests for existance of file or directory (-e)
769 # Can be made more specific by providing second parameter (e.g. -f or -d)
770
771 my $exists = 0;
772
773 # Support for HDFS [jmt12]
774 if (&util::isHDFS($filename_full_path))
775 {
776 # very limited test op support for HDFS
777 if ($test_op ne '-d' && $test_op ne '-e' && $test_op ne '-z')
778 {
779 $test_op = '-e';
780 }
781 my $result = &util::executeHDFSCommand('test ' . $test_op, $filename_full_path);
782 return ($result == 0);
783 }
784
785 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
786 require Win32;
787 my $filename_short_path = Win32::GetShortPathName($filename_full_path);
788 if (!defined $filename_short_path) {
789 # Was probably still in UTF8 form (not what is needed on Windows)
790 my $unicode_filename_full_path = eval "decode(\"utf8\",\$filename_full_path)";
791 if (defined $unicode_filename_full_path) {
792 $filename_short_path = Win32::GetShortPathName($unicode_filename_full_path);
793 }
794 }
795 $filename_full_path = $filename_short_path;
796 }
797
798 if (defined $filename_full_path) {
799 $exists = eval "($test_op \$filename_full_path)";
800 }
801
802 return $exists;
803}
804
805sub file_exists
806{
807 my ($filename_full_path) = @_;
808# if ($filename_full_path =~ /^hdfs:/)
809# {
810# print "-> util::file_exists(" . $filename_full_path . ")\n";
811# }
812 return fd_exists($filename_full_path,"-f");
813}
814
815sub dir_exists
816{
817 my ($filename_full_path) = @_;
818# if ($filename_full_path =~ /^hdfs:/)
819# {
820# print "-> util::dir_exists(" . $filename_full_path . ")\n";
821# }
822 return fd_exists($filename_full_path,"-d");
823}
824
825
826
827# updates a copy of a directory in some other part of the filesystem
828# verbosity settings are: 0=low, 1=normal, 2=high
829# both $fromdir and $todir should be absolute paths
830sub cachedir {
831 my ($fromdir, $todir, $verbosity) = @_;
832 $verbosity = 1 unless defined $verbosity;
833
834 # use / for the directory separator, remove duplicate and
835 # trailing slashes
836 $fromdir=~s/[\\\/]+/\//g;
837 $fromdir=~s/[\\\/]+$//;
838 $todir=~s/[\\\/]+/\//g;
839 $todir=~s/[\\\/]+$//;
840
841 &mk_all_dir ($todir);
842
843 # get the directories in ascending order
844 if (!opendir (FROMDIR, $fromdir)) {
845 print STDERR "util::cachedir could not read directory $fromdir\n";
846 return;
847 }
848 my @fromdir = grep (!/^\.\.?$/, sort(readdir (FROMDIR)));
849 closedir (FROMDIR);
850
851 if (!opendir (TODIR, $todir)) {
852 print STDERR "util::cacedir could not read directory $todir\n";
853 return;
854 }
855 my @todir = grep (!/^\.\.?$/, sort(readdir (TODIR)));
856 closedir (TODIR);
857
858 my $fromi = 0;
859 my $toi = 0;
860
861 while ($fromi < scalar(@fromdir) || $toi < scalar(@todir)) {
862# print "fromi: $fromi toi: $toi\n";
863
864 # see if we should delete a file/directory
865 # this should happen if the file/directory
866 # is not in the from list or if its a different
867 # size, or has an older timestamp
868 if ($toi < scalar(@todir)) {
869 if (($fromi >= scalar(@fromdir)) ||
870 ($todir[$toi] lt $fromdir[$fromi] ||
871 ($todir[$toi] eq $fromdir[$fromi] &&
872 &differentfiles("$fromdir/$fromdir[$fromi]","$todir/$todir[$toi]",
873 $verbosity)))) {
874
875 # the files are different
876 &rm_r("$todir/$todir[$toi]");
877 splice(@todir, $toi, 1); # $toi stays the same
878
879 } elsif ($todir[$toi] eq $fromdir[$fromi]) {
880 # the files are the same
881 # if it is a directory, check its contents
882 if (-d "$todir/$todir[$toi]") {
883 &cachedir ("$fromdir/$fromdir[$fromi]",
884 "$todir/$todir[$toi]", $verbosity);
885 }
886
887 $toi++;
888 $fromi++;
889 next;
890 }
891 }
892
893 # see if we should insert a file/directory
894 # we should insert a file/directory if there
895 # is no tofiles left or if the tofile does not exist
896 if ($fromi < scalar(@fromdir) && ($toi >= scalar(@todir) ||
897 $todir[$toi] gt $fromdir[$fromi])) {
898 &cp_r ("$fromdir/$fromdir[$fromi]", "$todir/$fromdir[$fromi]");
899 splice (@todir, $toi, 0, $fromdir[$fromi]);
900
901 $toi++;
902 $fromi++;
903 }
904 }
905}
906
907# this function returns -1 if either file is not found
908# assumes that $file1 and $file2 are absolute file names or
909# in the current directory
910# $file2 is allowed to be newer than $file1
911sub differentfiles {
912 my ($file1, $file2, $verbosity) = @_;
913 $verbosity = 1 unless defined $verbosity;
914
915 $file1 =~ s/\/+$//;
916 $file2 =~ s/\/+$//;
917
918 my ($file1name) = $file1 =~ /\/([^\/]*)$/;
919 my ($file2name) = $file2 =~ /\/([^\/]*)$/;
920
921 return -1 unless (-e $file1 && -e $file2);
922 if ($file1name ne $file2name) {
923 print STDERR "filenames are not the same\n" if ($verbosity >= 2);
924 return 1;
925 }
926
927 my @file1stat = stat ($file1);
928 my @file2stat = stat ($file2);
929
930 if (-d $file1) {
931 if (! -d $file2) {
932 print STDERR "one file is a directory\n" if ($verbosity >= 2);
933 return 1;
934 }
935 return 0;
936 }
937
938 # both must be regular files
939 unless (-f $file1 && -f $file2) {
940 print STDERR "one file is not a regular file\n" if ($verbosity >= 2);
941 return 1;
942 }
943
944 # the size of the files must be the same
945 if ($file1stat[7] != $file2stat[7]) {
946 print STDERR "different sized files\n" if ($verbosity >= 2);
947 return 1;
948 }
949
950 # the second file cannot be older than the first
951 if ($file1stat[9] > $file2stat[9]) {
952 print STDERR "file is older\n" if ($verbosity >= 2);
953 return 1;
954 }
955
956 return 0;
957}
958
959
960sub get_tmp_filename
961{
962 my $file_ext = shift(@_) || undef;
963
964 my $opt_dot_file_ext = "";
965 if (defined $file_ext) {
966 if ($file_ext !~ m/\./) {
967 # no dot, so needs one added in at start
968 $opt_dot_file_ext = ".$file_ext"
969 }
970 else {
971 # allow for "extensions" such as _metadata.txt to be handled
972 # gracefully
973 $opt_dot_file_ext = $file_ext;
974 }
975 }
976
977 my $tmpdir = filename_cat($ENV{'GSDLHOME'}, "tmp");
978 &mk_all_dir ($tmpdir) unless -e $tmpdir;
979
980 my $count = 1000;
981 my $rand = int(rand $count);
982 my $full_tmp_filename = &filename_cat($tmpdir, "F$rand$opt_dot_file_ext");
983
984 while (-e $full_tmp_filename) {
985 $rand = int(rand $count);
986 $full_tmp_filename = &filename_cat($tmpdir, "F$rand$opt_dot_file_ext");
987 $count++;
988 }
989
990 return $full_tmp_filename;
991}
992
993sub get_timestamped_tmp_folder
994{
995
996 my $tmp_dirname;
997 if(defined $ENV{'GSDLCOLLECTDIR'}) {
998 $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
999 } elsif(defined $ENV{'GSDLHOME'}) {
1000 $tmp_dirname = $ENV{'GSDLHOME'};
1001 } else {
1002 return undef;
1003 }
1004
1005 $tmp_dirname = &util::filename_cat($tmp_dirname, "tmp");
1006 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
1007
1008 # add the timestamp into the path otherwise we can run into problems
1009 # if documents have the same name
1010 my $timestamp = time;
1011 my $time_tmp_dirname = &util::filename_cat($tmp_dirname, $timestamp);
1012 $tmp_dirname = $time_tmp_dirname;
1013 my $i = 1;
1014 while (-e $tmp_dirname) {
1015 $tmp_dirname = "$time_tmp_dirname$i";
1016 $i++;
1017 }
1018 &util::mk_dir($tmp_dirname);
1019
1020 return $tmp_dirname;
1021}
1022
1023sub get_timestamped_tmp_filename_in_collection
1024{
1025
1026 my ($input_filename, $output_ext) = @_;
1027 # derive tmp filename from input filename
1028 my ($tailname, $dirname, $suffix)
1029 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
1030
1031 # softlink to collection tmp dir
1032 my $tmp_dirname = &util::get_timestamped_tmp_folder();
1033 $tmp_dirname = $dirname unless defined $tmp_dirname;
1034
1035 # following two steps copied from ConvertBinaryFile
1036 # do we need them?? can't use them as is, as they use plugin methods.
1037
1038 #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
1039
1040 # URLEncode this since htmls with images where the html filename is utf8 don't seem
1041 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
1042 # files on the filesystem.
1043 #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
1044 if (defined $output_ext) {
1045 $output_ext = ".$output_ext"; # add the dot
1046 } else {
1047 $output_ext = $suffix;
1048 }
1049 $output_ext= lc($output_ext);
1050 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$output_ext");
1051
1052 return $tmp_filename;
1053}
1054
1055sub get_toplevel_tmp_dir
1056{
1057 return filename_cat($ENV{'GSDLHOME'}, "tmp");
1058}
1059
1060
1061sub filename_to_regex {
1062 my $filename = shift (@_);
1063
1064 # need to make single backslashes double so that regex works
1065 $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);
1066
1067 # note that the first part of a substitution is a regex, so RE chars need to be escaped,
1068 # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
1069 $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
1070 $filename =~ s@\(@\\(@g; # escape brackets
1071 $filename =~ s@\)@\\)@g; # escape brackets
1072 $filename =~ s@\[@\\[@g; # escape brackets
1073 $filename =~ s@\]@\\]@g; # escape brackets
1074
1075 return $filename;
1076}
1077
1078sub unregex_filename {
1079 my $filename = shift (@_);
1080
1081 # need to put doubled backslashes for regex back to single
1082 $filename =~ s/\\\./\./g; # remove RE syntax for .
1083 $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
1084 $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
1085 $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
1086 $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
1087
1088 # \\ goes to \
1089 # This is the last step in reverse mirroring the order of steps in filename_to_regex()
1090 $filename =~ s/\\\\/\\/g; # remove RE syntax for \
1091 return $filename;
1092}
1093
1094sub filename_cat {
1095 my $first_file = shift(@_);
1096 my (@filenames) = @_;
1097
1098# Useful for debugging
1099# -- might make sense to call caller(0) rather than (1)??
1100# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1101# print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
1102
1103 # If first_file is not null or empty, then add it back into the list
1104 if (defined $first_file && $first_file =~ /\S/) {
1105 unshift(@filenames, $first_file);
1106 }
1107
1108 my $filename = join("/", @filenames);
1109
1110 # remove duplicate slashes and remove the last slash
1111 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1112 $filename =~ s/[\\\/]+/\\/g;
1113 } else {
1114 $filename =~ s/[\/]+/\//g;
1115 # DB: want a filename abc\de.html to remain like this
1116 }
1117 $filename =~ s/[\\\/]$//;
1118
1119 # restore protocols if present [jmt12]
1120 $filename =~ s/(file|hdfs|https?):\/([^\/])/$1:\/\/$2/g;
1121
1122 return $filename;
1123}
1124
1125
1126sub pathname_cat {
1127 my $first_path = shift(@_);
1128 my (@pathnames) = @_;
1129
1130 # If first_path is not null or empty, then add it back into the list
1131 if (defined $first_path && $first_path =~ /\S/) {
1132 unshift(@pathnames, $first_path);
1133 }
1134
1135 my $join_char;
1136 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1137 $join_char = ";";
1138 } else {
1139 $join_char = ":";
1140 }
1141
1142 my $pathname = join($join_char, @pathnames);
1143
1144 # remove duplicate slashes
1145 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1146 $pathname =~ s/[\\\/]+/\\/g;
1147 } else {
1148 $pathname =~ s/[\/]+/\//g;
1149 # DB: want a pathname abc\de.html to remain like this
1150 }
1151
1152 return $pathname;
1153}
1154
1155
1156sub tidy_up_oid {
1157 my ($OID) = @_;
1158 if ($OID =~ /\./) {
1159 print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
1160 $OID =~ s/\.//g; #remove any periods
1161 }
1162 if ($OID =~ /^\s.*\s$/) {
1163 print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
1164 # remove starting and trailing whitespace
1165 $OID =~ s/^\s+//;
1166 $OID =~ s/\s+$//;
1167 }
1168 if ($OID =~ /^[\d]*$/) {
1169 print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
1170 $OID = "D" . $OID;
1171 }
1172
1173 return $OID;
1174}
1175sub envvar_prepend {
1176 my ($var,$val) = @_;
1177
1178 # do not prepend any value/path that's already in the environment variable
1179
1180 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
1181 if (!defined($ENV{$var})) {
1182 $ENV{$var} = "$val";
1183 }
1184 elsif($ENV{$var} !~ m/$escaped_val/) {
1185 $ENV{$var} = "$val;".$ENV{$var};
1186 }
1187}
1188
1189sub envvar_append {
1190 my ($var,$val) = @_;
1191
1192 # do not append any value/path that's already in the environment variable
1193
1194 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
1195 if (!defined($ENV{$var})) {
1196 $ENV{$var} = "$val";
1197 }
1198 elsif($ENV{$var} !~ m/$escaped_val/) {
1199 $ENV{$var} .= ";$val";
1200 }
1201}
1202
1203
1204# splits a filename into a prefix and a tail extension using the tail_re, or
1205# if that fails, splits on the file_extension . (dot)
1206sub get_prefix_and_tail_by_regex {
1207
1208 my ($filename,$tail_re) = @_;
1209
1210 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
1211 if ((!defined $file_prefix) || (!defined $file_ext)) {
1212 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
1213 }
1214
1215 return ($file_prefix,$file_ext);
1216}
1217
1218# get full path and file only path from a base_dir (which may be empty) and
1219# file (which may contain directories)
1220sub get_full_filenames {
1221 my ($base_dir, $file) = @_;
1222
1223 my $filename_full_path = $file;
1224 # add on directory if present
1225 $filename_full_path = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\S/;
1226
1227 my $filename_no_path = $file;
1228
1229 # remove directory if present
1230 $filename_no_path =~ s/^.*[\/\\]//;
1231 return ($filename_full_path, $filename_no_path);
1232}
1233
1234# returns the path of a file without the filename -- ie. the directory the file is in
1235sub filename_head {
1236 my $filename = shift(@_);
1237
1238 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1239 $filename =~ s/[^\\\\]*$//;
1240 }
1241 else {
1242 $filename =~ s/[^\\\/]*$//;
1243 }
1244
1245 return $filename;
1246}
1247
1248
1249
1250# returns 1 if filename1 and filename2 point to the same
1251# file or directory
1252sub filenames_equal {
1253 my ($filename1, $filename2) = @_;
1254
1255 # use filename_cat to clean up trailing slashes and
1256 # multiple slashes
1257 $filename1 = filename_cat ($filename1);
1258 $filename2 = filename_cat ($filename2);
1259
1260 # filenames not case sensitive on windows
1261 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1262 $filename1 =~ tr/[A-Z]/[a-z]/;
1263 $filename2 =~ tr/[A-Z]/[a-z]/;
1264 }
1265 return 1 if $filename1 eq $filename2;
1266 return 0;
1267}
1268
1269# If filename is relative to within_dir, returns the relative path of filename to that directory
1270# with slashes in the filename returned as they were in the original (absolute) filename.
1271sub filename_within_directory
1272{
1273 my ($filename,$within_dir) = @_;
1274
1275 if ($within_dir !~ m/[\/\\]$/) {
1276 my $dirsep = &util::get_dirsep();
1277 $within_dir .= $dirsep;
1278 }
1279
1280 $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets
1281 if ($filename =~ m/^$within_dir(.*)$/) {
1282 $filename = $1;
1283 }
1284
1285 return $filename;
1286}
1287
1288# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
1289# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
1290# The subpath returned will also be a URL type filename.
1291sub filename_within_directory_url_format
1292{
1293 my ($filename,$within_dir) = @_;
1294
1295 # convert parameters only to / slashes if Windows
1296
1297 my $filename_urlformat = &filepath_to_url_format($filename);
1298 my $within_dir_urlformat = &filepath_to_url_format($within_dir);
1299
1300 #if ($within_dir_urlformat !~ m/\/$/) {
1301 # make sure directory ends with a slash
1302 #$within_dir_urlformat .= "/";
1303 #}
1304
1305 my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
1306
1307 #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
1308
1309 # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
1310 if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
1311 $filename_urlformat = $1;
1312 }
1313
1314 return $filename_urlformat;
1315}
1316
1317# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
1318# since on Linux it doesn't represent a file separator but an escape char).
1319sub filepath_to_url_format
1320{
1321 my ($filepath) = @_;
1322 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1323 # Only need to worry about Windows, as Unix style directories already in url-format
1324 # Convert Windows style \ => /
1325 $filepath =~ s@\\@/@g;
1326 }
1327 return $filepath;
1328}
1329
1330# regex filepaths on windows may include \\ as path separator. Convert \\ to /
1331sub filepath_regex_to_url_format
1332{
1333 my ($filepath) = @_;
1334 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1335 # Only need to worry about Windows, as Unix style directories already in url-format
1336 # Convert Windows style \\ => /
1337 $filepath =~ s@\\\\@/@g;
1338 }
1339 return $filepath;
1340
1341}
1342
1343# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
1344# and ignores trailing /
1345# returns (file, dirs) dirs will be empty if no subdirs
1346sub url_fileparse
1347{
1348 my ($filepath) = @_;
1349 # remove trailing /
1350 $filepath =~ s@/$@@;
1351 if ($filepath !~ m@/@) {
1352 return ($filepath, "");
1353 }
1354 my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
1355 return ($file, $dirs);
1356
1357}
1358
1359
1360sub filename_within_collection
1361{
1362 my ($filename) = @_;
1363
1364 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
1365
1366 if (defined $collect_dir) {
1367
1368 # if from within GSDLCOLLECTDIR, then remove directory prefix
1369 # so source_filename is realative to it. This is done to aid
1370 # portability, i.e. the collection can be moved to somewhere
1371 # else on the file system and the archives directory will still
1372 # work. This is needed, for example in the applet version of
1373 # GLI where GSDLHOME/collect on the server will be different to
1374 # the collect directory of the remove user. Of course,
1375 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
1376 # it back into a full pathname.
1377
1378 $filename = filename_within_directory($filename,$collect_dir);
1379 }
1380
1381 return $filename;
1382}
1383
1384sub prettyprint_file
1385{
1386 my ($base_dir,$file,$gli) = @_;
1387
1388 my $filename_full_path = &util::filename_cat($base_dir,$file);
1389
1390 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1391 require Win32;
1392
1393 # For some reason base_dir in the form c:/a/b/c
1394 # This leads to confusion later on, so turn it back into
1395 # the more usual Windows form
1396 $base_dir =~ s/\//\\/g;
1397 my $long_base_dir = Win32::GetLongPathName($base_dir);
1398 my $long_full_path = Win32::GetLongPathName($filename_full_path);
1399
1400 $file = filename_within_directory($long_full_path,$long_base_dir);
1401 $file = encode("utf8",$file) if ($gli);
1402 }
1403
1404 return $file;
1405}
1406
1407
1408sub upgrade_if_dos_filename
1409{
1410 my ($filename_full_path,$and_encode) = @_;
1411
1412 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1413 # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
1414 # to its long (Windows) version
1415 my $long_filename = Win32::GetLongPathName($filename_full_path);
1416 if (defined $long_filename) {
1417 $filename_full_path = $long_filename;
1418 }
1419 # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
1420 $filename_full_path =~ s/^(.):/\u$1:/;
1421 if ((defined $and_encode) && ($and_encode)) {
1422 $filename_full_path = encode("utf8",$filename_full_path);
1423 }
1424 }
1425
1426 return $filename_full_path;
1427}
1428
1429
1430sub downgrade_if_dos_filename
1431{
1432 my ($filename_full_path) = @_;
1433
1434 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1435 require Win32;
1436
1437 # Ensure the given long Windows filename is in a form that can
1438 # be opened by Perl => convert it to a short DOS-like filename
1439
1440 my $short_filename = Win32::GetShortPathName($filename_full_path);
1441 if (defined $short_filename) {
1442 $filename_full_path = $short_filename;
1443 }
1444 # Make sure initial drive letter is lower-case (to fit in
1445 # with rest of Greenstone)
1446 $filename_full_path =~ s/^(.):/\u$1:/;
1447 }
1448
1449 return $filename_full_path;
1450}
1451
1452sub block_filename
1453{
1454 my ($block_hash,$filename) = @_;
1455
1456 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
1457
1458 # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
1459 my $lower_filename = lc($filename);
1460 $block_hash->{'file_blocks'}->{$lower_filename} = 1;
1461# my $lower_drive = $filename;
1462# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
1463
1464# my $upper_drive = $filename;
1465# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
1466#
1467# $block_hash->{'file_blocks'}->{$lower_drive} = 1;
1468# $block_hash->{'file_blocks'}->{$upper_drive} = 1;
1469 }
1470 else {
1471 $block_hash->{'file_blocks'}->{$filename} = 1;
1472 }
1473}
1474
1475
1476sub filename_is_absolute
1477{
1478 my ($filename) = @_;
1479
1480 # support for explicit protocol prefixes, for example file: or hdfs:,
1481 # which must be absolute [jmt12]
1482 if ($filename =~ /^\w+:\/\//)
1483 {
1484 return 1;
1485 }
1486
1487 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1488 return ($filename =~ m/^(\w:)?\\/);
1489 }
1490 else {
1491 return ($filename =~ m/^\//);
1492 }
1493}
1494
1495
1496## @method make_absolute()
1497#
1498# Ensure the given file path is absolute in respect to the given base path.
1499#
1500# @param $base_dir A string denoting the base path the given dir must be
1501# absolute to.
1502# @param $dir The directory to be made absolute as a string. Note that the
1503# dir may already be absolute, in which case it will remain
1504# unchanged.
1505# @return The now absolute form of the directory as a string.
1506#
1507# @author John Thompson, DL Consulting Ltd.
1508# @copy 2006 DL Consulting Ltd.
1509#
1510#used in buildcol.pl, doesn't work for all cases --kjdon
1511sub make_absolute {
1512
1513 my ($base_dir, $dir) = @_;
1514### print STDERR "dir = $dir\n";
1515 $dir =~ s/[\\\/]+/\//g;
1516 $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
1517 $dir =~ s|^/tmp_mnt||;
1518 1 while($dir =~ s|/[^/]*/\.\./|/|g);
1519 $dir =~ s|/[.][.]?/|/|g;
1520 $dir =~ tr|/|/|s;
1521### print STDERR "dir = $dir\n";
1522
1523 return $dir;
1524}
1525## make_absolute() ##
1526
1527sub get_dirsep {
1528
1529 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1530 return "\\";
1531 } else {
1532 return "\/";
1533 }
1534}
1535
1536sub get_os_dirsep {
1537
1538 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1539 return "\\\\";
1540 } else {
1541 return "\\\/";
1542 }
1543}
1544
1545sub get_re_dirsep {
1546
1547 return "\\\\|\\\/";
1548}
1549
1550
1551sub get_dirsep_tail {
1552 my ($filename) = @_;
1553
1554 # returns last part of directory or filename
1555 # On unix e.g. a/b.d => b.d
1556 # a/b/c => c
1557
1558 my $dirsep = get_re_dirsep();
1559 my @dirs = split (/$dirsep/, $filename);
1560 my $tail = pop @dirs;
1561
1562 # - caused problems under windows
1563 #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
1564
1565 return $tail;
1566}
1567
1568
1569# if this is running on windows we want binaries to end in
1570# .exe, otherwise they don't have to end in any extension
1571sub get_os_exe {
1572 return ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
1573 return "";
1574}
1575
1576
1577# test to see whether this is a big or little endian machine
1578sub is_little_endian
1579{
1580 # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
1581 # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
1582 # Otherwise, it's little endian
1583
1584 #return 0 if $^O =~ /^darwin$/i;
1585 #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
1586
1587 # Going back to stating exactly whether the machine is little endian
1588 # or big endian, without any special case for Macs. Since for rata it comes
1589 # back with little endian and for shuttle with bigendian.
1590 return (ord(substr(pack("s",1), 0, 1)) == 1);
1591}
1592
1593
1594# will return the collection name if successful, "" otherwise
1595sub use_collection {
1596 my ($collection, $collectdir) = @_;
1597
1598 if (!defined $collectdir || $collectdir eq "") {
1599 $collectdir = &filename_cat ($ENV{'GSDLHOME'}, "collect");
1600 }
1601
1602 # get and check the collection
1603 if (!defined($collection) || $collection eq "") {
1604 if (defined $ENV{'GSDLCOLLECTION'}) {
1605 $collection = $ENV{'GSDLCOLLECTION'};
1606 } else {
1607 print STDOUT "No collection specified\n";
1608 return "";
1609 }
1610 }
1611
1612 if ($collection eq "modelcol") {
1613 print STDOUT "You can't use modelcol.\n";
1614 return "";
1615 }
1616
1617 # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
1618 # are defined
1619 $ENV{'GSDLCOLLECTION'} = $collection;
1620 $ENV{'GSDLCOLLECTDIR'} = &filename_cat ($collectdir, $collection);
1621
1622 # make sure this collection exists
1623 if (!-e $ENV{'GSDLCOLLECTDIR'}) {
1624 print STDOUT "Invalid collection ($collection).\n";
1625 return "";
1626 }
1627
1628 # everything is ready to go
1629 return $collection;
1630}
1631
1632sub get_current_collection_name {
1633 return $ENV{'GSDLCOLLECTION'};
1634}
1635
1636
1637# will return the collection name if successful, "" otherwise.
1638# Like use_collection (above) but for greenstone 3 (taking account of site level)
1639
1640sub use_site_collection {
1641 my ($site, $collection, $collectdir) = @_;
1642
1643 if (!defined $collectdir || $collectdir eq "") {
1644 die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
1645 $collectdir = &filename_cat ($ENV{'GSDL3HOME'}, "sites", $site, "collect");
1646 }
1647
1648 # collectdir explicitly set by this point (using $site variable if required).
1649 # Can call "old" gsdl2 use_collection now.
1650
1651 return use_collection($collection,$collectdir);
1652}
1653
1654
1655
1656sub locate_config_file
1657{
1658 my ($file) = @_;
1659
1660 my $locations = locate_config_files($file);
1661
1662 return shift @$locations; # returns undef if 'locations' is empty
1663}
1664
1665
1666sub locate_config_files
1667{
1668 my ($file) = @_;
1669
1670 my @locations = ();
1671
1672 if (-e $file) {
1673 # Clearly specified (most likely full filename)
1674 # No need to hunt in 'etc' directories, return value unchanged
1675 push(@locations,$file);
1676 }
1677 else {
1678 # Check for collection specific one before looking in global GSDL 'etc'
1679 if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
1680 my $test_collect_etc_filename
1681 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $file);
1682
1683 if (-e $test_collect_etc_filename) {
1684 push(@locations,$test_collect_etc_filename);
1685 }
1686 }
1687 my $test_main_etc_filename
1688 = &util::filename_cat($ENV{'GSDLHOME'},"etc", $file);
1689 if (-e $test_main_etc_filename) {
1690 push(@locations,$test_main_etc_filename);
1691 }
1692 }
1693
1694 return \@locations;
1695}
1696
1697
1698sub hyperlink_text
1699{
1700 my ($text) = @_;
1701
1702 $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
1703 $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
1704
1705 return $text;
1706}
1707
1708
1709# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
1710# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
1711sub is_dir_empty
1712{
1713 my ($path) = @_;
1714 opendir DIR, $path;
1715 while(my $entry = readdir DIR) {
1716 next if($entry =~ /^\.\.?$/);
1717 closedir DIR;
1718 return 0;
1719 }
1720 closedir DIR;
1721 return 1;
1722}
1723
1724# Returns the given filename converted using either URL encoding or base64
1725# encoding, as specified by $rename_method. If the given filename has no suffix
1726# (if it is just the tailname), then $no_suffix should be some defined value.
1727# rename_method can be url, none, base64
1728sub rename_file {
1729 my ($filename, $rename_method, $no_suffix) = @_;
1730
1731 if(!$filename) { # undefined or empty string
1732 return $filename;
1733 }
1734
1735 if (!$rename_method) {
1736 print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
1737 # Debugging information
1738 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1739 # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
1740 $rename_method = "url";
1741 } elsif($rename_method eq "none") {
1742 return $filename; # would have already been renamed
1743 }
1744
1745 # No longer replace spaces with underscores, since underscores mess with incremental rebuild
1746 ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
1747 ###$filename =~ s/ /_/g;
1748
1749 my ($tailname,$dirname,$suffix);
1750 if($no_suffix) { # given a tailname, no suffix
1751 ($tailname,$dirname) = File::Basename::fileparse($filename);
1752 }
1753 else {
1754 ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
1755 }
1756 if (!$suffix) {
1757 $suffix = "";
1758 }
1759 else {
1760 $suffix = lc($suffix);
1761 }
1762
1763 if ($rename_method eq "url") {
1764 $tailname = &unicode::url_encode($tailname);
1765 }
1766 elsif ($rename_method eq "base64") {
1767 $tailname = &unicode::base64_encode($tailname);
1768 $tailname =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
1769 }
1770
1771 $filename = "$tailname$suffix";
1772 $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
1773
1774 return $filename;
1775}
1776
1777
1778# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
1779sub rename_ldb_or_bdb_file {
1780 my ($filename_no_ext) = @_;
1781
1782 my $new_filename = "$filename_no_ext.gdb";
1783 return if (-f $new_filename); # if the file has the right extension, don't need to do anything
1784 # try ldb
1785 my $old_filename = "$filename_no_ext.ldb";
1786
1787 if (-f $old_filename) {
1788 print STDERR "Renaming $old_filename to $new_filename\n";
1789 rename ($old_filename, $new_filename)
1790 || print STDERR "Rename failed: $!\n";
1791 return;
1792 }
1793 # try bdb
1794 $old_filename = "$filename_no_ext.bdb";
1795 if (-f $old_filename) {
1796 print STDERR "Renaming $old_filename to $new_filename\n";
1797 rename ($old_filename, $new_filename)
1798 || print STDERR "Rename failed: $!\n";
1799 return;
1800 }
1801}
1802
1803sub os_dir() {
1804
1805 my $gsdlarch = "";
1806 if(defined $ENV{'GSDLARCH'}) {
1807 $gsdlarch = $ENV{'GSDLARCH'};
1808 }
1809 return $ENV{'GSDLOS'}.$gsdlarch;
1810}
1811
1812# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1813# By default, /greenstone3 for GS3 or /greenstone for GS2.
1814sub get_greenstone_url_prefix() {
1815 # if already set on a previous occasion, just return that
1816 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1817 return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1818
1819 my ($configfile, $urlprefix, $defaultUrlprefix);
1820 my @propertynames = ();
1821
1822 if($ENV{'GSDL3SRCHOME'}) {
1823 $defaultUrlprefix = "/greenstone3";
1824 $configfile = &util::filename_cat($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1825 push(@propertynames, qw/path\s*\=/);
1826 } else {
1827 $defaultUrlprefix = "/greenstone";
1828 $configfile = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1829 push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1830 }
1831
1832 $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1833
1834 if(!$urlprefix) { # no values found for URL prefix, use default values
1835 $urlprefix = $defaultUrlprefix;
1836 } else {
1837 #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1838 $urlprefix =~ s/^\///; # remove the starting slash
1839 my @dirs = split(/(\\|\/)/, $urlprefix);
1840 $urlprefix = shift(@dirs);
1841
1842 if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1843 $urlprefix = "/$urlprefix";
1844 }
1845 }
1846
1847 # set for the future
1848 $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1849# print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1850 return $urlprefix;
1851}
1852
1853
1854# Given a config file (xml or java properties file) and a list/array of regular expressions
1855# that represent property names to match on, this function will return the value for the 1st
1856# matching property name. If the return value is undefined, no matching property was found.
1857sub extract_propvalue_from_file() {
1858 my ($configfile, $propertynames) = @_;
1859
1860 my $value;
1861 unless(open(FIN, "<$configfile")) {
1862 print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1863 return $value; # not initialised
1864 }
1865
1866 # Read the entire file at once, as one single line, then close it
1867 my $filecontents;
1868 {
1869 local $/ = undef;
1870 $filecontents = <FIN>;
1871 }
1872 close(FIN);
1873
1874 foreach my $regex (@$propertynames) {
1875 ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1876 if($value) {
1877 $value =~ s/^\"//; # remove any startquotes
1878 $value =~ s/\".*$//; # remove the 1st endquotes (if any) followed by any xml
1879 last; # found value for a matching property, break from loop
1880 }
1881 }
1882
1883 return $value;
1884}
1885
1886# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1887# given that perllib is in @INC in order to invoke this subroutine.
1888# Call as follows -- after setting up INC to include perllib and
1889# after setting up GSDLHOME and GSDLOS:
1890#
1891# require util;
1892# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1893#
1894sub setup_greenstone_env() {
1895 my ($GSDLHOME, $GSDLOS) = @_;
1896
1897 #my %env_map = ();
1898 # Get the localised ENV settings of running a localised source setup.bash
1899 # and put it into the ENV here. Need to clear GSDLHOME before running setup
1900 #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1901 my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";
1902 if($GSDLOS =~ m/windows/i) {
1903 #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1904 $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1905 }
1906 if (!open(PIN, "$perl_command |")) {
1907 print STDERR ("Unable to execute command: $perl_command. $!\n");
1908 }
1909
1910 while (defined (my $perl_output_line = <PIN>)) {
1911 my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1912 #$env_map{$key}=$value;
1913 $ENV{$key}=$value;
1914 }
1915 close (PIN);
1916
1917 # If any keys in $ENV don't occur in Greenstone's localised env
1918 # (stored in $env_map), delete those entries from $ENV
1919 #foreach $key (keys %ENV) {
1920 # if(!defined $env_map{$key}) {
1921 # print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{'$key'}\n";
1922 # delete $ENV{$key}; # del $ENV(key, value) pair
1923 # }
1924 #}
1925 #undef %env_map;
1926}
1927
1928sub get_perl_exec() {
1929 my $perl_exec = $^X; # may return just "perl"
1930
1931 if($ENV{'PERLPATH'}) {
1932 # OR: # $perl_exec = &util::filename_cat($ENV{'PERLPATH'},"perl");
1933 if($ENV{'GSDLOS'} =~ m/windows/) {
1934 $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1935 } else {
1936 $perl_exec = "$ENV{'PERLPATH'}/perl";
1937 }
1938 } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1939 # containing the full path to the current perl executable we're using
1940 $perl_exec = $Config{perlpath}; # configured path for perl
1941 if (!-e $perl_exec) { # may not point to location on this machine
1942 $perl_exec = $^X; # may return just "perl"
1943 if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1944 print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";
1945 }
1946 }
1947 }
1948
1949 return $perl_exec;
1950}
1951
1952# returns the path to the java command in the JRE included with GS (if any),
1953# quoted to safeguard any spaces in this path, otherwise a simple java
1954# command is returned which assumes and will try for a system java.
1955sub get_java_command {
1956 my $java = "java";
1957 if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1958 # after running setup.bat or from GLI which also runs setup.bat
1959 my $java_bin = &util::filename_cat($ENV{'GSDLHOME'},"packages","jre","bin");
1960 if(-d $java_bin) {
1961 $java = &util::filename_cat($java_bin,"java");
1962 $java = "\"".$java."\""; # quoted to preserve spaces in path
1963 }
1964 }
1965 return $java;
1966}
1967
1968
1969# Given the qualified collection name (colgroup/collection),
1970# returns the collection and colgroup parts
1971sub get_collection_parts {
1972 # http://perldoc.perl.org/File/Basename.html
1973 # my($filename, $directories, $suffix) = fileparse($path);
1974 # "$directories contains everything up to and including the last directory separator in the $path
1975 # including the volume (if applicable). The remainder of the $path is the $filename."
1976 #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);
1977
1978 my $qualified_collection = shift(@_);
1979
1980 # Since activate.pl can be launched from the command-line, including by a user,
1981 # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1982 # Also allow for the accidental inclusion of multiple slashes
1983 my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1984
1985 if(!defined $collection) {
1986 $collection = $colgroup;
1987 $colgroup = "";
1988 }
1989 return ($collection, $colgroup);
1990}
1991
1992# work out the "collectdir/collection" location
1993sub resolve_collection_dir {
1994 my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1995
1996 my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);
1997
1998 if (defined $collect_dir) {
1999 return &util::filename_cat($collect_dir,$colgroup, $collection);
2000 }
2001 else {
2002 if (defined $site) {
2003 return &util::filename_cat($ENV{'GSDL3HOME'},"sites",$site,"collect",$colgroup, $collection);
2004 }
2005 else {
2006 return &util::filename_cat($ENV{'GSDLHOME'},"collect",$colgroup, $collection);
2007 }
2008 }
2009}
2010
2011# ====== John's new fuctions to support HDFS ======
2012
2013# /**
2014# * Executes a HDFS command without caring about the resulting output while
2015# * still reacting appropriately to failed executions.
2016# */
2017sub executeHDFSCommand
2018{
2019 my $action = shift(@_);
2020 my $command = 'hadoop fs -' . $action . ' "' . join('" "', @_) . '"';
2021 my $result = `$command 2>&1`;
2022 my $return_value = $?;
2023 ###rint STDERR "-> util::executeHDFSCommand('" . $command . "') => " . $return_value . "\n";
2024 return $return_value;
2025}
2026
2027# /** @function file_canread()
2028# */
2029sub file_canread
2030{
2031 my ($filename_full_path) = @_;
2032 # the HDFS support doesn't have '-r' so it will revert to '-e'
2033 return fd_exists($filename_full_path,"-r");
2034}
2035# /** file_canread() **/
2036
2037sub file_openfdcommand
2038{
2039 my ($filename_full_path, $mode) = @_;
2040 ##rint STDERR "-> util::file_openfdcommand('" . $filename_full_path . "', '" . $mode . "')\n";
2041 # I'll set to read by default, as that is less destructive to precious files
2042 # on your system...
2043 if (!defined $mode)
2044 {
2045 $mode = '<';
2046 }
2047 my $open_fd_command = $mode . $filename_full_path;
2048 if (&util::isHDFS($filename_full_path))
2049 {
2050 # currently don't really support append, but might be able to do something
2051 # like:
2052 # hadoop fs -cat /user/username/folder/csv1.csv \
2053 # /user/username/folder/csv2.csv | hadoop fs -put - \
2054 # /user/username/folder/output.csv
2055 if ($mode eq '>>' || $mode eq '>')
2056 {
2057 # if the file already exists, put won't clobber it like a proper write
2058 # would, so try to delete it (can fail)
2059 if (&util::file_exists($filename_full_path))
2060 {
2061 &util::rm($filename_full_path);
2062 }
2063 # then create the command
2064 $open_fd_command = '| ' . &util::generateHDFSCommand('put', '-', $filename_full_path);
2065 }
2066 else
2067 {
2068 $open_fd_command = &util::generateHDFSCommand('cat', $filename_full_path) . ' |';
2069 }
2070 }
2071 return $open_fd_command;
2072}
2073
2074# /** @function file_readdir()
2075# * Provide a function to return the files within a directory that is aware
2076# * of protocols other than file://
2077# * @param $dirname the full path to the directory
2078# * @param $dir_ref a reference to an array to populate with files
2079# */
2080sub file_readdir
2081{
2082 my ($dirname, $dir_ref) = @_;
2083 my $dir_read = 0;
2084 if (&util::isHDFS($dirname))
2085 {
2086 my $hdfs_command = &util::generateHDFSCommand('ls', $dirname);
2087 my $result = `$hdfs_command 2>&1`;
2088 my @lines = split(/\r?\n/, $result);
2089 foreach my $line (@lines)
2090 {
2091 if ($line =~ /\/([^\/]+)$/)
2092 {
2093 my $file = $1;
2094 push(@{$dir_ref}, $file);
2095 }
2096 }
2097 $dir_read = 1;
2098 }
2099 elsif (opendir(DIR, $dirname))
2100 {
2101 my @dirs = readdir(DIR);
2102 push(@{$dir_ref}, @dirs);
2103 closedir(DIR);
2104 $dir_read = 1;
2105 }
2106 return $dir_read;
2107}
2108# /** file_readdir() **/
2109
2110# /**
2111# */
2112sub file_lastmodified
2113{
2114 my ($filename_full_path) = @_;
2115 my $last_modified = 0;
2116 if (&util::isHDFS($filename_full_path))
2117 {
2118 my $file_stats = file_stats($filename_full_path);
2119 my $mod_date = $file_stats->{'modification_date'};
2120 my $mod_time = $file_stats->{'modification_time'};
2121 # Last modified should be in number of days (as a float) since last
2122 # modified - but I'll just return 0 for now
2123 $last_modified = 0.00;
2124 }
2125 else
2126 {
2127 $last_modified = -M $filename_full_path;
2128 }
2129 return $last_modified;
2130}
2131# /** file_lastmodified() **/
2132
2133# /** @function file_size
2134# * Replacement for "-s" in Greenstone buildtime, as we need a version that
2135# * HDFS aware
2136# */
2137sub file_size
2138{
2139 my ($filename_full_path) = @_;
2140 my $size = 0;
2141 if (&util::isHDFS($filename_full_path))
2142 {
2143 my $file_stats = file_stats($filename_full_path);
2144 $size = $file_stats->{'filesize'};
2145 }
2146 else
2147 {
2148 $size = -s $filename_full_path;
2149 }
2150 return $size;
2151}
2152# /** file_size() **/
2153
2154sub file_stats
2155{
2156 my ($filename_full_path) = @_;
2157 my $stats = {};
2158 if (&util::isHDFS($filename_full_path))
2159 {
2160 # - LS is the only way to get these details from HDFS (-stat doesn't
2161 # provide enough information)
2162 my $hdfs_command = &util::generateHDFSCommand('ls', $filename_full_path);
2163 my $result = `$hdfs_command 2>&1`;
2164 # - parse the results
2165 if ($result =~ /([ds\-][rwx\-]+)\s+(\d+)\s+([^\s]+)\s+([^\s]+)\s+(\d+)\s+(\d\d\d\d-\d\d-\d\d)\s+(\d\d:\d\d)\s+([^\s]+)$/)
2166 {
2167 $stats->{'filename'} = $8;
2168 $stats->{'replicas'} = $2;
2169 $stats->{'filesize'} = $5;
2170 $stats->{'modification_date'} = $6;
2171 $stats->{'modification_time'} = $7;
2172 $stats->{'permissions'} = $1;
2173 $stats->{'userid'} = $3;
2174 $stats->{'groupid'} = $4;
2175 }
2176 else
2177 {
2178 die("Error! Failed to parse HDFS ls result: " . $result . "\n");
2179 }
2180 }
2181 return $stats;
2182}
2183# /** file_stats() **/
2184
2185# /**
2186# */
2187sub generateHDFSCommand
2188{
2189 my $flags = shift(@_);
2190 return 'hadoop fs -' . $flags . ' "' . join('" "', @_) . '"';
2191}
2192# /** generateHDFSCommand() **/
2193
2194# /** @function isHDFS()
2195# * Determine if the given path exists within a HDFS system by checking for
2196# * the expected protocol prefix.
2197# * @param $full_path the path to check
2198# * @return 1 if within HDFS, 0 otherwise
2199# */
2200sub isHDFS
2201{
2202 my ($full_path) = @_;
2203 return (lc(substr($full_path, 0, 7)) eq 'hdfs://');
2204}
2205# /** isHDFS() **/
2206
22071;
Note: See TracBrowser for help on using the repository browser.