source: main/trunk/greenstone2/perllib/util.pm@ 26973

Last change on this file since 26973 was 26973, checked in by kjdon, 11 years ago

don't lowercase the suffix in rename_file - if the original was uppercase, eg .JPG then it won't match .jpg when GLI reads through archive files to find metadata. David can't remember why he did it in the first place. I think there must have been a reason, so maybe it will show up again one day...

  • Property svn:keywords set to Author Date Id Revision
File size: 54.9 KB
Line 
1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package util;
27
28use strict;
29
30use Encode;
31use File::Copy;
32use File::Basename;
33# Config for getting the perlpath in the recommended way, though it uses paths that are
34# hard-coded into the Config file that's generated upon configuring and compiling perl.
35# $^X works better in some cases to return the path to perl used to launch the script,
36# but if launched with plain "perl" (no full-path), that will be just what it returns.
37use Config;
38
39# removes files (but not directories)
40sub rm {
41 my (@files) = @_;
42
43 my @filefiles = ();
44
45 # make sure the files we want to delete exist
46 # and are regular files
47 foreach my $file (@files) {
48 if (!-e $file) {
49 print STDERR "util::rm $file does not exist\n";
50 } elsif ((!-f $file) && (!-l $file)) {
51 print STDERR "util::rm $file is not a regular (or symbolic) file\n";
52 } else {
53 push (@filefiles, $file);
54 }
55 }
56
57 # remove the files
58 my $numremoved = unlink @filefiles;
59
60 # check to make sure all of them were removed
61 if ($numremoved != scalar(@filefiles)) {
62 print STDERR "util::rm Not all files were removed\n";
63 }
64}
65
66# removes files (but not directories) - can rename this to the default
67# "rm" subroutine when debugging the deletion of individual files.
68sub rm_debug {
69 my (@files) = @_;
70 my @filefiles = ();
71
72 # make sure the files we want to delete exist
73 # and are regular files
74 foreach my $file (@files) {
75 if (!-e $file) {
76 print STDERR "util::rm $file does not exist\n";
77 } elsif ((!-f $file) && (!-l $file)) {
78 print STDERR "util::rm $file is not a regular (or symbolic) file\n";
79 } else { # debug message
80 unlink($file) or warn "Could not delete file $file: $!\n";
81 }
82 }
83}
84
85
86# recursive removal
87sub filtered_rm_r {
88 my ($files,$file_accept_re,$file_reject_re) = @_;
89
90# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(2);
91# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
92# print STDERR "** Calling method (2): $lcfilename:$cline $cpackage->$csubr\n";
93
94 my @files_array = (ref $files eq "ARRAY") ? @$files : ($files);
95
96 # recursively remove the files
97 foreach my $file (@files_array) {
98 $file =~ s/[\/\\]+$//; # remove trailing slashes
99
100 if (!-e $file) {
101 print STDERR "util::filtered_rm_r $file does not exist\n";
102
103 } elsif ((-d $file) && (!-l $file)) { # don't recurse down symbolic link
104 # get the contents of this directory
105 if (!opendir (INDIR, $file)) {
106 print STDERR "util::filtered_rm_r could not open directory $file\n";
107 } else {
108 my @filedir = grep (!/^\.\.?$/, readdir (INDIR));
109 closedir (INDIR);
110
111 # remove all the files in this directory
112 map {$_="$file/$_";} @filedir;
113 &filtered_rm_r (\@filedir,$file_accept_re,$file_reject_re);
114
115 if (!defined $file_accept_re && !defined $file_reject_re) {
116 # remove this directory
117 if (!rmdir $file) {
118 print STDERR "util::filtered_rm_r couldn't remove directory $file\n";
119 }
120 }
121 }
122 } else {
123 next if (defined $file_reject_re && ($file =~ m/$file_reject_re/));
124
125 if ((!defined $file_accept_re) || ($file =~ m/$file_accept_re/)) {
126 # remove this file
127 &rm ($file);
128 }
129 }
130 }
131}
132
133
134# recursive removal
135sub rm_r {
136 my (@files) = @_;
137
138 # use the more general (but reterospectively written function
139 # filtered_rm_r function()
140
141 filtered_rm_r(\@files,undef,undef); # no accept or reject expressions
142}
143
144
145
146
147# moves a file or a group of files
148sub mv {
149 my $dest = pop (@_);
150 my (@srcfiles) = @_;
151
152 # remove trailing slashes from source and destination files
153 $dest =~ s/[\\\/]+$//;
154 map {$_ =~ s/[\\\/]+$//;} @srcfiles;
155
156 # a few sanity checks
157 if (scalar (@srcfiles) == 0) {
158 print STDERR "util::mv no destination directory given\n";
159 return;
160 } elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) {
161 print STDERR "util::mv if multiple source files are given the ".
162 "destination must be a directory\n";
163 return;
164 }
165
166 # move the files
167 foreach my $file (@srcfiles) {
168 my $tempdest = $dest;
169 if (-d $tempdest) {
170 my ($filename) = $file =~ /([^\\\/]+)$/;
171 $tempdest .= "/$filename";
172 }
173 if (!-e $file) {
174 print STDERR "util::mv $file does not exist\n";
175 } else {
176 if(!rename ($file, $tempdest)) {
177 print STDERR "**** Failed to rename $file to $tempdest\n";
178 &File::Copy::copy($file, $tempdest);
179 &rm($file);
180 }
181 elsif(-e $file) { # rename (partially) succeeded) but srcfile still exists after rename
182 #print STDERR "*** srcfile $file still exists after rename to $tempdest\n";
183 if(!-e $tempdest) {
184 print STDERR "@@@@ ERROR: $tempdest does not exist\n";
185 }
186 # Sometimes the rename operation fails (as does File::Copy::move).
187 # This turns out to be because the files are hardlinked.
188 # Need to do a copy-delete in this case, however, the copy step is not necessary:
189 # the srcfile got renamed into tempdest, but srcfile itself still exists, delete it.
190 #&File::Copy::copy($file, $tempdest);
191
192 &rm($file);
193 }
194 }
195 }
196}
197
198# Move the contents of source directory into target directory
199# (as opposed to merely replacing target dir with the src dir)
200# This can overwrite any files with duplicate names in the target
201# but other files and folders in the target will continue to exist
202sub mv_dir_contents {
203 my ($src_dir, $dest_dir) = @_;
204
205 # Obtain listing of all files within src_dir
206 # Note that readdir lists relative paths, as well as . and ..
207 opendir(DIR, "$src_dir");
208 my @files= readdir(DIR);
209 close(DIR);
210
211 my @full_path_files = ();
212 foreach my $file (@files) {
213 # process all except . and ..
214 unless($file eq "." || $file eq "..") {
215
216 my $dest_subdir = &filename_cat($dest_dir, $file); # $file is still a relative path
217
218 # construct absolute paths
219 $file = &filename_cat($src_dir, $file); # $file is now an absolute path
220
221 # Recurse on directories which have an equivalent in target dest_dir
222 # If $file is a directory that already exists in target $dest_dir,
223 # then a simple move operation will fail (definitely on Windows).
224 if(-d $file && -d $dest_subdir) {
225 #print STDERR "**** $file is a directory also existing in target, its contents to be copied to $dest_subdir\n";
226 &mv_dir_contents($file, $dest_subdir);
227
228 # now all content is moved across, delete empty dir in source folder
229 if(&is_dir_empty($file)) {
230 if (!rmdir $file) {
231 print STDERR "ERROR. util::mv_dir_contents couldn't remove directory $file\n";
232 }
233 } else { # error
234 print STDERR "ERROR. util::mv_dir_contents: subfolder $file still non-empty after moving contents to $dest_subdir\n";
235 }
236 } else { # process files and any directories that don't already exist with a simple move
237 push(@full_path_files, $file);
238 }
239 }
240 }
241
242 if(!&dir_exists($dest_dir)) { # create target toplevel folder or subfolders if they don't exist
243 &mk_dir($dest_dir);
244 }
245
246 #print STDERR "@@@@@ Copying files |".join(",", @full_path_files)."| to: $dest_dir\n";
247
248 if(@full_path_files) { # if non-empty, there's something to copy across
249 &mv(@full_path_files, $dest_dir);
250 }
251}
252
253
254# copies a file or a group of files
255sub cp {
256 my $dest = pop (@_);
257 my (@srcfiles) = @_;
258
259 # remove trailing slashes from source and destination files
260 $dest =~ s/[\\\/]+$//;
261 map {$_ =~ s/[\\\/]+$//;} @srcfiles;
262
263 # a few sanity checks
264 if (scalar (@srcfiles) == 0) {
265 print STDERR "util::cp no destination directory given\n";
266 return;
267 } elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) {
268 print STDERR "util::cp if multiple source files are given the ".
269 "destination must be a directory\n";
270 return;
271 }
272
273 # copy the files
274 foreach my $file (@srcfiles) {
275 my $tempdest = $dest;
276 if (-d $tempdest) {
277 my ($filename) = $file =~ /([^\\\/]+)$/;
278 $tempdest .= "/$filename";
279 }
280 if (!-e $file) {
281 print STDERR "util::cp $file does not exist\n";
282 } elsif (!-f $file) {
283 print STDERR "util::cp $file is not a plain file\n";
284 } else {
285 &File::Copy::copy ($file, $tempdest);
286 }
287 }
288}
289
290
291
292# recursively copies a file or group of files
293# syntax: cp_r (sourcefiles, destination directory)
294# destination must be a directory - to copy one file to
295# another use cp instead
296sub cp_r {
297 my $dest = pop (@_);
298 my (@srcfiles) = @_;
299
300 # a few sanity checks
301 if (scalar (@srcfiles) == 0) {
302 print STDERR "util::cp_r no destination directory given\n";
303 return;
304 } elsif (-f $dest) {
305 print STDERR "util::cp_r destination must be a directory\n";
306 return;
307 }
308
309 # create destination directory if it doesn't exist already
310 if (! -d $dest) {
311 my $store_umask = umask(0002);
312 mkdir ($dest, 0777);
313 umask($store_umask);
314 }
315
316 # copy the files
317 foreach my $file (@srcfiles) {
318
319 if (!-e $file) {
320 print STDERR "util::cp_r $file does not exist\n";
321
322 } elsif (-d $file) {
323 # make the new directory
324 my ($filename) = $file =~ /([^\\\/]*)$/;
325 $dest = &util::filename_cat ($dest, $filename);
326 my $store_umask = umask(0002);
327 mkdir ($dest, 0777);
328 umask($store_umask);
329
330 # get the contents of this directory
331 if (!opendir (INDIR, $file)) {
332 print STDERR "util::cp_r could not open directory $file\n";
333 } else {
334 my @filedir = readdir (INDIR);
335 closedir (INDIR);
336 foreach my $f (@filedir) {
337 next if $f =~ /^\.\.?$/;
338 # copy all the files in this directory
339 my $ff = &util::filename_cat ($file, $f);
340 &cp_r ($ff, $dest);
341 }
342 }
343
344 } else {
345 &cp($file, $dest);
346 }
347 }
348}
349# recursively copies a file or group of files
350# syntax: cp_r (sourcefiles, destination directory)
351# destination must be a directory - to copy one file to
352# another use cp instead
353sub cp_r_nosvn {
354 my $dest = pop (@_);
355 my (@srcfiles) = @_;
356
357 # a few sanity checks
358 if (scalar (@srcfiles) == 0) {
359 print STDERR "util::cp_r no destination directory given\n";
360 return;
361 } elsif (-f $dest) {
362 print STDERR "util::cp_r destination must be a directory\n";
363 return;
364 }
365
366 # create destination directory if it doesn't exist already
367 if (! -d $dest) {
368 my $store_umask = umask(0002);
369 mkdir ($dest, 0777);
370 umask($store_umask);
371 }
372
373 # copy the files
374 foreach my $file (@srcfiles) {
375
376 if (!-e $file) {
377 print STDERR "util::cp_r $file does not exist\n";
378
379 } elsif (-d $file) {
380 # make the new directory
381 my ($filename) = $file =~ /([^\\\/]*)$/;
382 $dest = &util::filename_cat ($dest, $filename);
383 my $store_umask = umask(0002);
384 mkdir ($dest, 0777);
385 umask($store_umask);
386
387 # get the contents of this directory
388 if (!opendir (INDIR, $file)) {
389 print STDERR "util::cp_r could not open directory $file\n";
390 } else {
391 my @filedir = readdir (INDIR);
392 closedir (INDIR);
393 foreach my $f (@filedir) {
394 next if $f =~ /^\.\.?$/;
395 next if $f =~ /^\.svn$/;
396 # copy all the files in this directory
397 my $ff = &util::filename_cat ($file, $f);
398 &cp_r ($ff, $dest);
399 }
400 }
401
402 } else {
403 &cp($file, $dest);
404 }
405 }
406}
407
408# copies a directory and its contents, excluding subdirectories, into a new directory
409sub cp_r_toplevel {
410 my $dest = pop (@_);
411 my (@srcfiles) = @_;
412
413 # a few sanity checks
414 if (scalar (@srcfiles) == 0) {
415 print STDERR "util::cp_r no destination directory given\n";
416 return;
417 } elsif (-f $dest) {
418 print STDERR "util::cp_r destination must be a directory\n";
419 return;
420 }
421
422 # create destination directory if it doesn't exist already
423 if (! -d $dest) {
424 my $store_umask = umask(0002);
425 mkdir ($dest, 0777);
426 umask($store_umask);
427 }
428
429 # copy the files
430 foreach my $file (@srcfiles) {
431
432 if (!-e $file) {
433 print STDERR "util::cp_r $file does not exist\n";
434
435 } elsif (-d $file) {
436 # make the new directory
437 my ($filename) = $file =~ /([^\\\/]*)$/;
438 $dest = &util::filename_cat ($dest, $filename);
439 my $store_umask = umask(0002);
440 mkdir ($dest, 0777);
441 umask($store_umask);
442
443 # get the contents of this directory
444 if (!opendir (INDIR, $file)) {
445 print STDERR "util::cp_r could not open directory $file\n";
446 } else {
447 my @filedir = readdir (INDIR);
448 closedir (INDIR);
449 foreach my $f (@filedir) {
450 next if $f =~ /^\.\.?$/;
451
452 # copy all the files in this directory, but not directories
453 my $ff = &util::filename_cat ($file, $f);
454 if (-f $ff) {
455 &cp($ff, $dest);
456 #&cp_r ($ff, $dest);
457 }
458 }
459 }
460
461 } else {
462 &cp($file, $dest);
463 }
464 }
465}
466
467sub mk_dir {
468 my ($dir) = @_;
469
470 my $store_umask = umask(0002);
471 my $mkdir_ok = mkdir ($dir, 0777);
472 umask($store_umask);
473
474 if (!$mkdir_ok)
475 {
476 print STDERR "util::mk_dir could not create directory $dir\n";
477 return;
478 }
479}
480
481# in case anyone cares - I did some testing (using perls Benchmark module)
482# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
483# slightly faster (surprisingly) - Stefan.
484sub mk_all_dir {
485 my ($dir) = @_;
486
487 # use / for the directory separator, remove duplicate and
488 # trailing slashes
489 $dir=~s/[\\\/]+/\//g;
490 $dir=~s/[\\\/]+$//;
491
492 # make sure the cache directory exists
493 my $dirsofar = "";
494 my $first = 1;
495 foreach my $dirname (split ("/", $dir)) {
496 $dirsofar .= "/" unless $first;
497 $first = 0;
498
499 $dirsofar .= $dirname;
500
501 next if $dirname =~ /^(|[a-z]:)$/i;
502 if (!-e $dirsofar)
503 {
504 my $store_umask = umask(0002);
505 my $mkdir_ok = mkdir ($dirsofar, 0777);
506 umask($store_umask);
507 if (!$mkdir_ok)
508 {
509 print STDERR "util::mk_all_dir could not create directory $dirsofar\n";
510 return;
511 }
512 }
513 }
514}
515
516# make hard link to file if supported by OS, otherwise copy the file
517sub hard_link {
518 my ($src, $dest, $verbosity) = @_;
519
520 # remove trailing slashes from source and destination files
521 $src =~ s/[\\\/]+$//;
522 $dest =~ s/[\\\/]+$//;
523
524## print STDERR "**** src = ", unicode::debug_unicode_string($src),"\n";
525 # a few sanity checks
526 if (-e $dest) {
527 # destination file already exists
528 return;
529 }
530 elsif (!-e $src) {
531 print STDERR "util::hard_link source file \"$src\" does not exist\n";
532 return 1;
533 }
534 elsif (-d $src) {
535 print STDERR "util::hard_link source \"$src\" is a directory\n";
536 return 1;
537 }
538
539 my $dest_dir = &File::Basename::dirname($dest);
540 mk_all_dir($dest_dir) if (!-e $dest_dir);
541
542
543 if (!link($src, $dest)) {
544 if ((!defined $verbosity) || ($verbosity>2)) {
545 print STDERR "util::hard_link: unable to create hard link. ";
546 print STDERR " Copying file: $src -> $dest\n";
547 }
548 &File::Copy::copy ($src, $dest);
549 }
550 return 0;
551}
552
553# make soft link to file if supported by OS, otherwise copy file
554sub soft_link {
555 my ($src, $dest, $ensure_paths_absolute) = @_;
556
557 # remove trailing slashes from source and destination files
558 $src =~ s/[\\\/]+$//;
559 $dest =~ s/[\\\/]+$//;
560
561 # Ensure file paths are absolute IF requested to do so
562 # Soft_linking didn't work for relative paths
563 if(defined $ensure_paths_absolute && $ensure_paths_absolute) {
564 # We need to ensure that the src file is the absolute path
565 # See http://perldoc.perl.org/File/Spec.html
566 if(!File::Spec->file_name_is_absolute( $src )) { # it's relative
567 $src = File::Spec->rel2abs($src); # make absolute
568 }
569 # Might as well ensure that the destination file's absolute path is used
570 if(!File::Spec->file_name_is_absolute( $dest )) {
571 $dest = File::Spec->rel2abs($dest); # make absolute
572 }
573 }
574
575 # a few sanity checks
576 if (!-e $src) {
577 print STDERR "util::soft_link source file $src does not exist\n";
578 return 0;
579 }
580
581 my $dest_dir = &File::Basename::dirname($dest);
582 mk_all_dir($dest_dir) if (!-e $dest_dir);
583
584 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
585
586 # symlink not supported on windows
587 &File::Copy::copy ($src, $dest);
588
589 } elsif (!eval {symlink($src, $dest)}) {
590 print STDERR "util::soft_link: unable to create soft link.\n";
591 return 0;
592 }
593
594 return 1;
595}
596
597# Primarily for filenames generated by processing
598# content of HTML files (which are mapped to UTF-8 internally)
599#
600# To turn this into an octet string that really exists on the file
601# system:
602# 1. don't need to do anything special for Unix-based systems
603# (as underlying file system is byte-code)
604# 2. need to map to short DOS filenames for Windows
605
606sub utf8_to_real_filename
607{
608 my ($utf8_filename) = @_;
609
610 my $real_filename;
611
612 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
613 require Win32;
614
615 print STDERR "***** utf8 filename = $utf8_filename\n\n\n";
616
617 my $unicode_filename = decode("utf8",$utf8_filename);
618 $real_filename = Win32::GetShortPathName($unicode_filename);
619 }
620 else {
621 $real_filename = $utf8_filename;
622 }
623
624 return $real_filename;
625}
626
627
628sub fd_exists
629{
630 my $filename_full_path = shift @_;
631 my $test_op = shift @_ || "-e";
632
633 # By default tests for existance of file or directory (-e)
634 # Can be made more specific by providing second parameter (e.g. -f or -d)
635
636 my $exists = 0;
637
638 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
639 require Win32;
640 my $filename_short_path = Win32::GetShortPathName($filename_full_path);
641 if (!defined $filename_short_path) {
642 # Was probably still in UTF8 form (not what is needed on Windows)
643 my $unicode_filename_full_path = eval "decode(\"utf8\",\$filename_full_path)";
644 if (defined $unicode_filename_full_path) {
645 $filename_short_path = Win32::GetShortPathName($unicode_filename_full_path);
646 }
647 }
648 $filename_full_path = $filename_short_path;
649 }
650
651 if (defined $filename_full_path) {
652 $exists = eval "($test_op \$filename_full_path)";
653 }
654
655 return $exists;
656}
657
658sub file_exists
659{
660 my ($filename_full_path) = @_;
661
662 return fd_exists($filename_full_path,"-f");
663}
664
665sub dir_exists
666{
667 my ($filename_full_path) = @_;
668
669 return fd_exists($filename_full_path,"-d");
670}
671
672
673
674# updates a copy of a directory in some other part of the filesystem
675# verbosity settings are: 0=low, 1=normal, 2=high
676# both $fromdir and $todir should be absolute paths
677sub cachedir {
678 my ($fromdir, $todir, $verbosity) = @_;
679 $verbosity = 1 unless defined $verbosity;
680
681 # use / for the directory separator, remove duplicate and
682 # trailing slashes
683 $fromdir=~s/[\\\/]+/\//g;
684 $fromdir=~s/[\\\/]+$//;
685 $todir=~s/[\\\/]+/\//g;
686 $todir=~s/[\\\/]+$//;
687
688 &mk_all_dir ($todir);
689
690 # get the directories in ascending order
691 if (!opendir (FROMDIR, $fromdir)) {
692 print STDERR "util::cachedir could not read directory $fromdir\n";
693 return;
694 }
695 my @fromdir = grep (!/^\.\.?$/, sort(readdir (FROMDIR)));
696 closedir (FROMDIR);
697
698 if (!opendir (TODIR, $todir)) {
699 print STDERR "util::cacedir could not read directory $todir\n";
700 return;
701 }
702 my @todir = grep (!/^\.\.?$/, sort(readdir (TODIR)));
703 closedir (TODIR);
704
705 my $fromi = 0;
706 my $toi = 0;
707
708 while ($fromi < scalar(@fromdir) || $toi < scalar(@todir)) {
709# print "fromi: $fromi toi: $toi\n";
710
711 # see if we should delete a file/directory
712 # this should happen if the file/directory
713 # is not in the from list or if its a different
714 # size, or has an older timestamp
715 if ($toi < scalar(@todir)) {
716 if (($fromi >= scalar(@fromdir)) ||
717 ($todir[$toi] lt $fromdir[$fromi] ||
718 ($todir[$toi] eq $fromdir[$fromi] &&
719 &differentfiles("$fromdir/$fromdir[$fromi]","$todir/$todir[$toi]",
720 $verbosity)))) {
721
722 # the files are different
723 &rm_r("$todir/$todir[$toi]");
724 splice(@todir, $toi, 1); # $toi stays the same
725
726 } elsif ($todir[$toi] eq $fromdir[$fromi]) {
727 # the files are the same
728 # if it is a directory, check its contents
729 if (-d "$todir/$todir[$toi]") {
730 &cachedir ("$fromdir/$fromdir[$fromi]",
731 "$todir/$todir[$toi]", $verbosity);
732 }
733
734 $toi++;
735 $fromi++;
736 next;
737 }
738 }
739
740 # see if we should insert a file/directory
741 # we should insert a file/directory if there
742 # is no tofiles left or if the tofile does not exist
743 if ($fromi < scalar(@fromdir) && ($toi >= scalar(@todir) ||
744 $todir[$toi] gt $fromdir[$fromi])) {
745 &cp_r ("$fromdir/$fromdir[$fromi]", "$todir/$fromdir[$fromi]");
746 splice (@todir, $toi, 0, $fromdir[$fromi]);
747
748 $toi++;
749 $fromi++;
750 }
751 }
752}
753
754# this function returns -1 if either file is not found
755# assumes that $file1 and $file2 are absolute file names or
756# in the current directory
757# $file2 is allowed to be newer than $file1
758sub differentfiles {
759 my ($file1, $file2, $verbosity) = @_;
760 $verbosity = 1 unless defined $verbosity;
761
762 $file1 =~ s/\/+$//;
763 $file2 =~ s/\/+$//;
764
765 my ($file1name) = $file1 =~ /\/([^\/]*)$/;
766 my ($file2name) = $file2 =~ /\/([^\/]*)$/;
767
768 return -1 unless (-e $file1 && -e $file2);
769 if ($file1name ne $file2name) {
770 print STDERR "filenames are not the same\n" if ($verbosity >= 2);
771 return 1;
772 }
773
774 my @file1stat = stat ($file1);
775 my @file2stat = stat ($file2);
776
777 if (-d $file1) {
778 if (! -d $file2) {
779 print STDERR "one file is a directory\n" if ($verbosity >= 2);
780 return 1;
781 }
782 return 0;
783 }
784
785 # both must be regular files
786 unless (-f $file1 && -f $file2) {
787 print STDERR "one file is not a regular file\n" if ($verbosity >= 2);
788 return 1;
789 }
790
791 # the size of the files must be the same
792 if ($file1stat[7] != $file2stat[7]) {
793 print STDERR "different sized files\n" if ($verbosity >= 2);
794 return 1;
795 }
796
797 # the second file cannot be older than the first
798 if ($file1stat[9] > $file2stat[9]) {
799 print STDERR "file is older\n" if ($verbosity >= 2);
800 return 1;
801 }
802
803 return 0;
804}
805
806
807sub get_tmp_filename
808{
809 my $file_ext = shift(@_) || undef;
810
811 my $opt_dot_file_ext = "";
812 if (defined $file_ext) {
813 if ($file_ext !~ m/\./) {
814 # no dot, so needs one added in at start
815 $opt_dot_file_ext = ".$file_ext"
816 }
817 else {
818 # allow for "extensions" such as _metadata.txt to be handled
819 # gracefully
820 $opt_dot_file_ext = $file_ext;
821 }
822 }
823
824 my $tmpdir = filename_cat($ENV{'GSDLHOME'}, "tmp");
825 &mk_all_dir ($tmpdir) unless -e $tmpdir;
826
827 my $count = 1000;
828 my $rand = int(rand $count);
829 my $full_tmp_filename = &filename_cat($tmpdir, "F$rand$opt_dot_file_ext");
830
831 while (-e $full_tmp_filename) {
832 $rand = int(rand $count);
833 $full_tmp_filename = &filename_cat($tmpdir, "F$rand$opt_dot_file_ext");
834 $count++;
835 }
836
837 return $full_tmp_filename;
838}
839
840sub get_timestamped_tmp_folder
841{
842
843 my $tmp_dirname;
844 if(defined $ENV{'GSDLCOLLECTDIR'}) {
845 $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
846 } elsif(defined $ENV{'GSDLHOME'}) {
847 $tmp_dirname = $ENV{'GSDLHOME'};
848 } else {
849 return undef;
850 }
851
852 $tmp_dirname = &util::filename_cat($tmp_dirname, "tmp");
853 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
854
855 # add the timestamp into the path otherwise we can run into problems
856 # if documents have the same name
857 my $timestamp = time;
858 my $time_tmp_dirname = &util::filename_cat($tmp_dirname, $timestamp);
859 $tmp_dirname = $time_tmp_dirname;
860 my $i = 1;
861 while (-e $tmp_dirname) {
862 $tmp_dirname = "$time_tmp_dirname$i";
863 $i++;
864 }
865 &util::mk_dir($tmp_dirname);
866
867 return $tmp_dirname;
868}
869
870sub get_timestamped_tmp_filename_in_collection
871{
872
873 my ($input_filename, $output_ext) = @_;
874 # derive tmp filename from input filename
875 my ($tailname, $dirname, $suffix)
876 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
877
878 # softlink to collection tmp dir
879 my $tmp_dirname = &util::get_timestamped_tmp_folder();
880 $tmp_dirname = $dirname unless defined $tmp_dirname;
881
882 # following two steps copied from ConvertBinaryFile
883 # do we need them?? can't use them as is, as they use plugin methods.
884
885 #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
886
887 # URLEncode this since htmls with images where the html filename is utf8 don't seem
888 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
889 # files on the filesystem.
890 #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
891 if (defined $output_ext) {
892 $output_ext = ".$output_ext"; # add the dot
893 } else {
894 $output_ext = $suffix;
895 }
896 $output_ext= lc($output_ext);
897 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$output_ext");
898
899 return $tmp_filename;
900}
901
902sub get_toplevel_tmp_dir
903{
904 return filename_cat($ENV{'GSDLHOME'}, "tmp");
905}
906
907
908sub filename_to_regex {
909 my $filename = shift (@_);
910
911 # need to make single backslashes double so that regex works
912 $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);
913
914 # note that the first part of a substitution is a regex, so RE chars need to be escaped,
915 # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
916 $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
917 $filename =~ s@\(@\\(@g; # escape brackets
918 $filename =~ s@\)@\\)@g; # escape brackets
919 $filename =~ s@\[@\\[@g; # escape brackets
920 $filename =~ s@\]@\\]@g; # escape brackets
921
922 return $filename;
923}
924
925sub unregex_filename {
926 my $filename = shift (@_);
927
928 # need to put doubled backslashes for regex back to single
929 $filename =~ s/\\\./\./g; # remove RE syntax for .
930 $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
931 $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
932 $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
933 $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
934
935 # \\ goes to \
936 # This is the last step in reverse mirroring the order of steps in filename_to_regex()
937 $filename =~ s/\\\\/\\/g; # remove RE syntax for \
938 return $filename;
939}
940
941sub filename_cat {
942 my $first_file = shift(@_);
943 my (@filenames) = @_;
944
945# Useful for debugging
946# -- might make sense to call caller(0) rather than (1)??
947# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
948# print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
949
950 # If first_file is not null or empty, then add it back into the list
951 if (defined $first_file && $first_file =~ /\S/) {
952 unshift(@filenames, $first_file);
953 }
954
955 my $filename = join("/", @filenames);
956
957 # remove duplicate slashes and remove the last slash
958 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
959 $filename =~ s/[\\\/]+/\\/g;
960 } else {
961 $filename =~ s/[\/]+/\//g;
962 # DB: want a filename abc\de.html to remain like this
963 }
964 $filename =~ s/[\\\/]$//;
965
966 return $filename;
967}
968
969
970sub pathname_cat {
971 my $first_path = shift(@_);
972 my (@pathnames) = @_;
973
974 # If first_path is not null or empty, then add it back into the list
975 if (defined $first_path && $first_path =~ /\S/) {
976 unshift(@pathnames, $first_path);
977 }
978
979 my $join_char;
980 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
981 $join_char = ";";
982 } else {
983 $join_char = ":";
984 }
985
986 my $pathname = join($join_char, @pathnames);
987
988 # remove duplicate slashes
989 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
990 $pathname =~ s/[\\\/]+/\\/g;
991 } else {
992 $pathname =~ s/[\/]+/\//g;
993 # DB: want a pathname abc\de.html to remain like this
994 }
995
996 return $pathname;
997}
998
999
1000sub tidy_up_oid {
1001 my ($OID) = @_;
1002 if ($OID =~ /\./) {
1003 print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
1004 $OID =~ s/\.//g; #remove any periods
1005 }
1006 if ($OID =~ /^\s.*\s$/) {
1007 print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
1008 # remove starting and trailing whitespace
1009 $OID =~ s/^\s+//;
1010 $OID =~ s/\s+$//;
1011 }
1012 if ($OID =~ /^[\d]*$/) {
1013 print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
1014 $OID = "D" . $OID;
1015 }
1016
1017 return $OID;
1018}
1019
1020sub envvar_prepend {
1021 my ($var,$val) = @_;
1022
1023 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
1024 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
1025
1026 # do not prepend any value/path that's already in the environment variable
1027
1028 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
1029 if (!defined($ENV{$var})) {
1030 $ENV{$var} = "$val";
1031 }
1032 elsif($ENV{$var} !~ m/$escaped_val/) {
1033 $ENV{$var} = "$val".$pathsep.$ENV{$var};
1034 }
1035}
1036
1037sub envvar_append {
1038 my ($var,$val) = @_;
1039
1040 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
1041 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
1042
1043 # do not append any value/path that's already in the environment variable
1044
1045 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
1046 if (!defined($ENV{$var})) {
1047 $ENV{$var} = "$val";
1048 }
1049 elsif($ENV{$var} !~ m/$escaped_val/) {
1050 $ENV{$var} = $ENV{$var}.$pathsep."$val";
1051 }
1052}
1053
1054
1055# splits a filename into a prefix and a tail extension using the tail_re, or
1056# if that fails, splits on the file_extension . (dot)
1057sub get_prefix_and_tail_by_regex {
1058
1059 my ($filename,$tail_re) = @_;
1060
1061 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
1062 if ((!defined $file_prefix) || (!defined $file_ext)) {
1063 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
1064 }
1065
1066 return ($file_prefix,$file_ext);
1067}
1068
1069# get full path and file only path from a base_dir (which may be empty) and
1070# file (which may contain directories)
1071sub get_full_filenames {
1072 my ($base_dir, $file) = @_;
1073
1074 my $filename_full_path = $file;
1075 # add on directory if present
1076 $filename_full_path = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\S/;
1077
1078 my $filename_no_path = $file;
1079
1080 # remove directory if present
1081 $filename_no_path =~ s/^.*[\/\\]//;
1082 return ($filename_full_path, $filename_no_path);
1083}
1084
1085# returns the path of a file without the filename -- ie. the directory the file is in
1086sub filename_head {
1087 my $filename = shift(@_);
1088
1089 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1090 $filename =~ s/[^\\\\]*$//;
1091 }
1092 else {
1093 $filename =~ s/[^\\\/]*$//;
1094 }
1095
1096 return $filename;
1097}
1098
1099
1100
1101# returns 1 if filename1 and filename2 point to the same
1102# file or directory
1103sub filenames_equal {
1104 my ($filename1, $filename2) = @_;
1105
1106 # use filename_cat to clean up trailing slashes and
1107 # multiple slashes
1108 $filename1 = filename_cat ($filename1);
1109 $filename2 = filename_cat ($filename2);
1110
1111 # filenames not case sensitive on windows
1112 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1113 $filename1 =~ tr/[A-Z]/[a-z]/;
1114 $filename2 =~ tr/[A-Z]/[a-z]/;
1115 }
1116 return 1 if $filename1 eq $filename2;
1117 return 0;
1118}
1119
1120# If filename is relative to within_dir, returns the relative path of filename to that directory
1121# with slashes in the filename returned as they were in the original (absolute) filename.
1122sub filename_within_directory
1123{
1124 my ($filename,$within_dir) = @_;
1125
1126 if ($within_dir !~ m/[\/\\]$/) {
1127 my $dirsep = &util::get_dirsep();
1128 $within_dir .= $dirsep;
1129 }
1130
1131 $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets
1132 if ($filename =~ m/^$within_dir(.*)$/) {
1133 $filename = $1;
1134 }
1135
1136 return $filename;
1137}
1138
1139# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
1140# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
1141# The subpath returned will also be a URL type filename.
1142sub filename_within_directory_url_format
1143{
1144 my ($filename,$within_dir) = @_;
1145
1146 # convert parameters only to / slashes if Windows
1147
1148 my $filename_urlformat = &filepath_to_url_format($filename);
1149 my $within_dir_urlformat = &filepath_to_url_format($within_dir);
1150
1151 #if ($within_dir_urlformat !~ m/\/$/) {
1152 # make sure directory ends with a slash
1153 #$within_dir_urlformat .= "/";
1154 #}
1155
1156 my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
1157
1158 #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
1159
1160 # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
1161 if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
1162 $filename_urlformat = $1;
1163 }
1164
1165 return $filename_urlformat;
1166}
1167
1168# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
1169# since on Linux it doesn't represent a file separator but an escape char).
1170sub filepath_to_url_format
1171{
1172 my ($filepath) = @_;
1173 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1174 # Only need to worry about Windows, as Unix style directories already in url-format
1175 # Convert Windows style \ => /
1176 $filepath =~ s@\\@/@g;
1177 }
1178 return $filepath;
1179}
1180
1181# regex filepaths on windows may include \\ as path separator. Convert \\ to /
1182sub filepath_regex_to_url_format
1183{
1184 my ($filepath) = @_;
1185 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1186 # Only need to worry about Windows, as Unix style directories already in url-format
1187 # Convert Windows style \\ => /
1188 $filepath =~ s@\\\\@/@g;
1189 }
1190 return $filepath;
1191
1192}
1193
1194# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
1195# and ignores trailing /
1196# returns (file, dirs) dirs will be empty if no subdirs
1197sub url_fileparse
1198{
1199 my ($filepath) = @_;
1200 # remove trailing /
1201 $filepath =~ s@/$@@;
1202 if ($filepath !~ m@/@) {
1203 return ($filepath, "");
1204 }
1205 my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
1206 return ($file, $dirs);
1207
1208}
1209
1210
1211sub filename_within_collection
1212{
1213 my ($filename) = @_;
1214
1215 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
1216
1217 if (defined $collect_dir) {
1218
1219 # if from within GSDLCOLLECTDIR, then remove directory prefix
1220 # so source_filename is realative to it. This is done to aid
1221 # portability, i.e. the collection can be moved to somewhere
1222 # else on the file system and the archives directory will still
1223 # work. This is needed, for example in the applet version of
1224 # GLI where GSDLHOME/collect on the server will be different to
1225 # the collect directory of the remove user. Of course,
1226 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
1227 # it back into a full pathname.
1228
1229 $filename = filename_within_directory($filename,$collect_dir);
1230 }
1231
1232 return $filename;
1233}
1234
1235sub prettyprint_file
1236{
1237 my ($base_dir,$file,$gli) = @_;
1238
1239 my $filename_full_path = &util::filename_cat($base_dir,$file);
1240
1241 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1242 require Win32;
1243
1244 # For some reason base_dir in the form c:/a/b/c
1245 # This leads to confusion later on, so turn it back into
1246 # the more usual Windows form
1247 $base_dir =~ s/\//\\/g;
1248 my $long_base_dir = Win32::GetLongPathName($base_dir);
1249 my $long_full_path = Win32::GetLongPathName($filename_full_path);
1250
1251 $file = filename_within_directory($long_full_path,$long_base_dir);
1252 $file = encode("utf8",$file) if ($gli);
1253 }
1254
1255 return $file;
1256}
1257
1258
1259sub upgrade_if_dos_filename
1260{
1261 my ($filename_full_path,$and_encode) = @_;
1262
1263 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1264 # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
1265 # to its long (Windows) version
1266 my $long_filename = Win32::GetLongPathName($filename_full_path);
1267 if (defined $long_filename) {
1268 $filename_full_path = $long_filename;
1269 }
1270 # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
1271 $filename_full_path =~ s/^(.):/\u$1:/;
1272 if ((defined $and_encode) && ($and_encode)) {
1273 $filename_full_path = encode("utf8",$filename_full_path);
1274 }
1275 }
1276
1277 return $filename_full_path;
1278}
1279
1280
1281sub downgrade_if_dos_filename
1282{
1283 my ($filename_full_path) = @_;
1284
1285 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1286 require Win32;
1287
1288 # Ensure the given long Windows filename is in a form that can
1289 # be opened by Perl => convert it to a short DOS-like filename
1290
1291 my $short_filename = Win32::GetShortPathName($filename_full_path);
1292 if (defined $short_filename) {
1293 $filename_full_path = $short_filename;
1294 }
1295 # Make sure initial drive letter is lower-case (to fit in
1296 # with rest of Greenstone)
1297 $filename_full_path =~ s/^(.):/\u$1:/;
1298 }
1299
1300 return $filename_full_path;
1301}
1302
1303sub block_filename
1304{
1305 my ($block_hash,$filename) = @_;
1306
1307 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
1308
1309 # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
1310 my $lower_filename = lc($filename);
1311 $block_hash->{'file_blocks'}->{$lower_filename} = 1;
1312# my $lower_drive = $filename;
1313# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
1314
1315# my $upper_drive = $filename;
1316# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
1317#
1318# $block_hash->{'file_blocks'}->{$lower_drive} = 1;
1319# $block_hash->{'file_blocks'}->{$upper_drive} = 1;
1320 }
1321 else {
1322 $block_hash->{'file_blocks'}->{$filename} = 1;
1323 }
1324}
1325
1326
1327sub filename_is_absolute
1328{
1329 my ($filename) = @_;
1330
1331 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1332 return ($filename =~ m/^(\w:)?\\/);
1333 }
1334 else {
1335 return ($filename =~ m/^\//);
1336 }
1337}
1338
1339
1340## @method make_absolute()
1341#
1342# Ensure the given file path is absolute in respect to the given base path.
1343#
1344# @param $base_dir A string denoting the base path the given dir must be
1345# absolute to.
1346# @param $dir The directory to be made absolute as a string. Note that the
1347# dir may already be absolute, in which case it will remain
1348# unchanged.
1349# @return The now absolute form of the directory as a string.
1350#
1351# @author John Thompson, DL Consulting Ltd.
1352# @copy 2006 DL Consulting Ltd.
1353#
1354#used in buildcol.pl, doesn't work for all cases --kjdon
1355sub make_absolute {
1356
1357 my ($base_dir, $dir) = @_;
1358### print STDERR "dir = $dir\n";
1359 $dir =~ s/[\\\/]+/\//g;
1360 $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
1361 $dir =~ s|^/tmp_mnt||;
1362 1 while($dir =~ s|/[^/]*/\.\./|/|g);
1363 $dir =~ s|/[.][.]?/|/|g;
1364 $dir =~ tr|/|/|s;
1365### print STDERR "dir = $dir\n";
1366
1367 return $dir;
1368}
1369## make_absolute() ##
1370
1371sub get_dirsep {
1372
1373 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1374 return "\\";
1375 } else {
1376 return "\/";
1377 }
1378}
1379
1380sub get_os_dirsep {
1381
1382 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1383 return "\\\\";
1384 } else {
1385 return "\\\/";
1386 }
1387}
1388
1389sub get_re_dirsep {
1390
1391 return "\\\\|\\\/";
1392}
1393
1394
1395sub get_dirsep_tail {
1396 my ($filename) = @_;
1397
1398 # returns last part of directory or filename
1399 # On unix e.g. a/b.d => b.d
1400 # a/b/c => c
1401
1402 my $dirsep = get_re_dirsep();
1403 my @dirs = split (/$dirsep/, $filename);
1404 my $tail = pop @dirs;
1405
1406 # - caused problems under windows
1407 #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
1408
1409 return $tail;
1410}
1411
1412
1413# if this is running on windows we want binaries to end in
1414# .exe, otherwise they don't have to end in any extension
1415sub get_os_exe {
1416 return ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
1417 return "";
1418}
1419
1420
1421# test to see whether this is a big or little endian machine
1422sub is_little_endian
1423{
1424 # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
1425 # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
1426 # Otherwise, it's little endian
1427
1428 #return 0 if $^O =~ /^darwin$/i;
1429 #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
1430
1431 # Going back to stating exactly whether the machine is little endian
1432 # or big endian, without any special case for Macs. Since for rata it comes
1433 # back with little endian and for shuttle with bigendian.
1434 return (ord(substr(pack("s",1), 0, 1)) == 1);
1435}
1436
1437
1438# will return the collection name if successful, "" otherwise
1439sub use_collection {
1440 my ($collection, $collectdir) = @_;
1441
1442 if (!defined $collectdir || $collectdir eq "") {
1443 $collectdir = &filename_cat ($ENV{'GSDLHOME'}, "collect");
1444 }
1445
1446 # get and check the collection
1447 if (!defined($collection) || $collection eq "") {
1448 if (defined $ENV{'GSDLCOLLECTION'}) {
1449 $collection = $ENV{'GSDLCOLLECTION'};
1450 } else {
1451 print STDOUT "No collection specified\n";
1452 return "";
1453 }
1454 }
1455
1456 if ($collection eq "modelcol") {
1457 print STDOUT "You can't use modelcol.\n";
1458 return "";
1459 }
1460
1461 # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
1462 # are defined
1463 $ENV{'GSDLCOLLECTION'} = $collection;
1464 $ENV{'GSDLCOLLECTDIR'} = &filename_cat ($collectdir, $collection);
1465
1466 # make sure this collection exists
1467 if (!-e $ENV{'GSDLCOLLECTDIR'}) {
1468 print STDOUT "Invalid collection ($collection).\n";
1469 return "";
1470 }
1471
1472 # everything is ready to go
1473 return $collection;
1474}
1475
1476sub get_current_collection_name {
1477 return $ENV{'GSDLCOLLECTION'};
1478}
1479
1480
1481# will return the collection name if successful, "" otherwise.
1482# Like use_collection (above) but for greenstone 3 (taking account of site level)
1483
1484sub use_site_collection {
1485 my ($site, $collection, $collectdir) = @_;
1486
1487 if (!defined $collectdir || $collectdir eq "") {
1488 die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
1489 $collectdir = &filename_cat ($ENV{'GSDL3HOME'}, "sites", $site, "collect");
1490 }
1491
1492 # collectdir explicitly set by this point (using $site variable if required).
1493 # Can call "old" gsdl2 use_collection now.
1494
1495 return use_collection($collection,$collectdir);
1496}
1497
1498
1499
1500sub locate_config_file
1501{
1502 my ($file) = @_;
1503
1504 my $locations = locate_config_files($file);
1505
1506 return shift @$locations; # returns undef if 'locations' is empty
1507}
1508
1509
1510sub locate_config_files
1511{
1512 my ($file) = @_;
1513
1514 my @locations = ();
1515
1516 if (-e $file) {
1517 # Clearly specified (most likely full filename)
1518 # No need to hunt in 'etc' directories, return value unchanged
1519 push(@locations,$file);
1520 }
1521 else {
1522 # Check for collection specific one before looking in global GSDL 'etc'
1523 if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
1524 my $test_collect_etc_filename
1525 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $file);
1526
1527 if (-e $test_collect_etc_filename) {
1528 push(@locations,$test_collect_etc_filename);
1529 }
1530 }
1531 my $test_main_etc_filename
1532 = &util::filename_cat($ENV{'GSDLHOME'},"etc", $file);
1533 if (-e $test_main_etc_filename) {
1534 push(@locations,$test_main_etc_filename);
1535 }
1536 }
1537
1538 return \@locations;
1539}
1540
1541
1542sub hyperlink_text
1543{
1544 my ($text) = @_;
1545
1546 $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
1547 $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
1548
1549 return $text;
1550}
1551
1552
1553# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
1554# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
1555sub is_dir_empty
1556{
1557 my ($path) = @_;
1558 opendir DIR, $path;
1559 while(my $entry = readdir DIR) {
1560 next if($entry =~ /^\.\.?$/);
1561 closedir DIR;
1562 return 0;
1563 }
1564 closedir DIR;
1565 return 1;
1566}
1567
1568# Returns the given filename converted using either URL encoding or base64
1569# encoding, as specified by $rename_method. If the given filename has no suffix
1570# (if it is just the tailname), then $no_suffix should be some defined value.
1571# rename_method can be url, none, base64
1572sub rename_file {
1573 my ($filename, $rename_method, $no_suffix) = @_;
1574
1575 if(!$filename) { # undefined or empty string
1576 return $filename;
1577 }
1578
1579 if (!$rename_method) {
1580 print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
1581 # Debugging information
1582 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1583 # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
1584 $rename_method = "url";
1585 } elsif($rename_method eq "none") {
1586 return $filename; # would have already been renamed
1587 }
1588
1589 # No longer replace spaces with underscores, since underscores mess with incremental rebuild
1590 ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
1591 ###$filename =~ s/ /_/g;
1592
1593 my ($tailname,$dirname,$suffix);
1594 if($no_suffix) { # given a tailname, no suffix
1595 ($tailname,$dirname) = File::Basename::fileparse($filename);
1596 }
1597 else {
1598 ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
1599 }
1600 if (!$suffix) {
1601 $suffix = "";
1602 }
1603 # This breaks GLI matching extracted metadata to files in Enrich panel, as
1604 # original is eg .JPG while gsdlsourcefilename ends up .jpg
1605 # Not sure why it was done in first place...
1606 #else {
1607 #$suffix = lc($suffix);
1608 #}
1609
1610 if ($rename_method eq "url") {
1611 $tailname = &unicode::url_encode($tailname);
1612 }
1613 elsif ($rename_method eq "base64") {
1614 $tailname = &unicode::base64_encode($tailname);
1615 $tailname =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
1616 }
1617
1618 $filename = "$tailname$suffix";
1619 $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
1620
1621 return $filename;
1622}
1623
1624
1625# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
1626sub rename_ldb_or_bdb_file {
1627 my ($filename_no_ext) = @_;
1628
1629 my $new_filename = "$filename_no_ext.gdb";
1630 return if (-f $new_filename); # if the file has the right extension, don't need to do anything
1631 # try ldb
1632 my $old_filename = "$filename_no_ext.ldb";
1633
1634 if (-f $old_filename) {
1635 print STDERR "Renaming $old_filename to $new_filename\n";
1636 rename ($old_filename, $new_filename)
1637 || print STDERR "Rename failed: $!\n";
1638 return;
1639 }
1640 # try bdb
1641 $old_filename = "$filename_no_ext.bdb";
1642 if (-f $old_filename) {
1643 print STDERR "Renaming $old_filename to $new_filename\n";
1644 rename ($old_filename, $new_filename)
1645 || print STDERR "Rename failed: $!\n";
1646 return;
1647 }
1648}
1649
1650sub os_dir() {
1651
1652 my $gsdlarch = "";
1653 if(defined $ENV{'GSDLARCH'}) {
1654 $gsdlarch = $ENV{'GSDLARCH'};
1655 }
1656 return $ENV{'GSDLOS'}.$gsdlarch;
1657}
1658
1659# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1660# By default, /greenstone3 for GS3 or /greenstone for GS2.
1661sub get_greenstone_url_prefix() {
1662 # if already set on a previous occasion, just return that
1663 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1664 return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1665
1666 my ($configfile, $urlprefix, $defaultUrlprefix);
1667 my @propertynames = ();
1668
1669 if($ENV{'GSDL3SRCHOME'}) {
1670 $defaultUrlprefix = "/greenstone3";
1671 $configfile = &util::filename_cat($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1672 push(@propertynames, qw/path\s*\=/);
1673 } else {
1674 $defaultUrlprefix = "/greenstone";
1675 $configfile = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1676 push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1677 }
1678
1679 $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1680
1681 if(!$urlprefix) { # no values found for URL prefix, use default values
1682 $urlprefix = $defaultUrlprefix;
1683 } else {
1684 #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1685 $urlprefix =~ s/^\///; # remove the starting slash
1686 my @dirs = split(/(\\|\/)/, $urlprefix);
1687 $urlprefix = shift(@dirs);
1688
1689 if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1690 $urlprefix = "/$urlprefix";
1691 }
1692 }
1693
1694 # set for the future
1695 $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1696# print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1697 return $urlprefix;
1698}
1699
1700
1701# Given a config file (xml or java properties file) and a list/array of regular expressions
1702# that represent property names to match on, this function will return the value for the 1st
1703# matching property name. If the return value is undefined, no matching property was found.
1704sub extract_propvalue_from_file() {
1705 my ($configfile, $propertynames) = @_;
1706
1707 my $value;
1708 unless(open(FIN, "<$configfile")) {
1709 print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1710 return $value; # not initialised
1711 }
1712
1713 # Read the entire file at once, as one single line, then close it
1714 my $filecontents;
1715 {
1716 local $/ = undef;
1717 $filecontents = <FIN>;
1718 }
1719 close(FIN);
1720
1721 foreach my $regex (@$propertynames) {
1722 ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1723 if($value) {
1724 $value =~ s/^\"//; # remove any startquotes
1725 $value =~ s/\".*$//; # remove the 1st endquotes (if any) followed by any xml
1726 last; # found value for a matching property, break from loop
1727 }
1728 }
1729
1730 return $value;
1731}
1732
1733# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1734# given that perllib is in @INC in order to invoke this subroutine.
1735# Call as follows -- after setting up INC to include perllib and
1736# after setting up GSDLHOME and GSDLOS:
1737#
1738# require util;
1739# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1740#
1741sub setup_greenstone_env() {
1742 my ($GSDLHOME, $GSDLOS) = @_;
1743
1744 #my %env_map = ();
1745 # Get the localised ENV settings of running a localised source setup.bash
1746 # and put it into the ENV here. Need to clear GSDLHOME before running setup
1747 #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1748 my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";
1749 if($GSDLOS =~ m/windows/i) {
1750 #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1751 $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1752 }
1753 if (!open(PIN, "$perl_command |")) {
1754 print STDERR ("Unable to execute command: $perl_command. $!\n");
1755 }
1756
1757 while (defined (my $perl_output_line = <PIN>)) {
1758 my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1759 #$env_map{$key}=$value;
1760 $ENV{$key}=$value;
1761 }
1762 close (PIN);
1763
1764 # If any keys in $ENV don't occur in Greenstone's localised env
1765 # (stored in $env_map), delete those entries from $ENV
1766 #foreach $key (keys %ENV) {
1767 # if(!defined $env_map{$key}) {
1768 # print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{'$key'}\n";
1769 # delete $ENV{$key}; # del $ENV(key, value) pair
1770 # }
1771 #}
1772 #undef %env_map;
1773}
1774
1775sub get_perl_exec() {
1776 my $perl_exec = $^X; # may return just "perl"
1777
1778 if($ENV{'PERLPATH'}) {
1779 # OR: # $perl_exec = &util::filename_cat($ENV{'PERLPATH'},"perl");
1780 if($ENV{'GSDLOS'} =~ m/windows/) {
1781 $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1782 } else {
1783 $perl_exec = "$ENV{'PERLPATH'}/perl";
1784 }
1785 } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1786 # containing the full path to the current perl executable we're using
1787 $perl_exec = $Config{perlpath}; # configured path for perl
1788 if (!-e $perl_exec) { # may not point to location on this machine
1789 $perl_exec = $^X; # may return just "perl"
1790 if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1791 print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";
1792 }
1793 }
1794 }
1795
1796 return $perl_exec;
1797}
1798
1799# returns the path to the java command in the JRE included with GS (if any),
1800# quoted to safeguard any spaces in this path, otherwise a simple java
1801# command is returned which assumes and will try for a system java.
1802sub get_java_command {
1803 my $java = "java";
1804 if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1805 # after running setup.bat or from GLI which also runs setup.bat
1806 my $java_bin = &util::filename_cat($ENV{'GSDLHOME'},"packages","jre","bin");
1807 if(-d $java_bin) {
1808 $java = &util::filename_cat($java_bin,"java");
1809 $java = "\"".$java."\""; # quoted to preserve spaces in path
1810 }
1811 }
1812 return $java;
1813}
1814
1815
1816# Given the qualified collection name (colgroup/collection),
1817# returns the collection and colgroup parts
1818sub get_collection_parts {
1819 # http://perldoc.perl.org/File/Basename.html
1820 # my($filename, $directories, $suffix) = fileparse($path);
1821 # "$directories contains everything up to and including the last directory separator in the $path
1822 # including the volume (if applicable). The remainder of the $path is the $filename."
1823 #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);
1824
1825 my $qualified_collection = shift(@_);
1826
1827 # Since activate.pl can be launched from the command-line, including by a user,
1828 # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1829 # Also allow for the accidental inclusion of multiple slashes
1830 my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1831
1832 if(!defined $collection) {
1833 $collection = $colgroup;
1834 $colgroup = "";
1835 }
1836 return ($collection, $colgroup);
1837}
1838
1839# work out the "collectdir/collection" location
1840sub resolve_collection_dir {
1841 my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1842
1843 my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);
1844
1845 if (defined $collect_dir) {
1846 return &util::filename_cat($collect_dir,$colgroup, $collection);
1847 }
1848 elsif (defined($ENV{'GSDLCOLLECTDIR'})) {
1849 return $ENV{'GSDLCOLLECTDIR'};
1850 }
1851 else {
1852 if (defined $site) {
1853 return &util::filename_cat($ENV{'GSDL3HOME'},"sites",$site,"collect",$colgroup, $collection);
1854 }
1855 else {
1856 return &util::filename_cat($ENV{'GSDLHOME'},"collect",$colgroup, $collection);
1857 }
1858 }
1859}
1860
1861# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1862# a directory containing sequentially numbered images.
1863sub create_itemfile
1864{
1865 my ($output_dir, $convert_basename, $convert_to) = @_;
1866 opendir(DIR, $output_dir) || die "can't opendir $output_dir: $!";
1867
1868 my $page_num = "";
1869 my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1870
1871 # Sort files in the directory by page_num
1872 sub page_number {
1873 my ($dir) = @_;
1874 my ($pagenum) =($dir =~ m/^.*[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
1875
1876 $pagenum = 1 unless defined $pagenum;
1877 return $pagenum;
1878 }
1879
1880 # sort the files in the directory in the order of page_num rather than lexically.
1881 @dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1882
1883 # work out if the numbering of the now sorted image files starts at 0 or not
1884 # by checking the number of the first _image_ file (skipping item files)
1885 my $starts_at_0 = 0;
1886 my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1887 if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1888 $starts_at_0 = 1;
1889 }
1890
1891 my $item_file = &util::filename_cat($output_dir, $convert_basename.".item");
1892 open(FILE,">$item_file");
1893 print FILE "<PagedDocument>\n";
1894
1895 foreach my $file (@dir_files){
1896 if ($file !~ /\.item/i){
1897 $page_num = page_number($file);
1898 $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1899 print FILE " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1900 }
1901 }
1902
1903 print FILE "</PagedDocument>\n";
1904 close FILE;
1905 closedir DIR;
1906 return $item_file;
1907}
1908
19091;
Note: See TracBrowser for help on using the repository browser.