source: main/trunk/greenstone2/perllib/util.pm@ 26017

Last change on this file since 26017 was 26017, checked in by ak19, 12 years ago

Thanks to Kathy: File handle to item file generated by pagedimgplug needs to be closed, now that the code's moved to util. Previously a pl script called it from main and when main exited the file handle would have got closed automatically. Now that the code to create the item file is also invoked by the pdfbox ext (to extract images) and since that doesn't exit, the file handle needs to be closed. With this commit, PDFBox is able to extract images from PDFs when -pagedimg is turned on. But we found it doesn't convert PDFs that contain text into pages of images.

  • Property svn:keywords set to Author Date Id Revision
File size: 54.3 KB
RevLine 
[537]1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[4]25
26package util;
27
[23362]28use strict;
29
30use Encode;
[4]31use File::Copy;
[619]32use File::Basename;
[24362]33# Config for getting the perlpath in the recommended way, though it uses paths that are
34# hard-coded into the Config file that's generated upon configuring and compiling perl.
35# $^X works better in some cases to return the path to perl used to launch the script,
36# but if launched with plain "perl" (no full-path), that will be just what it returns.
37use Config;
[4]38
39# removes files (but not directories)
40sub rm {
41 my (@files) = @_;
[18469]42
[4]43 my @filefiles = ();
44
45 # make sure the files we want to delete exist
46 # and are regular files
[10046]47 foreach my $file (@files) {
[4]48 if (!-e $file) {
49 print STDERR "util::rm $file does not exist\n";
[721]50 } elsif ((!-f $file) && (!-l $file)) {
51 print STDERR "util::rm $file is not a regular (or symbolic) file\n";
[4]52 } else {
53 push (@filefiles, $file);
54 }
55 }
56
57 # remove the files
58 my $numremoved = unlink @filefiles;
59
60 # check to make sure all of them were removed
61 if ($numremoved != scalar(@filefiles)) {
62 print STDERR "util::rm Not all files were removed\n";
63 }
64}
65
[23249]66# removes files (but not directories) - can rename this to the default
67# "rm" subroutine when debugging the deletion of individual files.
68sub rm_debug {
69 my (@files) = @_;
70 my @filefiles = ();
[4]71
[23249]72 # make sure the files we want to delete exist
73 # and are regular files
74 foreach my $file (@files) {
75 if (!-e $file) {
76 print STDERR "util::rm $file does not exist\n";
77 } elsif ((!-f $file) && (!-l $file)) {
78 print STDERR "util::rm $file is not a regular (or symbolic) file\n";
79 } else { # debug message
80 unlink($file) or warn "Could not delete file $file: $!\n";
81 }
82 }
83}
[10211]84
[23249]85
[4]86# recursive removal
[10211]87sub filtered_rm_r {
88 my ($files,$file_accept_re,$file_reject_re) = @_;
[4]89
[24291]90# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(2);
91# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
92# print STDERR "** Calling method (2): $lcfilename:$cline $cpackage->$csubr\n";
93
[10211]94 my @files_array = (ref $files eq "ARRAY") ? @$files : ($files);
95
[4]96 # recursively remove the files
[10211]97 foreach my $file (@files_array) {
[4]98 $file =~ s/[\/\\]+$//; # remove trailing slashes
99
100 if (!-e $file) {
[10211]101 print STDERR "util::filtered_rm_r $file does not exist\n";
[4]102
[721]103 } elsif ((-d $file) && (!-l $file)) { # don't recurse down symbolic link
[4]104 # get the contents of this directory
105 if (!opendir (INDIR, $file)) {
[10211]106 print STDERR "util::filtered_rm_r could not open directory $file\n";
[4]107 } else {
108 my @filedir = grep (!/^\.\.?$/, readdir (INDIR));
109 closedir (INDIR);
[10211]110
[4]111 # remove all the files in this directory
[10211]112 map {$_="$file/$_";} @filedir;
113 &filtered_rm_r (\@filedir,$file_accept_re,$file_reject_re);
[4]114
[10211]115 if (!defined $file_accept_re && !defined $file_reject_re) {
116 # remove this directory
117 if (!rmdir $file) {
118 print STDERR "util::filtered_rm_r couldn't remove directory $file\n";
119 }
[4]120 }
121 }
[10211]122 } else {
123 next if (defined $file_reject_re && ($file =~ m/$file_reject_re/));
[4]124
[10211]125 if ((!defined $file_accept_re) || ($file =~ m/$file_accept_re/)) {
126 # remove this file
127 &rm ($file);
128 }
[4]129 }
130 }
131}
132
[10211]133
134# recursive removal
135sub rm_r {
136 my (@files) = @_;
137
138 # use the more general (but reterospectively written function
139 # filtered_rm_r function()
140
141 filtered_rm_r(\@files,undef,undef); # no accept or reject expressions
142}
143
144
145
146
[721]147# moves a file or a group of files
148sub mv {
149 my $dest = pop (@_);
150 my (@srcfiles) = @_;
[4]151
[721]152 # remove trailing slashes from source and destination files
153 $dest =~ s/[\\\/]+$//;
154 map {$_ =~ s/[\\\/]+$//;} @srcfiles;
155
156 # a few sanity checks
157 if (scalar (@srcfiles) == 0) {
158 print STDERR "util::mv no destination directory given\n";
159 return;
160 } elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) {
161 print STDERR "util::mv if multiple source files are given the ".
162 "destination must be a directory\n";
163 return;
164 }
165
166 # move the files
[8716]167 foreach my $file (@srcfiles) {
[721]168 my $tempdest = $dest;
169 if (-d $tempdest) {
170 my ($filename) = $file =~ /([^\\\/]+)$/;
171 $tempdest .= "/$filename";
172 }
173 if (!-e $file) {
174 print STDERR "util::mv $file does not exist\n";
175 } else {
[25598]176 if(!rename ($file, $tempdest)) {
177 print STDERR "**** Failed to rename $file to $tempdest\n";
178 &File::Copy::copy($file, $tempdest);
179 &rm($file);
180 }
181 elsif(-e $file) { # rename (partially) succeeded) but srcfile still exists after rename
182 #print STDERR "*** srcfile $file still exists after rename to $tempdest\n";
183 if(!-e $tempdest) {
184 print STDERR "@@@@ ERROR: $tempdest does not exist\n";
185 }
186 # Sometimes the rename operation fails (as does File::Copy::move).
187 # This turns out to be because the files are hardlinked.
188 # Need to do a copy-delete in this case, however, the copy step is not necessary:
189 # the srcfile got renamed into tempdest, but srcfile itself still exists, delete it.
190 #&File::Copy::copy($file, $tempdest);
191
192 &rm($file);
193 }
[721]194 }
195 }
196}
197
[25554]198# Move the contents of source directory into target directory
199# (as opposed to merely replacing target dir with the src dir)
200# This can overwrite any files with duplicate names in the target
201# but other files and folders in the target will continue to exist
202sub mv_dir_contents {
203 my ($src_dir, $dest_dir) = @_;
204
205 # Obtain listing of all files within src_dir
206 # Note that readdir lists relative paths, as well as . and ..
207 opendir(DIR, "$src_dir");
208 my @files= readdir(DIR);
[25579]209 close(DIR);
[25554]210
[25572]211 my @full_path_files = ();
[25554]212 foreach my $file (@files) {
213 # process all except . and ..
[25572]214 unless($file eq "." || $file eq "..") {
[25578]215
[25579]216 my $dest_subdir = &filename_cat($dest_dir, $file); # $file is still a relative path
[25578]217
218 # construct absolute paths
219 $file = &filename_cat($src_dir, $file); # $file is now an absolute path
220
221 # Recurse on directories which have an equivalent in target dest_dir
222 # If $file is a directory that already exists in target $dest_dir,
223 # then a simple move operation will fail (definitely on Windows).
224 if(-d $file && -d $dest_subdir) {
[25579]225 #print STDERR "**** $file is a directory also existing in target, its contents to be copied to $dest_subdir\n";
[25578]226 &mv_dir_contents($file, $dest_subdir);
[25572]227
228 # now all content is moved across, delete empty dir in source folder
229 if(&is_dir_empty($file)) {
[25577]230 if (!rmdir $file) {
[25579]231 print STDERR "ERROR. util::mv_dir_contents couldn't remove directory $file\n";
[25577]232 }
[25572]233 } else { # error
[25598]234 print STDERR "ERROR. util::mv_dir_contents: subfolder $file still non-empty after moving contents to $dest_subdir\n";
[25572]235 }
[25578]236 } else { # process files and any directories that don't already exist with a simple move
[25572]237 push(@full_path_files, $file);
238 }
239 }
[25554]240 }
[25572]241
242 if(!&dir_exists($dest_dir)) { # create target toplevel folder or subfolders if they don't exist
243 &mk_dir($dest_dir);
244 }
[25598]245
246 #print STDERR "@@@@@ Copying files |".join(",", @full_path_files)."| to: $dest_dir\n";
247
[25572]248 if(@full_path_files) { # if non-empty, there's something to copy across
249 &mv(@full_path_files, $dest_dir);
250 }
[25554]251}
[721]252
[25554]253
[4]254# copies a file or a group of files
255sub cp {
256 my $dest = pop (@_);
257 my (@srcfiles) = @_;
258
259 # remove trailing slashes from source and destination files
260 $dest =~ s/[\\\/]+$//;
261 map {$_ =~ s/[\\\/]+$//;} @srcfiles;
262
263 # a few sanity checks
264 if (scalar (@srcfiles) == 0) {
265 print STDERR "util::cp no destination directory given\n";
266 return;
267 } elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) {
268 print STDERR "util::cp if multiple source files are given the ".
269 "destination must be a directory\n";
270 return;
271 }
272
273 # copy the files
[8716]274 foreach my $file (@srcfiles) {
[4]275 my $tempdest = $dest;
276 if (-d $tempdest) {
277 my ($filename) = $file =~ /([^\\\/]+)$/;
278 $tempdest .= "/$filename";
279 }
280 if (!-e $file) {
281 print STDERR "util::cp $file does not exist\n";
282 } elsif (!-f $file) {
283 print STDERR "util::cp $file is not a plain file\n";
284 } else {
285 &File::Copy::copy ($file, $tempdest);
286 }
287 }
288}
289
290
[721]291
[4]292# recursively copies a file or group of files
[1454]293# syntax: cp_r (sourcefiles, destination directory)
294# destination must be a directory - to copy one file to
295# another use cp instead
[4]296sub cp_r {
297 my $dest = pop (@_);
298 my (@srcfiles) = @_;
299
300 # a few sanity checks
301 if (scalar (@srcfiles) == 0) {
[1454]302 print STDERR "util::cp_r no destination directory given\n";
[4]303 return;
[1454]304 } elsif (-f $dest) {
305 print STDERR "util::cp_r destination must be a directory\n";
[4]306 return;
307 }
308
[1454]309 # create destination directory if it doesn't exist already
310 if (! -d $dest) {
311 my $store_umask = umask(0002);
312 mkdir ($dest, 0777);
313 umask($store_umask);
314 }
315
[4]316 # copy the files
[8716]317 foreach my $file (@srcfiles) {
[4]318
319 if (!-e $file) {
[1454]320 print STDERR "util::cp_r $file does not exist\n";
[4]321
322 } elsif (-d $file) {
[1586]323 # make the new directory
324 my ($filename) = $file =~ /([^\\\/]*)$/;
325 $dest = &util::filename_cat ($dest, $filename);
326 my $store_umask = umask(0002);
327 mkdir ($dest, 0777);
328 umask($store_umask);
[836]329
[4]330 # get the contents of this directory
331 if (!opendir (INDIR, $file)) {
332 print STDERR "util::cp_r could not open directory $file\n";
333 } else {
[1454]334 my @filedir = readdir (INDIR);
[4]335 closedir (INDIR);
[8716]336 foreach my $f (@filedir) {
[1454]337 next if $f =~ /^\.\.?$/;
338 # copy all the files in this directory
339 my $ff = &util::filename_cat ($file, $f);
340 &cp_r ($ff, $dest);
341 }
[4]342 }
343
344 } else {
[1454]345 &cp($file, $dest);
[4]346 }
347 }
348}
[21762]349# recursively copies a file or group of files
350# syntax: cp_r (sourcefiles, destination directory)
351# destination must be a directory - to copy one file to
352# another use cp instead
353sub cp_r_nosvn {
354 my $dest = pop (@_);
355 my (@srcfiles) = @_;
[4]356
[21762]357 # a few sanity checks
358 if (scalar (@srcfiles) == 0) {
359 print STDERR "util::cp_r no destination directory given\n";
360 return;
361 } elsif (-f $dest) {
362 print STDERR "util::cp_r destination must be a directory\n";
363 return;
364 }
365
366 # create destination directory if it doesn't exist already
367 if (! -d $dest) {
368 my $store_umask = umask(0002);
369 mkdir ($dest, 0777);
370 umask($store_umask);
371 }
372
373 # copy the files
374 foreach my $file (@srcfiles) {
375
376 if (!-e $file) {
377 print STDERR "util::cp_r $file does not exist\n";
378
379 } elsif (-d $file) {
380 # make the new directory
381 my ($filename) = $file =~ /([^\\\/]*)$/;
382 $dest = &util::filename_cat ($dest, $filename);
383 my $store_umask = umask(0002);
384 mkdir ($dest, 0777);
385 umask($store_umask);
386
387 # get the contents of this directory
388 if (!opendir (INDIR, $file)) {
389 print STDERR "util::cp_r could not open directory $file\n";
390 } else {
391 my @filedir = readdir (INDIR);
392 closedir (INDIR);
393 foreach my $f (@filedir) {
394 next if $f =~ /^\.\.?$/;
395 next if $f =~ /^\.svn$/;
396 # copy all the files in this directory
397 my $ff = &util::filename_cat ($file, $f);
398 &cp_r ($ff, $dest);
399 }
400 }
401
402 } else {
403 &cp($file, $dest);
404 }
405 }
406}
407
[11179]408# copies a directory and its contents, excluding subdirectories, into a new directory
409sub cp_r_toplevel {
410 my $dest = pop (@_);
411 my (@srcfiles) = @_;
[4]412
[11179]413 # a few sanity checks
414 if (scalar (@srcfiles) == 0) {
415 print STDERR "util::cp_r no destination directory given\n";
416 return;
417 } elsif (-f $dest) {
418 print STDERR "util::cp_r destination must be a directory\n";
419 return;
420 }
421
422 # create destination directory if it doesn't exist already
423 if (! -d $dest) {
424 my $store_umask = umask(0002);
425 mkdir ($dest, 0777);
426 umask($store_umask);
427 }
428
429 # copy the files
430 foreach my $file (@srcfiles) {
431
432 if (!-e $file) {
433 print STDERR "util::cp_r $file does not exist\n";
434
435 } elsif (-d $file) {
436 # make the new directory
437 my ($filename) = $file =~ /([^\\\/]*)$/;
438 $dest = &util::filename_cat ($dest, $filename);
439 my $store_umask = umask(0002);
440 mkdir ($dest, 0777);
441 umask($store_umask);
442
443 # get the contents of this directory
444 if (!opendir (INDIR, $file)) {
445 print STDERR "util::cp_r could not open directory $file\n";
446 } else {
447 my @filedir = readdir (INDIR);
448 closedir (INDIR);
449 foreach my $f (@filedir) {
450 next if $f =~ /^\.\.?$/;
451
452 # copy all the files in this directory, but not directories
453 my $ff = &util::filename_cat ($file, $f);
454 if (-f $ff) {
455 &cp($ff, $dest);
456 #&cp_r ($ff, $dest);
457 }
458 }
459 }
460
461 } else {
462 &cp($file, $dest);
463 }
464 }
465}
466
[721]467sub mk_dir {
468 my ($dir) = @_;
469
[836]470 my $store_umask = umask(0002);
471 my $mkdir_ok = mkdir ($dir, 0777);
472 umask($store_umask);
473
474 if (!$mkdir_ok)
475 {
[721]476 print STDERR "util::mk_dir could not create directory $dir\n";
477 return;
478 }
479}
480
[1046]481# in case anyone cares - I did some testing (using perls Benchmark module)
482# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
483# slightly faster (surprisingly) - Stefan.
[4]484sub mk_all_dir {
485 my ($dir) = @_;
486
487 # use / for the directory separator, remove duplicate and
488 # trailing slashes
489 $dir=~s/[\\\/]+/\//g;
490 $dir=~s/[\\\/]+$//;
491
492 # make sure the cache directory exists
493 my $dirsofar = "";
494 my $first = 1;
[8716]495 foreach my $dirname (split ("/", $dir)) {
[4]496 $dirsofar .= "/" unless $first;
497 $first = 0;
498
499 $dirsofar .= $dirname;
500
501 next if $dirname =~ /^(|[a-z]:)$/i;
[836]502 if (!-e $dirsofar)
503 {
504 my $store_umask = umask(0002);
505 my $mkdir_ok = mkdir ($dirsofar, 0777);
506 umask($store_umask);
507 if (!$mkdir_ok)
508 {
509 print STDERR "util::mk_all_dir could not create directory $dirsofar\n";
510 return;
511 }
512 }
[4]513 }
514}
515
[619]516# make hard link to file if supported by OS, otherwise copy the file
517sub hard_link {
[18463]518 my ($src, $dest, $verbosity) = @_;
[4]519
[619]520 # remove trailing slashes from source and destination files
521 $src =~ s/[\\\/]+$//;
522 $dest =~ s/[\\\/]+$//;
523
[23307]524## print STDERR "**** src = ", unicode::debug_unicode_string($src),"\n";
[619]525 # a few sanity checks
[812]526 if (-e $dest) {
527 # destination file already exists
528 return;
529 }
530 elsif (!-e $src) {
[23307]531 print STDERR "util::hard_link source file \"$src\" does not exist\n";
[3628]532 return 1;
[619]533 }
534 elsif (-d $src) {
[23307]535 print STDERR "util::hard_link source \"$src\" is a directory\n";
[3628]536 return 1;
[619]537 }
538
539 my $dest_dir = &File::Basename::dirname($dest);
540 mk_all_dir($dest_dir) if (!-e $dest_dir);
541
[14365]542
[22119]543 if (!link($src, $dest)) {
[18463]544 if ((!defined $verbosity) || ($verbosity>2)) {
545 print STDERR "util::hard_link: unable to create hard link. ";
546 print STDERR " Copying file: $src -> $dest\n";
547 }
[14365]548 &File::Copy::copy ($src, $dest);
[619]549 }
[3628]550 return 0;
[619]551}
552
[2193]553# make soft link to file if supported by OS, otherwise copy file
[721]554sub soft_link {
[15165]555 my ($src, $dest, $ensure_paths_absolute) = @_;
[619]556
[721]557 # remove trailing slashes from source and destination files
558 $src =~ s/[\\\/]+$//;
559 $dest =~ s/[\\\/]+$//;
[619]560
[15165]561 # Ensure file paths are absolute IF requested to do so
562 # Soft_linking didn't work for relative paths
563 if(defined $ensure_paths_absolute && $ensure_paths_absolute) {
564 # We need to ensure that the src file is the absolute path
565 # See http://perldoc.perl.org/File/Spec.html
566 if(!File::Spec->file_name_is_absolute( $src )) { # it's relative
567 $src = File::Spec->rel2abs($src); # make absolute
568 }
569 # Might as well ensure that the destination file's absolute path is used
570 if(!File::Spec->file_name_is_absolute( $dest )) {
571 $dest = File::Spec->rel2abs($dest); # make absolute
572 }
573 }
574
[721]575 # a few sanity checks
576 if (!-e $src) {
577 print STDERR "util::soft_link source file $src does not exist\n";
578 return 0;
579 }
[619]580
[721]581 my $dest_dir = &File::Basename::dirname($dest);
582 mk_all_dir($dest_dir) if (!-e $dest_dir);
[14365]583
[2193]584 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
[23484]585
[14365]586 # symlink not supported on windows
587 &File::Copy::copy ($src, $dest);
[2193]588
589 } elsif (!eval {symlink($src, $dest)}) {
[2974]590 print STDERR "util::soft_link: unable to create soft link.\n";
[721]591 return 0;
592 }
593
594 return 1;
595}
596
[23362]597# Primarily for filenames generated by processing
598# content of HTML files (which are mapped to UTF-8 internally)
599#
600# To turn this into an octet string that really exists on the file
601# system:
602# 1. don't need to do anything special for Unix-based systems
603# (as underlying file system is byte-code)
604# 2. need to map to short DOS filenames for Windows
[721]605
[23362]606sub utf8_to_real_filename
607{
608 my ($utf8_filename) = @_;
[721]609
[23362]610 my $real_filename;
[721]611
[23362]612 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
613 require Win32;
[23388]614
615 print STDERR "***** utf8 filename = $utf8_filename\n\n\n";
616
[23362]617 my $unicode_filename = decode("utf8",$utf8_filename);
618 $real_filename = Win32::GetShortPathName($unicode_filename);
619 }
620 else {
621 $real_filename = $utf8_filename;
622 }
623
624 return $real_filename;
625}
626
627
628sub fd_exists
629{
630 my $filename_full_path = shift @_;
631 my $test_op = shift @_ || "-e";
632
633 # By default tests for existance of file or directory (-e)
634 # Can be made more specific by providing second parameter (e.g. -f or -d)
635
636 my $exists = 0;
637
638 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
639 require Win32;
640 my $filename_short_path = Win32::GetShortPathName($filename_full_path);
641 if (!defined $filename_short_path) {
642 # Was probably still in UTF8 form (not what is needed on Windows)
643 my $unicode_filename_full_path = eval "decode(\"utf8\",\$filename_full_path)";
644 if (defined $unicode_filename_full_path) {
645 $filename_short_path = Win32::GetShortPathName($unicode_filename_full_path);
646 }
647 }
648 $filename_full_path = $filename_short_path;
649 }
650
651 if (defined $filename_full_path) {
652 $exists = eval "($test_op \$filename_full_path)";
653 }
654
655 return $exists;
656}
657
658sub file_exists
659{
660 my ($filename_full_path) = @_;
661
662 return fd_exists($filename_full_path,"-f");
663}
664
665sub dir_exists
666{
667 my ($filename_full_path) = @_;
668
669 return fd_exists($filename_full_path,"-d");
670}
671
672
673
[4]674# updates a copy of a directory in some other part of the filesystem
675# verbosity settings are: 0=low, 1=normal, 2=high
676# both $fromdir and $todir should be absolute paths
677sub cachedir {
678 my ($fromdir, $todir, $verbosity) = @_;
679 $verbosity = 1 unless defined $verbosity;
680
681 # use / for the directory separator, remove duplicate and
682 # trailing slashes
683 $fromdir=~s/[\\\/]+/\//g;
684 $fromdir=~s/[\\\/]+$//;
685 $todir=~s/[\\\/]+/\//g;
686 $todir=~s/[\\\/]+$//;
687
688 &mk_all_dir ($todir);
689
690 # get the directories in ascending order
691 if (!opendir (FROMDIR, $fromdir)) {
692 print STDERR "util::cachedir could not read directory $fromdir\n";
693 return;
694 }
695 my @fromdir = grep (!/^\.\.?$/, sort(readdir (FROMDIR)));
696 closedir (FROMDIR);
697
698 if (!opendir (TODIR, $todir)) {
699 print STDERR "util::cacedir could not read directory $todir\n";
700 return;
701 }
702 my @todir = grep (!/^\.\.?$/, sort(readdir (TODIR)));
703 closedir (TODIR);
704
705 my $fromi = 0;
706 my $toi = 0;
707
708 while ($fromi < scalar(@fromdir) || $toi < scalar(@todir)) {
709# print "fromi: $fromi toi: $toi\n";
710
711 # see if we should delete a file/directory
712 # this should happen if the file/directory
713 # is not in the from list or if its a different
714 # size, or has an older timestamp
715 if ($toi < scalar(@todir)) {
716 if (($fromi >= scalar(@fromdir)) ||
717 ($todir[$toi] lt $fromdir[$fromi] ||
718 ($todir[$toi] eq $fromdir[$fromi] &&
719 &differentfiles("$fromdir/$fromdir[$fromi]","$todir/$todir[$toi]",
720 $verbosity)))) {
721
722 # the files are different
723 &rm_r("$todir/$todir[$toi]");
724 splice(@todir, $toi, 1); # $toi stays the same
725
726 } elsif ($todir[$toi] eq $fromdir[$fromi]) {
727 # the files are the same
728 # if it is a directory, check its contents
729 if (-d "$todir/$todir[$toi]") {
730 &cachedir ("$fromdir/$fromdir[$fromi]",
731 "$todir/$todir[$toi]", $verbosity);
732 }
733
734 $toi++;
735 $fromi++;
736 next;
737 }
738 }
739
740 # see if we should insert a file/directory
741 # we should insert a file/directory if there
742 # is no tofiles left or if the tofile does not exist
743 if ($fromi < scalar(@fromdir) && ($toi >= scalar(@todir) ||
744 $todir[$toi] gt $fromdir[$fromi])) {
745 &cp_r ("$fromdir/$fromdir[$fromi]", "$todir/$fromdir[$fromi]");
746 splice (@todir, $toi, 0, $fromdir[$fromi]);
747
748 $toi++;
749 $fromi++;
750 }
751 }
752}
753
754# this function returns -1 if either file is not found
755# assumes that $file1 and $file2 are absolute file names or
756# in the current directory
757# $file2 is allowed to be newer than $file1
758sub differentfiles {
759 my ($file1, $file2, $verbosity) = @_;
760 $verbosity = 1 unless defined $verbosity;
761
762 $file1 =~ s/\/+$//;
763 $file2 =~ s/\/+$//;
764
765 my ($file1name) = $file1 =~ /\/([^\/]*)$/;
766 my ($file2name) = $file2 =~ /\/([^\/]*)$/;
767
768 return -1 unless (-e $file1 && -e $file2);
769 if ($file1name ne $file2name) {
770 print STDERR "filenames are not the same\n" if ($verbosity >= 2);
771 return 1;
772 }
773
[8716]774 my @file1stat = stat ($file1);
775 my @file2stat = stat ($file2);
[4]776
777 if (-d $file1) {
778 if (! -d $file2) {
779 print STDERR "one file is a directory\n" if ($verbosity >= 2);
780 return 1;
781 }
782 return 0;
783 }
784
785 # both must be regular files
786 unless (-f $file1 && -f $file2) {
787 print STDERR "one file is not a regular file\n" if ($verbosity >= 2);
788 return 1;
789 }
790
791 # the size of the files must be the same
792 if ($file1stat[7] != $file2stat[7]) {
793 print STDERR "different sized files\n" if ($verbosity >= 2);
794 return 1;
795 }
796
797 # the second file cannot be older than the first
798 if ($file1stat[9] > $file2stat[9]) {
799 print STDERR "file is older\n" if ($verbosity >= 2);
800 return 1;
801 }
802
803 return 0;
804}
805
806
[16266]807sub get_tmp_filename
808{
809 my $file_ext = shift(@_) || undef;
810
[22438]811 my $opt_dot_file_ext = "";
812 if (defined $file_ext) {
813 if ($file_ext !~ m/\./) {
814 # no dot, so needs one added in at start
815 $opt_dot_file_ext = ".$file_ext"
816 }
817 else {
818 # allow for "extensions" such as _metadata.txt to be handled
819 # gracefully
820 $opt_dot_file_ext = $file_ext;
821 }
822 }
[16266]823
[2795]824 my $tmpdir = filename_cat($ENV{'GSDLHOME'}, "tmp");
[4]825 &mk_all_dir ($tmpdir) unless -e $tmpdir;
826
827 my $count = 1000;
828 my $rand = int(rand $count);
[16266]829 my $full_tmp_filename = &filename_cat($tmpdir, "F$rand$opt_dot_file_ext");
830
831 while (-e $full_tmp_filename) {
[4]832 $rand = int(rand $count);
[16266]833 $full_tmp_filename = &filename_cat($tmpdir, "F$rand$opt_dot_file_ext");
[4]834 $count++;
835 }
[16266]836
837 return $full_tmp_filename;
[4]838}
839
[22886]840sub get_timestamped_tmp_folder
[22873]841{
842
[22886]843 my $tmp_dirname;
[22873]844 if(defined $ENV{'GSDLCOLLECTDIR'}) {
845 $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
846 } elsif(defined $ENV{'GSDLHOME'}) {
847 $tmp_dirname = $ENV{'GSDLHOME'};
[22886]848 } else {
849 return undef;
[22873]850 }
851
852 $tmp_dirname = &util::filename_cat($tmp_dirname, "tmp");
853 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
854
855 # add the timestamp into the path otherwise we can run into problems
856 # if documents have the same name
857 my $timestamp = time;
858 my $time_tmp_dirname = &util::filename_cat($tmp_dirname, $timestamp);
859 $tmp_dirname = $time_tmp_dirname;
860 my $i = 1;
861 while (-e $tmp_dirname) {
862 $tmp_dirname = "$time_tmp_dirname$i";
863 $i++;
864 }
865 &util::mk_dir($tmp_dirname);
866
[22886]867 return $tmp_dirname;
868}
[22873]869
[22886]870sub get_timestamped_tmp_filename_in_collection
871{
872
873 my ($input_filename, $output_ext) = @_;
874 # derive tmp filename from input filename
875 my ($tailname, $dirname, $suffix)
876 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
877
878 # softlink to collection tmp dir
879 my $tmp_dirname = &util::get_timestamped_tmp_folder();
880 $tmp_dirname = $dirname unless defined $tmp_dirname;
881
[22873]882 # following two steps copied from ConvertBinaryFile
[22886]883 # do we need them?? can't use them as is, as they use plugin methods.
884
[22873]885 #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
886
887 # URLEncode this since htmls with images where the html filename is utf8 don't seem
888 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
889 # files on the filesystem.
890 #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
891 if (defined $output_ext) {
892 $output_ext = ".$output_ext"; # add the dot
893 } else {
894 $output_ext = $suffix;
895 }
896 $output_ext= lc($output_ext);
897 my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$output_ext");
898
899 return $tmp_filename;
900}
901
[21218]902sub get_toplevel_tmp_dir
903{
904 return filename_cat($ENV{'GSDLHOME'}, "tmp");
905}
906
907
[17512]908sub filename_to_regex {
909 my $filename = shift (@_);
[4]910
[24971]911 # need to make single backslashes double so that regex works
[24832]912 $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);
[24829]913
[24832]914 # note that the first part of a substitution is a regex, so RE chars need to be escaped,
915 # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
[24829]916 $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
917 $filename =~ s@\(@\\(@g; # escape brackets
918 $filename =~ s@\)@\\)@g; # escape brackets
[24932]919 $filename =~ s@\[@\\[@g; # escape brackets
920 $filename =~ s@\]@\\]@g; # escape brackets
[24829]921
[17512]922 return $filename;
923}
924
[24829]925sub unregex_filename {
926 my $filename = shift (@_);
927
928 # need to put doubled backslashes for regex back to single
929 $filename =~ s/\\\./\./g; # remove RE syntax for .
930 $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
931 $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
[24932]932 $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
933 $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
[24940]934
935 # \\ goes to \
936 # This is the last step in reverse mirroring the order of steps in filename_to_regex()
937 $filename =~ s/\\\\/\\/g; # remove RE syntax for \
[24829]938 return $filename;
939}
940
[4]941sub filename_cat {
[7507]942 my $first_file = shift(@_);
[4]943 my (@filenames) = @_;
[10146]944
[16266]945# Useful for debugging
946# -- might make sense to call caller(0) rather than (1)??
947# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
[22856]948# print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
[18913]949
950 # If first_file is not null or empty, then add it back into the list
951 if (defined $first_file && $first_file =~ /\S/) {
[7507]952 unshift(@filenames, $first_file);
953 }
954
[4]955 my $filename = join("/", @filenames);
956
957 # remove duplicate slashes and remove the last slash
[488]958 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
959 $filename =~ s/[\\\/]+/\\/g;
960 } else {
[836]961 $filename =~ s/[\/]+/\//g;
962 # DB: want a filename abc\de.html to remain like this
[488]963 }
964 $filename =~ s/[\\\/]$//;
[4]965
966 return $filename;
967}
968
[21413]969
970sub pathname_cat {
971 my $first_path = shift(@_);
972 my (@pathnames) = @_;
973
974 # If first_path is not null or empty, then add it back into the list
975 if (defined $first_path && $first_path =~ /\S/) {
976 unshift(@pathnames, $first_path);
977 }
978
[21425]979 my $join_char;
[21413]980 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
981 $join_char = ";";
982 } else {
983 $join_char = ":";
984 }
985
986 my $pathname = join($join_char, @pathnames);
987
988 # remove duplicate slashes
989 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
990 $pathname =~ s/[\\\/]+/\\/g;
991 } else {
992 $pathname =~ s/[\/]+/\//g;
993 # DB: want a pathname abc\de.html to remain like this
994 }
995
996 return $pathname;
997}
998
999
[19616]1000sub tidy_up_oid {
1001 my ($OID) = @_;
1002 if ($OID =~ /\./) {
1003 print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
1004 $OID =~ s/\.//g; #remove any periods
1005 }
1006 if ($OID =~ /^\s.*\s$/) {
1007 print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
1008 # remove starting and trailing whitespace
1009 $OID =~ s/^\s+//;
1010 $OID =~ s/\s+$//;
1011 }
1012 if ($OID =~ /^[\d]*$/) {
1013 print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
1014 $OID = "D" . $OID;
1015 }
1016
1017 return $OID;
1018}
[10212]1019sub envvar_prepend {
1020 my ($var,$val) = @_;
1021
[16404]1022 # do not prepend any value/path that's already in the environment variable
[24832]1023
1024 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
1025 if (!defined($ENV{$var})) {
1026 $ENV{$var} = "$val";
[16442]1027 }
[24832]1028 elsif($ENV{$var} !~ m/$escaped_val/) {
1029 $ENV{$var} = "$val;".$ENV{$var};
[10212]1030 }
1031}
1032
1033sub envvar_append {
1034 my ($var,$val) = @_;
[24832]1035
[16404]1036 # do not append any value/path that's already in the environment variable
[24832]1037
1038 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
1039 if (!defined($ENV{$var})) {
1040 $ENV{$var} = "$val";
[16442]1041 }
[24832]1042 elsif($ENV{$var} !~ m/$escaped_val/) {
1043 $ENV{$var} .= ";$val";
1044 }
[10212]1045}
1046
[16442]1047
[16380]1048# splits a filename into a prefix and a tail extension using the tail_re, or
1049# if that fails, splits on the file_extension . (dot)
1050sub get_prefix_and_tail_by_regex {
[10212]1051
[16380]1052 my ($filename,$tail_re) = @_;
1053
1054 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
1055 if ((!defined $file_prefix) || (!defined $file_ext)) {
1056 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
1057 }
1058
1059 return ($file_prefix,$file_ext);
1060}
1061
1062# get full path and file only path from a base_dir (which may be empty) and
1063# file (which may contain directories)
1064sub get_full_filenames {
1065 my ($base_dir, $file) = @_;
1066
1067 my $filename_full_path = $file;
1068 # add on directory if present
1069 $filename_full_path = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\S/;
1070
1071 my $filename_no_path = $file;
1072
1073 # remove directory if present
1074 $filename_no_path =~ s/^.*[\/\\]//;
1075 return ($filename_full_path, $filename_no_path);
1076}
1077
[8682]1078# returns the path of a file without the filename -- ie. the directory the file is in
1079sub filename_head {
1080 my $filename = shift(@_);
1081
1082 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1083 $filename =~ s/[^\\\\]*$//;
1084 }
1085 else {
1086 $filename =~ s/[^\\\/]*$//;
1087 }
1088
1089 return $filename;
1090}
1091
1092
[23362]1093
[1454]1094# returns 1 if filename1 and filename2 point to the same
1095# file or directory
1096sub filenames_equal {
1097 my ($filename1, $filename2) = @_;
1098
1099 # use filename_cat to clean up trailing slashes and
1100 # multiple slashes
1101 $filename1 = filename_cat ($filename1);
[2516]1102 $filename2 = filename_cat ($filename2);
[1454]1103
1104 # filenames not case sensitive on windows
1105 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1106 $filename1 =~ tr/[A-Z]/[a-z]/;
1107 $filename2 =~ tr/[A-Z]/[a-z]/;
1108 }
1109 return 1 if $filename1 eq $filename2;
1110 return 0;
1111}
1112
[24932]1113# If filename is relative to within_dir, returns the relative path of filename to that directory
1114# with slashes in the filename returned as they were in the original (absolute) filename.
[23362]1115sub filename_within_directory
1116{
1117 my ($filename,$within_dir) = @_;
1118
[23371]1119 if ($within_dir !~ m/[\/\\]$/) {
1120 my $dirsep = &util::get_dirsep();
[23362]1121 $within_dir .= $dirsep;
1122 }
1123
[24829]1124 $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets
[23362]1125 if ($filename =~ m/^$within_dir(.*)$/) {
1126 $filename = $1;
1127 }
1128
1129 return $filename;
1130}
1131
[24932]1132# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
1133# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
1134# The subpath returned will also be a URL type filename.
1135sub filename_within_directory_url_format
1136{
1137 my ($filename,$within_dir) = @_;
1138
1139 # convert parameters only to / slashes if Windows
1140
[24971]1141 my $filename_urlformat = &filepath_to_url_format($filename);
1142 my $within_dir_urlformat = &filepath_to_url_format($within_dir);
1143
[24932]1144 #if ($within_dir_urlformat !~ m/\/$/) {
1145 # make sure directory ends with a slash
1146 #$within_dir_urlformat .= "/";
1147 #}
1148
1149 my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
1150
1151 #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
1152
1153 # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
1154 if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
1155 $filename_urlformat = $1;
1156 }
1157
1158 return $filename_urlformat;
1159}
1160
[24971]1161# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
1162# since on Linux it doesn't represent a file separator but an escape char).
1163sub filepath_to_url_format
1164{
1165 my ($filepath) = @_;
1166 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1167 # Only need to worry about Windows, as Unix style directories already in url-format
1168 # Convert Windows style \ => /
1169 $filepath =~ s@\\@/@g;
1170 }
1171 return $filepath;
1172}
[24932]1173
[25093]1174# regex filepaths on windows may include \\ as path separator. Convert \\ to /
1175sub filepath_regex_to_url_format
1176{
1177 my ($filepath) = @_;
1178 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1179 # Only need to worry about Windows, as Unix style directories already in url-format
1180 # Convert Windows style \\ => /
1181 $filepath =~ s@\\\\@/@g;
1182 }
1183 return $filepath;
1184
1185}
[24971]1186
[25093]1187# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
1188# and ignores trailing /
1189# returns (file, dirs) dirs will be empty if no subdirs
1190sub url_fileparse
1191{
1192 my ($filepath) = @_;
1193 # remove trailing /
1194 $filepath =~ s@/$@@;
1195 if ($filepath !~ m@/@) {
1196 return ($filepath, "");
1197 }
1198 my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
1199 return ($file, $dirs);
1200
1201}
1202
1203
[10281]1204sub filename_within_collection
1205{
1206 my ($filename) = @_;
1207
1208 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
1209
1210 if (defined $collect_dir) {
[23362]1211
[15875]1212 # if from within GSDLCOLLECTDIR, then remove directory prefix
1213 # so source_filename is realative to it. This is done to aid
1214 # portability, i.e. the collection can be moved to somewhere
1215 # else on the file system and the archives directory will still
1216 # work. This is needed, for example in the applet version of
1217 # GLI where GSDLHOME/collect on the server will be different to
1218 # the collect directory of the remove user. Of course,
1219 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
1220 # it back into a full pathname.
[23362]1221
1222 $filename = filename_within_directory($filename,$collect_dir);
[10281]1223 }
1224
1225 return $filename;
1226}
1227
[23362]1228sub prettyprint_file
1229{
[23484]1230 my ($base_dir,$file,$gli) = @_;
[23362]1231
1232 my $filename_full_path = &util::filename_cat($base_dir,$file);
1233
1234 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1235 require Win32;
1236
1237 # For some reason base_dir in the form c:/a/b/c
1238 # This leads to confusion later on, so turn it back into
1239 # the more usual Windows form
1240 $base_dir =~ s/\//\\/g;
1241 my $long_base_dir = Win32::GetLongPathName($base_dir);
1242 my $long_full_path = Win32::GetLongPathName($filename_full_path);
1243
1244 $file = filename_within_directory($long_full_path,$long_base_dir);
[23484]1245 $file = encode("utf8",$file) if ($gli);
[23362]1246 }
1247
1248 return $file;
1249}
1250
1251
1252sub upgrade_if_dos_filename
1253{
[23371]1254 my ($filename_full_path,$and_encode) = @_;
[23362]1255
1256 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1257 # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
1258 # to its long (Windows) version
[23416]1259 my $long_filename = Win32::GetLongPathName($filename_full_path);
1260 if (defined $long_filename) {
1261 $filename_full_path = $long_filename;
1262 }
[23362]1263 # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
[23483]1264 $filename_full_path =~ s/^(.):/\u$1:/;
[23371]1265 if ((defined $and_encode) && ($and_encode)) {
1266 $filename_full_path = encode("utf8",$filename_full_path);
1267 }
[23362]1268 }
1269
1270 return $filename_full_path;
1271}
1272
1273
[23388]1274sub downgrade_if_dos_filename
1275{
1276 my ($filename_full_path) = @_;
1277
1278 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1279 require Win32;
1280
1281 # Ensure the given long Windows filename is in a form that can
1282 # be opened by Perl => convert it to a short DOS-like filename
1283
[23414]1284 my $short_filename = Win32::GetShortPathName($filename_full_path);
1285 if (defined $short_filename) {
1286 $filename_full_path = $short_filename;
1287 }
[23416]1288 # Make sure initial drive letter is lower-case (to fit in
1289 # with rest of Greenstone)
[23483]1290 $filename_full_path =~ s/^(.):/\u$1:/;
[23388]1291 }
1292
1293 return $filename_full_path;
1294}
1295
[23561]1296sub block_filename
1297{
1298 my ($block_hash,$filename) = @_;
1299
1300 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
1301
1302 # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
1303 my $lower_filename = lc($filename);
1304 $block_hash->{'file_blocks'}->{$lower_filename} = 1;
1305# my $lower_drive = $filename;
1306# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
1307
1308# my $upper_drive = $filename;
1309# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
1310#
1311# $block_hash->{'file_blocks'}->{$lower_drive} = 1;
1312# $block_hash->{'file_blocks'}->{$upper_drive} = 1;
1313 }
1314 else {
1315 $block_hash->{'file_blocks'}->{$filename} = 1;
1316 }
1317}
[23388]1318
[23561]1319
[18441]1320sub filename_is_absolute
1321{
1322 my ($filename) = @_;
1323
1324 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1325 return ($filename =~ m/^(\w:)?\\/);
1326 }
1327 else {
1328 return ($filename =~ m/^\//);
1329 }
1330}
1331
1332
[17572]1333## @method make_absolute()
1334#
1335# Ensure the given file path is absolute in respect to the given base path.
1336#
1337# @param $base_dir A string denoting the base path the given dir must be
1338# absolute to.
1339# @param $dir The directory to be made absolute as a string. Note that the
1340# dir may already be absolute, in which case it will remain
1341# unchanged.
1342# @return The now absolute form of the directory as a string.
1343#
1344# @author John Thompson, DL Consulting Ltd.
1345# @copy 2006 DL Consulting Ltd.
1346#
1347#used in buildcol.pl, doesn't work for all cases --kjdon
1348sub make_absolute {
1349
1350 my ($base_dir, $dir) = @_;
[18441]1351### print STDERR "dir = $dir\n";
[17572]1352 $dir =~ s/[\\\/]+/\//g;
1353 $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
1354 $dir =~ s|^/tmp_mnt||;
1355 1 while($dir =~ s|/[^/]*/\.\./|/|g);
1356 $dir =~ s|/[.][.]?/|/|g;
1357 $dir =~ tr|/|/|s;
[18441]1358### print STDERR "dir = $dir\n";
[17572]1359
1360 return $dir;
1361}
1362## make_absolute() ##
[10281]1363
[7929]1364sub get_dirsep {
1365
1366 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1367 return "\\";
1368 } else {
1369 return "\/";
1370 }
1371}
1372
[619]1373sub get_os_dirsep {
[4]1374
[619]1375 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1376 return "\\\\";
1377 } else {
1378 return "\\\/";
1379 }
1380}
1381
1382sub get_re_dirsep {
1383
1384 return "\\\\|\\\/";
1385}
1386
1387
[15003]1388sub get_dirsep_tail {
1389 my ($filename) = @_;
1390
1391 # returns last part of directory or filename
1392 # On unix e.g. a/b.d => b.d
1393 # a/b/c => c
1394
[15088]1395 my $dirsep = get_re_dirsep();
1396 my @dirs = split (/$dirsep/, $filename);
1397 my $tail = pop @dirs;
[15003]1398
[15088]1399 # - caused problems under windows
1400 #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
1401
[15003]1402 return $tail;
1403}
1404
1405
[4]1406# if this is running on windows we want binaries to end in
1407# .exe, otherwise they don't have to end in any extension
1408sub get_os_exe {
1409 return ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
1410 return "";
1411}
1412
1413
[86]1414# test to see whether this is a big or little endian machine
[15713]1415sub is_little_endian
1416{
1417 # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
1418 # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
1419 # Otherwise, it's little endian
1420
1421 #return 0 if $^O =~ /^darwin$/i;
[17714]1422 #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
1423
1424 # Going back to stating exactly whether the machine is little endian
1425 # or big endian, without any special case for Macs. Since for rata it comes
1426 # back with little endian and for shuttle with bigendian.
[15713]1427 return (ord(substr(pack("s",1), 0, 1)) == 1);
[86]1428}
[4]1429
[86]1430
[135]1431# will return the collection name if successful, "" otherwise
1432sub use_collection {
[1454]1433 my ($collection, $collectdir) = @_;
[135]1434
[1454]1435 if (!defined $collectdir || $collectdir eq "") {
1436 $collectdir = &filename_cat ($ENV{'GSDLHOME'}, "collect");
1437 }
1438
[135]1439 # get and check the collection
1440 if (!defined($collection) || $collection eq "") {
1441 if (defined $ENV{'GSDLCOLLECTION'}) {
1442 $collection = $ENV{'GSDLCOLLECTION'};
1443 } else {
[2359]1444 print STDOUT "No collection specified\n";
[135]1445 return "";
1446 }
1447 }
1448
1449 if ($collection eq "modelcol") {
[2359]1450 print STDOUT "You can't use modelcol.\n";
[135]1451 return "";
1452 }
1453
1454 # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
1455 # are defined
[17204]1456 $ENV{'GSDLCOLLECTION'} = $collection;
[1454]1457 $ENV{'GSDLCOLLECTDIR'} = &filename_cat ($collectdir, $collection);
[135]1458
1459 # make sure this collection exists
1460 if (!-e $ENV{'GSDLCOLLECTDIR'}) {
[2359]1461 print STDOUT "Invalid collection ($collection).\n";
[135]1462 return "";
1463 }
1464
1465 # everything is ready to go
1466 return $collection;
1467}
1468
[21207]1469sub get_current_collection_name {
1470 return $ENV{'GSDLCOLLECTION'};
1471}
[14926]1472
1473
1474# will return the collection name if successful, "" otherwise.
1475# Like use_collection (above) but for greenstone 3 (taking account of site level)
1476
1477sub use_site_collection {
1478 my ($site, $collection, $collectdir) = @_;
1479
1480 if (!defined $collectdir || $collectdir eq "") {
1481 die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
1482 $collectdir = &filename_cat ($ENV{'GSDL3HOME'}, "sites", $site, "collect");
1483 }
1484
1485 # collectdir explicitly set by this point (using $site variable if required).
1486 # Can call "old" gsdl2 use_collection now.
1487
1488 return use_collection($collection,$collectdir);
1489}
1490
1491
1492
[15018]1493sub locate_config_file
1494{
1495 my ($file) = @_;
1496
1497 my $locations = locate_config_files($file);
1498
1499 return shift @$locations; # returns undef if 'locations' is empty
1500}
1501
1502
1503sub locate_config_files
1504{
1505 my ($file) = @_;
1506
1507 my @locations = ();
1508
1509 if (-e $file) {
1510 # Clearly specified (most likely full filename)
1511 # No need to hunt in 'etc' directories, return value unchanged
1512 push(@locations,$file);
1513 }
1514 else {
1515 # Check for collection specific one before looking in global GSDL 'etc'
[16969]1516 if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
1517 my $test_collect_etc_filename
1518 = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $file);
1519
1520 if (-e $test_collect_etc_filename) {
1521 push(@locations,$test_collect_etc_filename);
1522 }
[15018]1523 }
1524 my $test_main_etc_filename
1525 = &util::filename_cat($ENV{'GSDLHOME'},"etc", $file);
1526 if (-e $test_main_etc_filename) {
1527 push(@locations,$test_main_etc_filename);
1528 }
1529 }
1530
1531 return \@locations;
1532}
1533
1534
[9955]1535sub hyperlink_text
1536{
1537 my ($text) = @_;
1538
1539 $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
1540 $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
1541
1542 return $text;
1543}
1544
1545
[16436]1546# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
1547# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
1548sub is_dir_empty
1549{
1550 my ($path) = @_;
1551 opendir DIR, $path;
1552 while(my $entry = readdir DIR) {
1553 next if($entry =~ /^\.\.?$/);
1554 closedir DIR;
1555 return 0;
1556 }
1557 closedir DIR;
1558 return 1;
1559}
1560
[18337]1561# Returns the given filename converted using either URL encoding or base64
1562# encoding, as specified by $rename_method. If the given filename has no suffix
[20413]1563# (if it is just the tailname), then $no_suffix should be some defined value.
1564# rename_method can be url, none, base64
[18319]1565sub rename_file {
[18337]1566 my ($filename, $rename_method, $no_suffix) = @_;
[18329]1567
[18337]1568 if(!$filename) { # undefined or empty string
[18329]1569 return $filename;
1570 }
[18319]1571
[20413]1572 if (!$rename_method) {
1573 print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
1574 # Debugging information
[22856]1575 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1576 # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
[20413]1577 $rename_method = "url";
1578 } elsif($rename_method eq "none") {
1579 return $filename; # would have already been renamed
1580 }
1581
[19762]1582 # No longer replace spaces with underscores, since underscores mess with incremental rebuild
1583 ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
1584 ###$filename =~ s/ /_/g;
[18337]1585
1586 my ($tailname,$dirname,$suffix);
1587 if($no_suffix) { # given a tailname, no suffix
1588 ($tailname,$dirname) = File::Basename::fileparse($filename);
1589 }
1590 else {
1591 ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
1592 }
[23388]1593 if (!$suffix) {
1594 $suffix = "";
1595 }
1596 else {
1597 $suffix = lc($suffix);
1598 }
[18337]1599
[20413]1600 if ($rename_method eq "url") {
[18319]1601 $tailname = &unicode::url_encode($tailname);
1602 }
1603 elsif ($rename_method eq "base64") {
[18341]1604 $tailname = &unicode::base64_encode($tailname);
[18319]1605 $tailname =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
1606 }
[18326]1607
[18319]1608 $filename = "$tailname$suffix";
[18326]1609 $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
[18319]1610
1611 return $filename;
1612}
1613
[21616]1614
1615# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
[21664]1616sub rename_ldb_or_bdb_file {
[18657]1617 my ($filename_no_ext) = @_;
1618
1619 my $new_filename = "$filename_no_ext.gdb";
[21615]1620 return if (-f $new_filename); # if the file has the right extension, don't need to do anything
[18657]1621 # try ldb
1622 my $old_filename = "$filename_no_ext.ldb";
1623
1624 if (-f $old_filename) {
[19056]1625 print STDERR "Renaming $old_filename to $new_filename\n";
1626 rename ($old_filename, $new_filename)
1627 || print STDERR "Rename failed: $!\n";
[18657]1628 return;
1629 }
1630 # try bdb
1631 $old_filename = "$filename_no_ext.bdb";
1632 if (-f $old_filename) {
[19056]1633 print STDERR "Renaming $old_filename to $new_filename\n";
1634 rename ($old_filename, $new_filename)
1635 || print STDERR "Rename failed: $!\n";
[18657]1636 return;
1637 }
1638}
1639
[24874]1640sub os_dir() {
1641
1642 my $gsdlarch = "";
1643 if(defined $ENV{'GSDLARCH'}) {
1644 $gsdlarch = $ENV{'GSDLARCH'};
1645 }
1646 return $ENV{'GSDLOS'}.$gsdlarch;
1647}
[18657]1648
[21719]1649# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1650# By default, /greenstone3 for GS3 or /greenstone for GS2.
1651sub get_greenstone_url_prefix() {
1652 # if already set on a previous occasion, just return that
1653 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1654 return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
[18657]1655
[21719]1656 my ($configfile, $urlprefix, $defaultUrlprefix);
1657 my @propertynames = ();
1658
1659 if($ENV{'GSDL3SRCHOME'}) {
1660 $defaultUrlprefix = "/greenstone3";
1661 $configfile = &util::filename_cat($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1662 push(@propertynames, qw/path\s*\=/);
1663 } else {
1664 $defaultUrlprefix = "/greenstone";
[24874]1665 $configfile = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
[21719]1666 push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1667 }
1668
1669 $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1670
1671 if(!$urlprefix) { # no values found for URL prefix, use default values
1672 $urlprefix = $defaultUrlprefix;
1673 } else {
1674 #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1675 $urlprefix =~ s/^\///; # remove the starting slash
1676 my @dirs = split(/(\\|\/)/, $urlprefix);
1677 $urlprefix = shift(@dirs);
1678
1679 if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1680 $urlprefix = "/$urlprefix";
1681 }
1682 }
1683
1684 # set for the future
1685 $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1686# print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1687 return $urlprefix;
1688}
1689
1690
1691# Given a config file (xml or java properties file) and a list/array of regular expressions
1692# that represent property names to match on, this function will return the value for the 1st
1693# matching property name. If the return value is undefined, no matching property was found.
1694sub extract_propvalue_from_file() {
1695 my ($configfile, $propertynames) = @_;
1696
1697 my $value;
1698 unless(open(FIN, "<$configfile")) {
1699 print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1700 return $value; # not initialised
1701 }
1702
1703 # Read the entire file at once, as one single line, then close it
1704 my $filecontents;
1705 {
1706 local $/ = undef;
1707 $filecontents = <FIN>;
1708 }
1709 close(FIN);
1710
1711 foreach my $regex (@$propertynames) {
1712 ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1713 if($value) {
1714 $value =~ s/^\"//; # remove any startquotes
1715 $value =~ s/\".*$//; # remove the 1st endquotes (if any) followed by any xml
1716 last; # found value for a matching property, break from loop
1717 }
1718 }
1719
1720 return $value;
1721}
1722
[23306]1723# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1724# given that perllib is in @INC in order to invoke this subroutine.
1725# Call as follows -- after setting up INC to include perllib and
1726# after setting up GSDLHOME and GSDLOS:
1727#
1728# require util;
1729# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1730#
1731sub setup_greenstone_env() {
1732 my ($GSDLHOME, $GSDLOS) = @_;
1733
1734 #my %env_map = ();
1735 # Get the localised ENV settings of running a localised source setup.bash
[23314]1736 # and put it into the ENV here. Need to clear GSDLHOME before running setup
1737 #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1738 my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";
[23306]1739 if($GSDLOS =~ m/windows/i) {
[23314]1740 #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1741 $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
[23306]1742 }
1743 if (!open(PIN, "$perl_command |")) {
1744 print STDERR ("Unable to execute command: $perl_command. $!\n");
[24563]1745 }
[23306]1746
1747 while (defined (my $perl_output_line = <PIN>)) {
1748 my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1749 #$env_map{$key}=$value;
1750 $ENV{$key}=$value;
1751 }
[24563]1752 close (PIN);
1753
[23306]1754 # If any keys in $ENV don't occur in Greenstone's localised env
1755 # (stored in $env_map), delete those entries from $ENV
1756 #foreach $key (keys %ENV) {
1757 # if(!defined $env_map{$key}) {
1758 # print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{'$key'}\n";
1759 # delete $ENV{$key}; # del $ENV(key, value) pair
1760 # }
1761 #}
1762 #undef %env_map;
1763}
1764
[24362]1765sub get_perl_exec() {
1766 my $perl_exec = $^X; # may return just "perl"
1767
1768 if($ENV{'PERLPATH'}) {
1769 # OR: # $perl_exec = &util::filename_cat($ENV{'PERLPATH'},"perl");
1770 if($ENV{'GSDLOS'} =~ m/windows/) {
1771 $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1772 } else {
1773 $perl_exec = "$ENV{'PERLPATH'}/perl";
1774 }
1775 } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1776 # containing the full path to the current perl executable we're using
1777 $perl_exec = $Config{perlpath}; # configured path for perl
1778 if (!-e $perl_exec) { # may not point to location on this machine
1779 $perl_exec = $^X; # may return just "perl"
1780 if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1781 print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";
1782 }
1783 }
1784 }
1785
1786 return $perl_exec;
1787}
1788
[25533]1789# returns the path to the java command in the JRE included with GS (if any),
1790# quoted to safeguard any spaces in this path, otherwise a simple java
1791# command is returned which assumes and will try for a system java.
[25512]1792sub get_java_command {
1793 my $java = "java";
1794 if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1795 # after running setup.bat or from GLI which also runs setup.bat
1796 my $java_bin = &util::filename_cat($ENV{'GSDLHOME'},"packages","jre","bin");
1797 if(-d $java_bin) {
1798 $java = &util::filename_cat($java_bin,"java");
[25533]1799 $java = "\"".$java."\""; # quoted to preserve spaces in path
[25512]1800 }
1801 }
1802 return $java;
1803}
[24362]1804
[25512]1805
[25577]1806# Given the qualified collection name (colgroup/collection),
1807# returns the collection and colgroup parts
1808sub get_collection_parts {
1809 # http://perldoc.perl.org/File/Basename.html
1810 # my($filename, $directories, $suffix) = fileparse($path);
1811 # "$directories contains everything up to and including the last directory separator in the $path
1812 # including the volume (if applicable). The remainder of the $path is the $filename."
1813 #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);
1814
1815 my $qualified_collection = shift(@_);
1816
1817 # Since activate.pl can be launched from the command-line, including by a user,
1818 # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1819 # Also allow for the accidental inclusion of multiple slashes
1820 my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1821
1822 if(!defined $collection) {
1823 $collection = $colgroup;
1824 $colgroup = "";
1825 }
1826 return ($collection, $colgroup);
1827}
1828
1829# work out the "collectdir/collection" location
1830sub resolve_collection_dir {
1831 my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1832
1833 my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);
1834
1835 if (defined $collect_dir) {
1836 return &util::filename_cat($collect_dir,$colgroup, $collection);
1837 }
[25796]1838 elsif (defined($ENV{'GSDLCOLLECTDIR'})) {
1839 return $ENV{'GSDLCOLLECTDIR'};
1840 }
[25577]1841 else {
1842 if (defined $site) {
1843 return &util::filename_cat($ENV{'GSDL3HOME'},"sites",$site,"collect",$colgroup, $collection);
1844 }
1845 else {
1846 return &util::filename_cat($ENV{'GSDLHOME'},"collect",$colgroup, $collection);
1847 }
1848 }
1849}
1850
[25994]1851# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1852# a directory containing sequentially numbered images.
1853sub create_itemfile
1854{
1855 my ($output_dir, $convert_basename, $convert_to) = @_;
1856 opendir(DIR, $output_dir) || die "can't opendir $output_dir: $!";
1857
1858 my $page_num = "";
1859 my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1860
1861 # Sort files in the directory by page_num
1862 sub page_number {
1863 my ($dir) = @_;
1864 my ($pagenum) =($dir =~ m/^.*[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
1865
1866 $pagenum = 1 unless defined $pagenum;
1867 return $pagenum;
1868 }
1869
1870 # sort the files in the directory in the order of page_num rather than lexically.
1871 @dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1872
1873 # work out if the numbering of the now sorted image files starts at 0 or not
1874 # by checking the number of the first _image_ file (skipping item files)
1875 my $starts_at_0 = 0;
1876 my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1877 if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1878 $starts_at_0 = 1;
1879 }
1880
1881 my $item_file = &util::filename_cat($output_dir, $convert_basename.".item");
1882 open(FILE,">$item_file");
1883 print FILE "<PagedDocument>\n";
1884
1885 foreach my $file (@dir_files){
1886 if ($file !~ /\.item/i){
1887 $page_num = page_number($file);
1888 $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1889 print FILE " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1890 }
1891 }
1892
1893 print FILE "</PagedDocument>\n";
[26017]1894 close FILE;
[25994]1895 closedir DIR;
1896 return $item_file;
1897}
1898
[4]18991;
Note: See TracBrowser for help on using the repository browser.