root/main/trunk/greenstone2/perllib/util.pm @ 26017

Revision 26017, 54.3 KB (checked in by ak19, 8 years ago)

Thanks to Kathy: File handle to item file generated by pagedimgplug needs to be closed, now that the code's moved to util. Previously a pl script called it from main and when main exited the file handle would have got closed automatically. Now that the code to create the item file is also invoked by the pdfbox ext (to extract images) and since that doesn't exit, the file handle needs to be closed. With this commit, PDFBox is able to extract images from PDFs when -pagedimg is turned on. But we found it doesn't convert PDFs that contain text into pages of images.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package util;
27
28use strict;
29
30use Encode;
31use File::Copy;
32use File::Basename;
33# Config for getting the perlpath in the recommended way, though it uses paths that are
34# hard-coded into the Config file that's generated upon configuring and compiling perl.
35# $^X works better in some cases to return the path to perl used to launch the script,
36# but if launched with plain "perl" (no full-path), that will be just what it returns.
37use Config;
38
39# removes files (but not directories)
40sub rm {
41    my (@files) = @_;
42
43    my @filefiles = ();
44
45    # make sure the files we want to delete exist
46    # and are regular files
47    foreach my $file (@files) {
48    if (!-e $file) {
49        print STDERR "util::rm $file does not exist\n";
50    } elsif ((!-f $file) && (!-l $file)) {
51        print STDERR "util::rm $file is not a regular (or symbolic) file\n";
52    } else {
53        push (@filefiles, $file);
54    }
55    }
56   
57    # remove the files
58    my $numremoved = unlink @filefiles;
59
60    # check to make sure all of them were removed
61    if ($numremoved != scalar(@filefiles)) {
62    print STDERR "util::rm Not all files were removed\n";
63    }
64}
65
66# removes files (but not directories) - can rename this to the default
67# "rm" subroutine  when debugging the deletion of individual files.
68sub rm_debug {
69    my (@files) = @_;
70    my @filefiles = ();
71
72    # make sure the files we want to delete exist
73    # and are regular files
74    foreach my $file (@files) {
75        if (!-e $file) {
76            print STDERR "util::rm $file does not exist\n";
77        } elsif ((!-f $file) && (!-l $file)) {
78            print STDERR "util::rm $file is not a regular (or symbolic) file\n";
79        } else { # debug message
80            unlink($file) or warn "Could not delete file $file: $!\n";
81        }
82    }
83}
84
85
86# recursive removal
87sub filtered_rm_r {
88    my ($files,$file_accept_re,$file_reject_re) = @_;
89
90#   my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(2);
91#   my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
92#   print STDERR "** Calling method (2): $lcfilename:$cline $cpackage->$csubr\n";
93   
94    my @files_array = (ref $files eq "ARRAY") ? @$files : ($files);
95
96    # recursively remove the files
97    foreach my $file (@files_array) {
98    $file =~ s/[\/\\]+$//; # remove trailing slashes
99   
100    if (!-e $file) {
101        print STDERR "util::filtered_rm_r $file does not exist\n";
102
103    } elsif ((-d $file) && (!-l $file)) { # don't recurse down symbolic link
104        # get the contents of this directory
105        if (!opendir (INDIR, $file)) {
106        print STDERR "util::filtered_rm_r could not open directory $file\n";
107        } else {
108        my @filedir = grep (!/^\.\.?$/, readdir (INDIR));
109        closedir (INDIR);
110               
111        # remove all the files in this directory
112        map {$_="$file/$_";} @filedir;
113        &filtered_rm_r (\@filedir,$file_accept_re,$file_reject_re);
114
115        if (!defined $file_accept_re && !defined $file_reject_re) {
116            # remove this directory
117            if (!rmdir $file) {
118            print STDERR "util::filtered_rm_r couldn't remove directory $file\n";
119            }
120        }
121        }
122    } else {
123        next if (defined $file_reject_re && ($file =~ m/$file_reject_re/));
124
125        if ((!defined $file_accept_re) || ($file =~ m/$file_accept_re/)) {
126        # remove this file 
127        &rm ($file);
128        }
129    }
130    }
131}
132
133
134# recursive removal
135sub rm_r {
136    my (@files) = @_;
137   
138    # use the more general (but reterospectively written function
139    # filtered_rm_r function()
140
141    filtered_rm_r(\@files,undef,undef); # no accept or reject expressions
142}
143
144
145
146
147# moves a file or a group of files
148sub mv {
149    my $dest = pop (@_);
150    my (@srcfiles) = @_;
151
152    # remove trailing slashes from source and destination files
153    $dest =~ s/[\\\/]+$//;
154    map {$_ =~ s/[\\\/]+$//;} @srcfiles;
155
156    # a few sanity checks
157    if (scalar (@srcfiles) == 0) {
158    print STDERR "util::mv no destination directory given\n";
159    return;
160    } elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) {
161    print STDERR "util::mv if multiple source files are given the ".
162        "destination must be a directory\n";
163    return;
164    }
165
166    # move the files
167    foreach my $file (@srcfiles) {
168    my $tempdest = $dest;
169    if (-d $tempdest) {
170        my ($filename) = $file =~ /([^\\\/]+)$/;
171        $tempdest .= "/$filename";
172    }
173    if (!-e $file) {
174        print STDERR "util::mv $file does not exist\n";
175    } else {
176        if(!rename ($file, $tempdest)) {
177        print STDERR "**** Failed to rename $file to $tempdest\n";
178        &File::Copy::copy($file, $tempdest);
179        &rm($file);
180        }
181        elsif(-e $file) { # rename (partially) succeeded) but srcfile still exists after rename
182        #print STDERR "*** srcfile $file still exists after rename to $tempdest\n";
183        if(!-e $tempdest) {
184            print STDERR "@@@@ ERROR: $tempdest does not exist\n";
185        }
186        # Sometimes the rename operation fails (as does File::Copy::move).
187        # This turns out to be because the files are hardlinked.
188        # Need to do a copy-delete in this case, however, the copy step is not necessary:
189        # the srcfile got renamed into tempdest, but srcfile itself still exists, delete it.
190        #&File::Copy::copy($file, $tempdest);
191
192        &rm($file);     
193        }
194    }
195    }
196}
197
198# Move the contents of source directory into target directory
199# (as opposed to merely replacing target dir with the src dir)
200# This can overwrite any files with duplicate names in the target
201# but other files and folders in the target will continue to exist
202sub mv_dir_contents {
203    my ($src_dir, $dest_dir) = @_;
204   
205    # Obtain listing of all files within src_dir
206    # Note that readdir lists relative paths, as well as . and ..
207    opendir(DIR, "$src_dir");
208    my @files= readdir(DIR);
209    close(DIR);
210   
211    my @full_path_files = ();
212    foreach my $file (@files) {
213        # process all except . and ..
214        unless($file eq "." || $file eq "..") {
215           
216            my $dest_subdir = &filename_cat($dest_dir, $file); # $file is still a relative path
217       
218            # construct absolute paths
219            $file = &filename_cat($src_dir, $file); # $file is now an absolute path
220           
221            # Recurse on directories which have an equivalent in target dest_dir
222            # If $file is a directory that already exists in target $dest_dir,
223            # then a simple move operation will fail (definitely on Windows).
224            if(-d $file && -d $dest_subdir) {
225                #print STDERR "**** $file is a directory also existing in target, its contents to be copied to $dest_subdir\n";
226                &mv_dir_contents($file, $dest_subdir);
227               
228                # now all content is moved across, delete empty dir in source folder
229                if(&is_dir_empty($file)) {
230                    if (!rmdir $file) {
231                        print STDERR "ERROR. util::mv_dir_contents couldn't remove directory $file\n";
232                    }
233                } else { # error
234                    print STDERR "ERROR. util::mv_dir_contents: subfolder $file still non-empty after moving contents to $dest_subdir\n";
235                }
236            } else { # process files and any directories that don't already exist with a simple move
237                push(@full_path_files, $file);
238            }           
239        }
240    }
241   
242    if(!&dir_exists($dest_dir)) { # create target toplevel folder or subfolders if they don't exist
243        &mk_dir($dest_dir);
244    }
245
246    #print STDERR "@@@@@ Copying files |".join(",", @full_path_files)."| to: $dest_dir\n";
247
248    if(@full_path_files) { # if non-empty, there's something to copy across
249        &mv(@full_path_files, $dest_dir);
250    }
251}
252
253
254# copies a file or a group of files
255sub cp {
256    my $dest = pop (@_);
257    my (@srcfiles) = @_;
258
259    # remove trailing slashes from source and destination files
260    $dest =~ s/[\\\/]+$//;
261    map {$_ =~ s/[\\\/]+$//;} @srcfiles;
262
263    # a few sanity checks
264    if (scalar (@srcfiles) == 0) {
265    print STDERR "util::cp no destination directory given\n";
266    return;
267    } elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) {
268    print STDERR "util::cp if multiple source files are given the ".
269        "destination must be a directory\n";
270    return;
271    }
272
273    # copy the files
274    foreach my $file (@srcfiles) {
275    my $tempdest = $dest;
276    if (-d $tempdest) {
277        my ($filename) = $file =~ /([^\\\/]+)$/;
278        $tempdest .= "/$filename";
279    }
280    if (!-e $file) {
281        print STDERR "util::cp $file does not exist\n";
282    } elsif (!-f $file) {
283        print STDERR "util::cp $file is not a plain file\n";
284    } else {
285        &File::Copy::copy ($file, $tempdest);
286    }
287    }
288}
289
290
291
292# recursively copies a file or group of files
293# syntax: cp_r (sourcefiles, destination directory)
294# destination must be a directory - to copy one file to
295# another use cp instead
296sub cp_r {
297    my $dest = pop (@_);
298    my (@srcfiles) = @_;
299
300    # a few sanity checks
301    if (scalar (@srcfiles) == 0) {
302    print STDERR "util::cp_r no destination directory given\n";
303    return;
304    } elsif (-f $dest) {
305    print STDERR "util::cp_r destination must be a directory\n";
306    return;
307    }
308   
309    # create destination directory if it doesn't exist already
310    if (! -d $dest) {
311    my $store_umask = umask(0002);
312    mkdir ($dest, 0777);
313    umask($store_umask);
314    }
315
316    # copy the files
317    foreach my $file (@srcfiles) {
318
319    if (!-e $file) {
320        print STDERR "util::cp_r $file does not exist\n";
321
322    } elsif (-d $file) {
323        # make the new directory
324        my ($filename) = $file =~ /([^\\\/]*)$/;
325        $dest = &util::filename_cat ($dest, $filename);
326        my $store_umask = umask(0002);
327        mkdir ($dest, 0777);
328        umask($store_umask);
329
330        # get the contents of this directory
331        if (!opendir (INDIR, $file)) {
332        print STDERR "util::cp_r could not open directory $file\n";
333        } else {
334        my @filedir = readdir (INDIR);
335        closedir (INDIR);
336        foreach my $f (@filedir) {
337            next if $f =~ /^\.\.?$/;
338            # copy all the files in this directory
339            my $ff = &util::filename_cat ($file, $f);
340            &cp_r ($ff, $dest);
341        }
342        }
343
344    } else {
345        &cp($file, $dest);
346    }
347    }
348}
349# recursively copies a file or group of files
350# syntax: cp_r (sourcefiles, destination directory)
351# destination must be a directory - to copy one file to
352# another use cp instead
353sub cp_r_nosvn {
354    my $dest = pop (@_);
355    my (@srcfiles) = @_;
356
357    # a few sanity checks
358    if (scalar (@srcfiles) == 0) {
359    print STDERR "util::cp_r no destination directory given\n";
360    return;
361    } elsif (-f $dest) {
362    print STDERR "util::cp_r destination must be a directory\n";
363    return;
364    }
365   
366    # create destination directory if it doesn't exist already
367    if (! -d $dest) {
368    my $store_umask = umask(0002);
369    mkdir ($dest, 0777);
370    umask($store_umask);
371    }
372
373    # copy the files
374    foreach my $file (@srcfiles) {
375
376    if (!-e $file) {
377        print STDERR "util::cp_r $file does not exist\n";
378
379    } elsif (-d $file) {
380        # make the new directory
381        my ($filename) = $file =~ /([^\\\/]*)$/;
382        $dest = &util::filename_cat ($dest, $filename);
383        my $store_umask = umask(0002);
384        mkdir ($dest, 0777);
385        umask($store_umask);
386
387        # get the contents of this directory
388        if (!opendir (INDIR, $file)) {
389        print STDERR "util::cp_r could not open directory $file\n";
390        } else {
391        my @filedir = readdir (INDIR);
392        closedir (INDIR);
393        foreach my $f (@filedir) {
394            next if $f =~ /^\.\.?$/;
395            next if $f =~ /^\.svn$/;
396            # copy all the files in this directory
397            my $ff = &util::filename_cat ($file, $f);
398            &cp_r ($ff, $dest);
399        }
400        }
401
402    } else {
403        &cp($file, $dest);
404    }
405    }
406}
407
408# copies a directory and its contents, excluding subdirectories, into a new directory
409sub cp_r_toplevel {
410    my $dest = pop (@_);
411    my (@srcfiles) = @_;
412
413    # a few sanity checks
414    if (scalar (@srcfiles) == 0) {
415    print STDERR "util::cp_r no destination directory given\n";
416    return;
417    } elsif (-f $dest) {
418    print STDERR "util::cp_r destination must be a directory\n";
419    return;
420    }
421   
422    # create destination directory if it doesn't exist already
423    if (! -d $dest) {
424    my $store_umask = umask(0002);
425    mkdir ($dest, 0777);
426    umask($store_umask);
427    }
428
429    # copy the files
430    foreach my $file (@srcfiles) {
431
432    if (!-e $file) {
433        print STDERR "util::cp_r $file does not exist\n";
434
435    } elsif (-d $file) {
436        # make the new directory
437        my ($filename) = $file =~ /([^\\\/]*)$/;
438        $dest = &util::filename_cat ($dest, $filename);
439        my $store_umask = umask(0002);
440        mkdir ($dest, 0777);
441        umask($store_umask);
442
443        # get the contents of this directory
444        if (!opendir (INDIR, $file)) {
445        print STDERR "util::cp_r could not open directory $file\n";
446        } else {
447        my @filedir = readdir (INDIR);
448        closedir (INDIR);
449        foreach my $f (@filedir) {
450            next if $f =~ /^\.\.?$/;
451           
452            # copy all the files in this directory, but not directories
453            my $ff = &util::filename_cat ($file, $f);
454            if (-f $ff) {
455            &cp($ff, $dest);
456            #&cp_r ($ff, $dest);
457            }
458        }
459        }
460
461    } else {
462        &cp($file, $dest);
463    }
464    }
465}
466
467sub mk_dir {
468    my ($dir) = @_;
469
470    my $store_umask = umask(0002);
471    my $mkdir_ok = mkdir ($dir, 0777);
472    umask($store_umask);
473   
474    if (!$mkdir_ok)
475    {
476    print STDERR "util::mk_dir could not create directory $dir\n";
477    return;
478    }
479}
480
481# in case anyone cares - I did some testing (using perls Benchmark module)
482# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
483# slightly faster (surprisingly) - Stefan.
484sub mk_all_dir {
485    my ($dir) = @_;
486
487    # use / for the directory separator, remove duplicate and
488    # trailing slashes
489    $dir=~s/[\\\/]+/\//g;
490    $dir=~s/[\\\/]+$//;
491
492    # make sure the cache directory exists
493    my $dirsofar = "";
494    my $first = 1;
495    foreach my $dirname (split ("/", $dir)) {
496    $dirsofar .= "/" unless $first;
497    $first = 0;
498
499    $dirsofar .= $dirname;
500
501    next if $dirname =~ /^(|[a-z]:)$/i;
502    if (!-e $dirsofar)
503        {
504        my $store_umask = umask(0002);
505        my $mkdir_ok = mkdir ($dirsofar, 0777);
506        umask($store_umask);
507        if (!$mkdir_ok)
508        {
509            print STDERR "util::mk_all_dir could not create directory $dirsofar\n";
510            return;
511        }
512        }
513    }
514}
515
516# make hard link to file if supported by OS, otherwise copy the file
517sub hard_link {
518    my ($src, $dest, $verbosity) = @_;
519
520    # remove trailing slashes from source and destination files
521    $src =~ s/[\\\/]+$//;
522    $dest =~ s/[\\\/]+$//;
523
524##    print STDERR "**** src = ", unicode::debug_unicode_string($src),"\n";
525    # a few sanity checks
526    if (-e $dest) {
527    # destination file already exists
528    return;
529    }
530    elsif (!-e $src) {
531    print STDERR "util::hard_link source file \"$src\" does not exist\n";
532    return 1;
533    }
534    elsif (-d $src) {
535    print STDERR "util::hard_link source \"$src\" is a directory\n";
536    return 1;
537    }
538
539    my $dest_dir = &File::Basename::dirname($dest);
540    mk_all_dir($dest_dir) if (!-e $dest_dir);
541
542
543    if (!link($src, $dest)) {
544    if ((!defined $verbosity) || ($verbosity>2)) {
545        print STDERR "util::hard_link: unable to create hard link. ";
546        print STDERR " Copying file: $src -> $dest\n";
547    }
548    &File::Copy::copy ($src, $dest);
549    }
550    return 0;
551}
552
553# make soft link to file if supported by OS, otherwise copy file
554sub soft_link {
555    my ($src, $dest, $ensure_paths_absolute) = @_;
556
557    # remove trailing slashes from source and destination files
558    $src =~ s/[\\\/]+$//;
559    $dest =~ s/[\\\/]+$//;
560
561    # Ensure file paths are absolute IF requested to do so
562    # Soft_linking didn't work for relative paths
563    if(defined $ensure_paths_absolute && $ensure_paths_absolute) {
564    # We need to ensure that the src file is the absolute path
565    # See http://perldoc.perl.org/File/Spec.html
566    if(!File::Spec->file_name_is_absolute( $src ))  { # it's relative
567        $src = File::Spec->rel2abs($src); # make absolute
568    }
569    # Might as well ensure that the destination file's absolute path is used
570    if(!File::Spec->file_name_is_absolute( $dest )) {
571        $dest = File::Spec->rel2abs($dest); # make absolute
572    }
573    }
574
575    # a few sanity checks
576    if (!-e $src) {
577    print STDERR "util::soft_link source file $src does not exist\n";
578    return 0;
579    }
580
581    my $dest_dir = &File::Basename::dirname($dest);
582    mk_all_dir($dest_dir) if (!-e $dest_dir);
583
584    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
585       
586    # symlink not supported on windows
587    &File::Copy::copy ($src, $dest);
588
589    } elsif (!eval {symlink($src, $dest)}) {
590    print STDERR "util::soft_link: unable to create soft link.\n";
591    return 0;
592    }
593
594    return 1;
595}
596
597# Primarily for filenames generated by processing
598# content of HTML files (which are mapped to UTF-8 internally)
599#
600# To turn this into an octet string that really exists on the file
601# system:
602# 1. don't need to do anything special for Unix-based systems
603#   (as underlying file system is byte-code)
604# 2. need to map to short DOS filenames for Windows
605
606sub utf8_to_real_filename
607{
608    my ($utf8_filename) = @_;
609
610    my $real_filename;
611
612    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
613    require Win32;
614
615    print STDERR "***** utf8 filename = $utf8_filename\n\n\n";
616
617    my $unicode_filename = decode("utf8",$utf8_filename);
618    $real_filename = Win32::GetShortPathName($unicode_filename);
619    }
620    else {
621    $real_filename = $utf8_filename;
622    }
623
624    return $real_filename;
625}
626
627
628sub fd_exists
629{
630    my $filename_full_path = shift @_;
631    my $test_op = shift @_ || "-e";
632
633    # By default tests for existance of file or directory (-e)
634    # Can be made more specific by providing second parameter (e.g. -f or -d)
635
636    my $exists = 0;
637
638    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
639    require Win32;
640    my $filename_short_path = Win32::GetShortPathName($filename_full_path);
641    if (!defined $filename_short_path) {
642        # Was probably still in UTF8 form (not what is needed on Windows)
643        my $unicode_filename_full_path = eval "decode(\"utf8\",\$filename_full_path)";
644        if (defined $unicode_filename_full_path) {
645        $filename_short_path = Win32::GetShortPathName($unicode_filename_full_path);
646        }
647    }
648    $filename_full_path = $filename_short_path;
649    }
650
651    if (defined $filename_full_path) {
652    $exists = eval "($test_op \$filename_full_path)";
653    }
654
655    return $exists;
656}
657
658sub file_exists
659{
660    my ($filename_full_path) = @_;
661
662    return fd_exists($filename_full_path,"-f");
663}
664
665sub dir_exists
666{
667    my ($filename_full_path) = @_;
668
669    return fd_exists($filename_full_path,"-d");
670}
671
672
673
674# updates a copy of a directory in some other part of the filesystem
675# verbosity settings are: 0=low, 1=normal, 2=high
676# both $fromdir and $todir should be absolute paths
677sub cachedir {
678    my ($fromdir, $todir, $verbosity) = @_;
679    $verbosity = 1 unless defined $verbosity;
680
681    # use / for the directory separator, remove duplicate and
682    # trailing slashes
683    $fromdir=~s/[\\\/]+/\//g;
684    $fromdir=~s/[\\\/]+$//;
685    $todir=~s/[\\\/]+/\//g;
686    $todir=~s/[\\\/]+$//;
687
688    &mk_all_dir ($todir);
689
690    # get the directories in ascending order
691    if (!opendir (FROMDIR, $fromdir)) {
692    print STDERR "util::cachedir could not read directory $fromdir\n";
693    return;
694    }
695    my @fromdir = grep (!/^\.\.?$/, sort(readdir (FROMDIR)));
696    closedir (FROMDIR);
697
698    if (!opendir (TODIR, $todir)) {
699    print STDERR "util::cacedir could not read directory $todir\n";
700    return;
701    }
702    my @todir = grep (!/^\.\.?$/, sort(readdir (TODIR)));
703    closedir (TODIR);
704
705    my $fromi = 0;
706    my $toi = 0;
707           
708    while ($fromi < scalar(@fromdir) || $toi < scalar(@todir)) {
709#   print "fromi: $fromi toi: $toi\n";
710
711    # see if we should delete a file/directory
712    # this should happen if the file/directory
713    # is not in the from list or if its a different
714    # size, or has an older timestamp
715    if ($toi < scalar(@todir)) {
716        if (($fromi >= scalar(@fromdir)) ||
717        ($todir[$toi] lt $fromdir[$fromi] ||
718         ($todir[$toi] eq $fromdir[$fromi] &&
719          &differentfiles("$fromdir/$fromdir[$fromi]","$todir/$todir[$toi]",
720                  $verbosity)))) {
721
722        # the files are different
723        &rm_r("$todir/$todir[$toi]");
724        splice(@todir, $toi, 1); # $toi stays the same
725
726        } elsif ($todir[$toi] eq $fromdir[$fromi]) {
727        # the files are the same
728        # if it is a directory, check its contents
729        if (-d "$todir/$todir[$toi]") {
730            &cachedir ("$fromdir/$fromdir[$fromi]",
731                   "$todir/$todir[$toi]", $verbosity);
732        }
733
734        $toi++;
735        $fromi++;
736        next;
737        }
738    }
739 
740    # see if we should insert a file/directory
741    # we should insert a file/directory if there
742    # is no tofiles left or if the tofile does not exist
743    if ($fromi < scalar(@fromdir) && ($toi >= scalar(@todir) ||
744                      $todir[$toi] gt $fromdir[$fromi])) {
745        &cp_r ("$fromdir/$fromdir[$fromi]", "$todir/$fromdir[$fromi]");
746        splice (@todir, $toi, 0, $fromdir[$fromi]);
747
748        $toi++;
749        $fromi++;
750    }
751    }
752}
753
754# this function returns -1 if either file is not found
755# assumes that $file1 and $file2 are absolute file names or
756# in the current directory
757# $file2 is allowed to be newer than $file1
758sub differentfiles {
759    my ($file1, $file2, $verbosity) = @_;
760    $verbosity = 1 unless defined $verbosity;
761
762    $file1 =~ s/\/+$//;
763    $file2 =~ s/\/+$//;
764   
765    my ($file1name) = $file1 =~ /\/([^\/]*)$/;
766    my ($file2name) = $file2 =~ /\/([^\/]*)$/;
767
768    return -1 unless (-e $file1 && -e $file2);
769    if ($file1name ne $file2name) {
770    print STDERR "filenames are not the same\n" if ($verbosity >= 2);
771    return 1;
772    }
773
774    my @file1stat = stat ($file1);
775    my @file2stat = stat ($file2);
776
777    if (-d $file1) {
778    if (! -d $file2) {
779        print STDERR "one file is a directory\n" if ($verbosity >= 2);
780        return 1;
781    }
782    return 0;
783    }
784
785    # both must be regular files
786    unless (-f $file1 && -f $file2) {
787    print STDERR "one file is not a regular file\n" if ($verbosity >= 2);
788    return 1;
789    }
790
791    # the size of the files must be the same
792    if ($file1stat[7] != $file2stat[7]) {
793    print STDERR "different sized files\n" if ($verbosity >= 2);
794    return 1;
795    }
796
797    # the second file cannot be older than the first
798    if ($file1stat[9] > $file2stat[9]) {
799    print STDERR "file is older\n" if ($verbosity >= 2);
800    return 1;
801    }
802
803    return 0;
804}
805
806
807sub get_tmp_filename
808{
809    my $file_ext = shift(@_) || undef;
810
811    my $opt_dot_file_ext = "";
812    if (defined $file_ext) {
813    if ($file_ext !~ m/\./) {
814        # no dot, so needs one added in at start
815        $opt_dot_file_ext = ".$file_ext"
816    }
817    else {
818        # allow for "extensions" such as _metadata.txt to be handled
819        # gracefully
820        $opt_dot_file_ext = $file_ext;
821    }
822    }
823
824    my $tmpdir = filename_cat($ENV{'GSDLHOME'}, "tmp");
825    &mk_all_dir ($tmpdir) unless -e $tmpdir;
826
827    my $count = 1000;
828    my $rand = int(rand $count);
829    my $full_tmp_filename = &filename_cat($tmpdir, "F$rand$opt_dot_file_ext");
830
831    while (-e $full_tmp_filename) {
832    $rand = int(rand $count);
833    $full_tmp_filename = &filename_cat($tmpdir, "F$rand$opt_dot_file_ext");
834    $count++;
835    }
836   
837    return $full_tmp_filename;
838}
839
840sub get_timestamped_tmp_folder
841{
842
843    my $tmp_dirname;
844    if(defined $ENV{'GSDLCOLLECTDIR'}) {
845    $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
846    } elsif(defined $ENV{'GSDLHOME'}) {
847    $tmp_dirname = $ENV{'GSDLHOME'};
848    } else {
849    return undef;
850    }
851
852    $tmp_dirname = &util::filename_cat($tmp_dirname, "tmp");
853    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
854
855    # add the timestamp into the path otherwise we can run into problems
856    # if documents have the same name
857    my $timestamp = time;
858    my $time_tmp_dirname = &util::filename_cat($tmp_dirname, $timestamp);
859    $tmp_dirname = $time_tmp_dirname;
860    my $i = 1;
861    while (-e $tmp_dirname) {
862    $tmp_dirname = "$time_tmp_dirname$i";
863    $i++;
864    }
865    &util::mk_dir($tmp_dirname);
866 
867    return $tmp_dirname;
868}
869
870sub get_timestamped_tmp_filename_in_collection
871{
872
873    my ($input_filename, $output_ext) = @_;
874    # derive tmp filename from input filename
875    my ($tailname, $dirname, $suffix)
876    = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
877
878    # softlink to collection tmp dir
879    my $tmp_dirname = &util::get_timestamped_tmp_folder();
880    $tmp_dirname = $dirname unless defined $tmp_dirname;
881
882    # following two steps copied from ConvertBinaryFile
883    # do we need them?? can't use them as is, as they use plugin methods.
884
885    #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
886
887    # URLEncode this since htmls with images where the html filename is utf8 don't seem
888    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
889    # files on the filesystem.
890    #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
891    if (defined $output_ext) {
892    $output_ext = ".$output_ext"; # add the dot
893    } else {
894    $output_ext = $suffix;
895    }
896    $output_ext= lc($output_ext);
897    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$output_ext");
898   
899    return $tmp_filename;
900}
901
902sub get_toplevel_tmp_dir
903{
904    return filename_cat($ENV{'GSDLHOME'}, "tmp");
905}
906
907
908sub filename_to_regex {
909    my $filename = shift (@_);
910
911    # need to make single backslashes double so that regex works
912    $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);   
913   
914    # note that the first part of a substitution is a regex, so RE chars need to be escaped,
915    # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
916    $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
917    $filename =~ s@\(@\\(@g; # escape brackets
918    $filename =~ s@\)@\\)@g; # escape brackets
919    $filename =~ s@\[@\\[@g; # escape brackets
920    $filename =~ s@\]@\\]@g; # escape brackets
921   
922    return $filename;
923}
924
925sub unregex_filename {
926    my $filename = shift (@_);
927
928    # need to put doubled backslashes for regex back to single
929    $filename =~ s/\\\./\./g; # remove RE syntax for .
930    $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
931    $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
932    $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
933    $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
934   
935    # \\ goes to \
936    # This is the last step in reverse mirroring the order of steps in filename_to_regex()
937    $filename =~ s/\\\\/\\/g; # remove RE syntax for \   
938    return $filename;
939}
940
941sub filename_cat {
942    my $first_file = shift(@_);
943    my (@filenames) = @_;
944
945#   Useful for debugging
946#     -- might make sense to call caller(0) rather than (1)??
947#   my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
948#   print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
949   
950    # If first_file is not null or empty, then add it back into the list
951    if (defined $first_file && $first_file =~ /\S/) {
952    unshift(@filenames, $first_file);
953    }
954
955    my $filename = join("/", @filenames);
956
957    # remove duplicate slashes and remove the last slash
958    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
959    $filename =~ s/[\\\/]+/\\/g;
960    } else {
961    $filename =~ s/[\/]+/\//g;
962    # DB: want a filename abc\de.html to remain like this
963    }
964    $filename =~ s/[\\\/]$//;
965
966    return $filename;
967}
968
969
970sub pathname_cat {
971    my $first_path = shift(@_);
972    my (@pathnames) = @_;
973
974    # If first_path is not null or empty, then add it back into the list
975    if (defined $first_path && $first_path =~ /\S/) {
976    unshift(@pathnames, $first_path);
977    }
978
979    my $join_char;
980    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
981    $join_char = ";";
982    } else {
983    $join_char = ":";
984    }
985
986    my $pathname = join($join_char, @pathnames);
987
988    # remove duplicate slashes
989    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
990    $pathname =~ s/[\\\/]+/\\/g;
991    } else {
992    $pathname =~ s/[\/]+/\//g;
993    # DB: want a pathname abc\de.html to remain like this
994    }
995
996    return $pathname;
997}
998
999
1000sub tidy_up_oid {
1001    my ($OID) = @_;
1002    if ($OID =~ /\./) {
1003    print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
1004    $OID =~ s/\.//g; #remove any periods
1005    }
1006    if ($OID =~ /^\s.*\s$/) {
1007    print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
1008    # remove starting and trailing whitespace
1009    $OID =~ s/^\s+//;
1010    $OID =~ s/\s+$//;
1011    }
1012    if ($OID =~ /^[\d]*$/) {
1013    print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
1014    $OID = "D" . $OID;
1015    }       
1016   
1017    return $OID;
1018}
1019sub envvar_prepend {
1020    my ($var,$val) = @_;
1021
1022    # do not prepend any value/path that's already in the environment variable
1023   
1024    my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
1025    if (!defined($ENV{$var})) {
1026    $ENV{$var} = "$val";
1027    }
1028    elsif($ENV{$var} !~ m/$escaped_val/) {
1029    $ENV{$var} = "$val;".$ENV{$var};
1030    }
1031}
1032
1033sub envvar_append {
1034    my ($var,$val) = @_;
1035   
1036    # do not append any value/path that's already in the environment variable
1037   
1038    my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
1039    if (!defined($ENV{$var})) {
1040    $ENV{$var} = "$val";
1041    }
1042    elsif($ENV{$var} !~ m/$escaped_val/) {
1043    $ENV{$var} .= ";$val";
1044    }
1045}
1046
1047
1048# splits a filename into a prefix and a tail extension using the tail_re, or
1049# if that fails, splits on the file_extension . (dot)
1050sub get_prefix_and_tail_by_regex {
1051
1052    my ($filename,$tail_re) = @_;
1053   
1054    my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
1055    if ((!defined $file_prefix) || (!defined $file_ext)) {
1056    ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
1057    }
1058
1059    return ($file_prefix,$file_ext);
1060}
1061
1062# get full path and file only path from a base_dir (which may be empty) and
1063# file (which may contain directories)
1064sub get_full_filenames {
1065    my ($base_dir, $file) = @_;
1066   
1067    my $filename_full_path = $file;
1068    # add on directory if present
1069    $filename_full_path = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\S/;
1070   
1071    my $filename_no_path = $file;
1072
1073    # remove directory if present
1074    $filename_no_path =~ s/^.*[\/\\]//;
1075    return ($filename_full_path, $filename_no_path);
1076}
1077
1078# returns the path of a file without the filename -- ie. the directory the file is in
1079sub filename_head {
1080    my $filename = shift(@_);
1081
1082    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1083    $filename =~ s/[^\\\\]*$//;
1084    }
1085    else {
1086    $filename =~ s/[^\\\/]*$//;
1087    }
1088
1089    return $filename;
1090}
1091
1092
1093
1094# returns 1 if filename1 and filename2 point to the same
1095# file or directory
1096sub filenames_equal {
1097    my ($filename1, $filename2) = @_;
1098
1099    # use filename_cat to clean up trailing slashes and
1100    # multiple slashes
1101    $filename1 = filename_cat ($filename1);
1102    $filename2 = filename_cat ($filename2);
1103
1104    # filenames not case sensitive on windows
1105    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1106    $filename1 =~ tr/[A-Z]/[a-z]/;
1107    $filename2 =~ tr/[A-Z]/[a-z]/;
1108    }
1109    return 1 if $filename1 eq $filename2;
1110    return 0;
1111}
1112
1113# If filename is relative to within_dir, returns the relative path of filename to that directory
1114# with slashes in the filename returned as they were in the original (absolute) filename.
1115sub filename_within_directory
1116{
1117    my ($filename,$within_dir) = @_;
1118   
1119    if ($within_dir !~ m/[\/\\]$/) {
1120    my $dirsep = &util::get_dirsep();
1121    $within_dir .= $dirsep;
1122    }
1123   
1124    $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets   
1125    if ($filename =~ m/^$within_dir(.*)$/) {
1126    $filename = $1;
1127    }
1128   
1129    return $filename;
1130}
1131
1132# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
1133# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
1134# The subpath returned will also be a URL type filename.
1135sub filename_within_directory_url_format
1136{
1137    my ($filename,$within_dir) = @_;
1138   
1139    # convert parameters only to / slashes if Windows
1140   
1141    my $filename_urlformat = &filepath_to_url_format($filename);
1142    my $within_dir_urlformat = &filepath_to_url_format($within_dir);
1143
1144    #if ($within_dir_urlformat !~ m/\/$/) {
1145        # make sure directory ends with a slash
1146        #$within_dir_urlformat .= "/";
1147    #}
1148   
1149    my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
1150   
1151    #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
1152   
1153    # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
1154    if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
1155        $filename_urlformat = $1;
1156    }
1157   
1158    return $filename_urlformat;
1159}
1160
1161# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
1162# since on Linux it doesn't represent a file separator but an escape char).
1163sub filepath_to_url_format
1164{
1165    my ($filepath) = @_;
1166    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1167        # Only need to worry about Windows, as Unix style directories already in url-format
1168        # Convert Windows style \ => /
1169        $filepath =~ s@\\@/@g;     
1170    }
1171    return $filepath;
1172}
1173
1174# regex filepaths on windows may include \\ as path separator. Convert \\ to /
1175sub filepath_regex_to_url_format
1176{
1177    my ($filepath) = @_;
1178    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1179    # Only need to worry about Windows, as Unix style directories already in url-format
1180    # Convert Windows style \\ => /
1181    $filepath =~ s@\\\\@/@g;       
1182    }
1183    return $filepath;
1184   
1185}
1186
1187# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
1188# and ignores trailing /
1189# returns (file, dirs) dirs will be empty if no subdirs
1190sub url_fileparse
1191{
1192    my ($filepath) = @_;
1193    # remove trailing /
1194    $filepath =~ s@/$@@;
1195    if ($filepath !~ m@/@) {
1196    return ($filepath, "");
1197    }
1198    my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
1199    return ($file, $dirs);
1200   
1201}
1202
1203
1204sub filename_within_collection
1205{
1206    my ($filename) = @_;
1207
1208    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
1209   
1210    if (defined $collect_dir) {
1211
1212    # if from within GSDLCOLLECTDIR, then remove directory prefix
1213    # so source_filename is realative to it.  This is done to aid
1214    # portability, i.e. the collection can be moved to somewhere
1215    # else on the file system and the archives directory will still
1216    # work.  This is needed, for example in the applet version of
1217    # GLI where GSDLHOME/collect on the server will be different to
1218    # the collect directory of the remove user.  Of course,
1219    # GSDLCOLLECTDIR subsequently needs to be put back on to turn
1220    # it back into a full pathname.
1221
1222    $filename = filename_within_directory($filename,$collect_dir);
1223    }
1224   
1225    return $filename;
1226}
1227
1228sub prettyprint_file
1229{
1230    my ($base_dir,$file,$gli) = @_;
1231
1232    my $filename_full_path = &util::filename_cat($base_dir,$file);
1233
1234    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1235    require Win32;
1236
1237    # For some reason base_dir in the form c:/a/b/c
1238    # This leads to confusion later on, so turn it back into
1239    # the more usual Windows form
1240    $base_dir =~ s/\//\\/g;
1241    my $long_base_dir = Win32::GetLongPathName($base_dir);
1242    my $long_full_path = Win32::GetLongPathName($filename_full_path);
1243
1244    $file = filename_within_directory($long_full_path,$long_base_dir);
1245    $file = encode("utf8",$file) if ($gli);
1246    }
1247
1248    return $file;
1249}
1250
1251
1252sub upgrade_if_dos_filename
1253{
1254    my ($filename_full_path,$and_encode) = @_;
1255
1256    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1257    # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
1258    # to its long (Windows) version
1259    my $long_filename = Win32::GetLongPathName($filename_full_path);
1260    if (defined $long_filename) {
1261        $filename_full_path = $long_filename;
1262    }
1263    # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
1264    $filename_full_path =~ s/^(.):/\u$1:/;
1265    if ((defined $and_encode) && ($and_encode)) {
1266        $filename_full_path = encode("utf8",$filename_full_path);
1267    }
1268    }
1269
1270    return $filename_full_path;
1271}
1272
1273
1274sub downgrade_if_dos_filename
1275{
1276    my ($filename_full_path) = @_;
1277
1278    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1279    require Win32;
1280
1281    # Ensure the given long Windows filename is in a form that can
1282    # be opened by Perl => convert it to a short DOS-like filename
1283
1284    my $short_filename = Win32::GetShortPathName($filename_full_path);
1285    if (defined $short_filename) {
1286        $filename_full_path = $short_filename;
1287    }
1288    # Make sure initial drive letter is lower-case (to fit in
1289    # with rest of Greenstone)
1290    $filename_full_path =~ s/^(.):/\u$1:/;
1291    }
1292
1293    return $filename_full_path;
1294}
1295
1296sub block_filename
1297{
1298    my ($block_hash,$filename) = @_;
1299   
1300    if ($ENV{'GSDLOS'} =~ m/^windows$/) {
1301   
1302    # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
1303    my $lower_filename = lc($filename);
1304    $block_hash->{'file_blocks'}->{$lower_filename} = 1;
1305#   my $lower_drive = $filename;
1306#   $lower_drive =~ s/^([A-Z]):/\l$1:/i;
1307   
1308#   my $upper_drive = $filename;
1309#   $upper_drive =~ s/^([A-Z]):/\u$1:/i;
1310#   
1311#   $block_hash->{'file_blocks'}->{$lower_drive} = 1;
1312#   $block_hash->{'file_blocks'}->{$upper_drive} = 1;       
1313    }
1314    else {
1315    $block_hash->{'file_blocks'}->{$filename} = 1;
1316    }
1317}
1318
1319
1320sub filename_is_absolute
1321{
1322    my ($filename) = @_;
1323
1324    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1325    return ($filename =~ m/^(\w:)?\\/);
1326    }
1327    else {
1328    return ($filename =~ m/^\//);
1329    }
1330}
1331
1332
1333## @method make_absolute()
1334#
1335#  Ensure the given file path is absolute in respect to the given base path.
1336#
1337#  @param  $base_dir A string denoting the base path the given dir must be
1338#                    absolute to.
1339#  @param  $dir The directory to be made absolute as a string. Note that the
1340#               dir may already be absolute, in which case it will remain
1341#               unchanged.
1342#  @return The now absolute form of the directory as a string.
1343#
1344#  @author John Thompson, DL Consulting Ltd.
1345#  @copy 2006 DL Consulting Ltd.
1346#
1347#used in buildcol.pl, doesn't work for all cases --kjdon
1348sub make_absolute {
1349   
1350    my ($base_dir, $dir) = @_;
1351###    print STDERR "dir = $dir\n";
1352    $dir =~ s/[\\\/]+/\//g;
1353    $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
1354    $dir =~ s|^/tmp_mnt||;
1355    1 while($dir =~ s|/[^/]*/\.\./|/|g);
1356    $dir =~ s|/[.][.]?/|/|g;
1357    $dir =~ tr|/|/|s;
1358###    print STDERR "dir = $dir\n";
1359   
1360    return $dir;
1361}
1362## make_absolute() ##
1363
1364sub get_dirsep {
1365
1366    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1367    return "\\";
1368    } else {
1369    return "\/";
1370    }
1371}
1372
1373sub get_os_dirsep {
1374
1375    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1376    return "\\\\";
1377    } else {
1378    return "\\\/";
1379    }
1380}
1381
1382sub get_re_dirsep {
1383
1384    return "\\\\|\\\/";
1385}
1386
1387
1388sub get_dirsep_tail {
1389    my ($filename) = @_;
1390   
1391    # returns last part of directory or filename
1392    # On unix e.g. a/b.d => b.d
1393    #              a/b/c => c
1394
1395    my $dirsep = get_re_dirsep();
1396    my @dirs = split (/$dirsep/, $filename);
1397    my $tail = pop @dirs;
1398
1399    # - caused problems under windows
1400    #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
1401
1402    return $tail;
1403}
1404
1405
1406# if this is running on windows we want binaries to end in
1407# .exe, otherwise they don't have to end in any extension
1408sub get_os_exe {
1409    return ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
1410    return "";
1411}
1412
1413
1414# test to see whether this is a big or little endian machine
1415sub is_little_endian
1416{
1417    # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
1418    # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
1419    # Otherwise, it's little endian
1420
1421    #return 0 if $^O =~ /^darwin$/i;
1422    #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
1423   
1424    # Going back to stating exactly whether the machine is little endian
1425    # or big endian, without any special case for Macs. Since for rata it comes
1426    # back with little endian and for shuttle with bigendian.
1427    return (ord(substr(pack("s",1), 0, 1)) == 1);
1428}
1429
1430
1431# will return the collection name if successful, "" otherwise
1432sub use_collection {
1433    my ($collection, $collectdir) = @_;
1434
1435    if (!defined $collectdir || $collectdir eq "") {
1436    $collectdir = &filename_cat ($ENV{'GSDLHOME'}, "collect");
1437    }
1438
1439    # get and check the collection
1440    if (!defined($collection) || $collection eq "") {
1441    if (defined $ENV{'GSDLCOLLECTION'}) {
1442        $collection = $ENV{'GSDLCOLLECTION'};
1443    } else {
1444        print STDOUT "No collection specified\n";
1445        return "";
1446    }
1447    }
1448   
1449    if ($collection eq "modelcol") {
1450    print STDOUT "You can't use modelcol.\n";
1451    return "";
1452    }
1453
1454    # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
1455    # are defined
1456    $ENV{'GSDLCOLLECTION'} = $collection;
1457    $ENV{'GSDLCOLLECTDIR'} = &filename_cat ($collectdir, $collection);
1458
1459    # make sure this collection exists
1460    if (!-e $ENV{'GSDLCOLLECTDIR'}) {
1461    print STDOUT "Invalid collection ($collection).\n";
1462    return "";
1463    }
1464
1465    # everything is ready to go
1466    return $collection;
1467}
1468
1469sub get_current_collection_name {
1470    return $ENV{'GSDLCOLLECTION'};
1471}
1472
1473
1474# will return the collection name if successful, "" otherwise. 
1475# Like use_collection (above) but for greenstone 3 (taking account of site level)
1476
1477sub use_site_collection {
1478    my ($site, $collection, $collectdir) = @_;
1479
1480    if (!defined $collectdir || $collectdir eq "") {
1481    die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
1482    $collectdir = &filename_cat ($ENV{'GSDL3HOME'}, "sites", $site, "collect");
1483    }
1484
1485    # collectdir explicitly set by this point (using $site variable if required).
1486    # Can call "old" gsdl2 use_collection now.
1487
1488    return use_collection($collection,$collectdir);
1489}
1490
1491
1492
1493sub locate_config_file
1494{
1495    my ($file) = @_;
1496
1497    my $locations = locate_config_files($file);
1498
1499    return shift @$locations; # returns undef if 'locations' is empty
1500}
1501
1502
1503sub locate_config_files
1504{
1505    my ($file) = @_;
1506
1507    my @locations = ();
1508
1509    if (-e $file) {
1510    # Clearly specified (most likely full filename)
1511    # No need to hunt in 'etc' directories, return value unchanged
1512    push(@locations,$file);
1513    }
1514    else {
1515    # Check for collection specific one before looking in global GSDL 'etc'
1516    if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
1517        my $test_collect_etc_filename
1518        = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $file);
1519       
1520        if (-e $test_collect_etc_filename) {
1521        push(@locations,$test_collect_etc_filename);
1522        }
1523    }
1524    my $test_main_etc_filename
1525        = &util::filename_cat($ENV{'GSDLHOME'},"etc", $file);
1526    if (-e $test_main_etc_filename) {
1527        push(@locations,$test_main_etc_filename);
1528    }
1529    }
1530
1531    return \@locations;
1532}
1533
1534
1535sub hyperlink_text
1536{
1537    my ($text) = @_;
1538   
1539    $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
1540    $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
1541
1542    return $text;
1543}
1544
1545
1546# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
1547# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
1548sub is_dir_empty
1549{
1550    my ($path) = @_;
1551    opendir DIR, $path;
1552    while(my $entry = readdir DIR) {
1553        next if($entry =~ /^\.\.?$/);
1554        closedir DIR;
1555        return 0;
1556    }
1557    closedir DIR;
1558    return 1;
1559}
1560
1561# Returns the given filename converted using either URL encoding or base64
1562# encoding, as specified by $rename_method. If the given filename has no suffix
1563# (if it is just the tailname), then $no_suffix should be some defined value.
1564# rename_method can be url, none, base64
1565sub rename_file {
1566    my ($filename, $rename_method, $no_suffix)  = @_;
1567
1568    if(!$filename) { # undefined or empty string
1569    return $filename;
1570    }
1571
1572    if (!$rename_method) {
1573    print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
1574    # Debugging information
1575    # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1576    # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
1577    $rename_method = "url";
1578    } elsif($rename_method eq "none") {
1579    return $filename; # would have already been renamed
1580    }
1581
1582    # No longer replace spaces with underscores, since underscores mess with incremental rebuild
1583    ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
1584    ###$filename =~ s/ /_/g;
1585
1586    my ($tailname,$dirname,$suffix);
1587    if($no_suffix) { # given a tailname, no suffix
1588    ($tailname,$dirname) = File::Basename::fileparse($filename);
1589    }
1590    else {
1591    ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
1592    }
1593    if (!$suffix) {
1594    $suffix = "";
1595    }
1596    else {
1597    $suffix = lc($suffix);
1598    }
1599
1600    if ($rename_method eq "url") {
1601    $tailname = &unicode::url_encode($tailname);
1602    }
1603    elsif ($rename_method eq "base64") {
1604    $tailname = &unicode::base64_encode($tailname);
1605    $tailname =~ s/\s*//sg;      # for some reason it adds spaces not just at end but also in middle
1606    }
1607
1608    $filename = "$tailname$suffix";
1609    $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
1610
1611    return $filename;
1612}
1613
1614
1615# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
1616sub rename_ldb_or_bdb_file {
1617    my ($filename_no_ext) = @_;
1618
1619    my $new_filename = "$filename_no_ext.gdb";
1620    return if (-f $new_filename); # if the file has the right extension, don't need to do anything
1621    # try ldb
1622    my $old_filename = "$filename_no_ext.ldb";
1623   
1624    if (-f $old_filename) {
1625    print STDERR "Renaming $old_filename to $new_filename\n";
1626    rename ($old_filename, $new_filename)
1627        || print STDERR "Rename failed: $!\n";
1628    return;
1629    }
1630    # try bdb
1631    $old_filename = "$filename_no_ext.bdb";
1632    if (-f $old_filename) {
1633    print STDERR "Renaming $old_filename to $new_filename\n";   
1634    rename ($old_filename, $new_filename)
1635        || print STDERR "Rename failed: $!\n";
1636    return;
1637    }
1638}
1639
1640sub os_dir() {
1641   
1642    my $gsdlarch = "";
1643    if(defined $ENV{'GSDLARCH'}) {
1644    $gsdlarch = $ENV{'GSDLARCH'};
1645    }
1646    return $ENV{'GSDLOS'}.$gsdlarch;
1647}
1648
1649# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1650# By default, /greenstone3 for GS3 or /greenstone for GS2.
1651sub get_greenstone_url_prefix() {
1652    # if already set on a previous occasion, just return that
1653    # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1654    return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1655
1656    my ($configfile, $urlprefix, $defaultUrlprefix);
1657    my @propertynames = ();
1658
1659    if($ENV{'GSDL3SRCHOME'}) {
1660    $defaultUrlprefix = "/greenstone3";
1661    $configfile = &util::filename_cat($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1662    push(@propertynames, qw/path\s*\=/);
1663    } else {
1664    $defaultUrlprefix = "/greenstone";
1665    $configfile = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1666    push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1667    }
1668
1669    $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1670
1671    if(!$urlprefix) { # no values found for URL prefix, use default values
1672    $urlprefix = $defaultUrlprefix;
1673    } else {
1674    #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1675    $urlprefix =~ s/^\///; # remove the starting slash
1676    my @dirs = split(/(\\|\/)/, $urlprefix);
1677    $urlprefix = shift(@dirs);
1678
1679    if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1680        $urlprefix = "/$urlprefix";
1681    }
1682    }
1683
1684    # set for the future
1685    $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1686#    print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1687    return $urlprefix;
1688}
1689
1690
1691# Given a config file (xml or java properties file) and a list/array of regular expressions
1692# that represent property names to match on, this function will return the value for the 1st
1693# matching property name. If the return value is undefined, no matching property was found.
1694sub extract_propvalue_from_file() {
1695    my ($configfile, $propertynames) = @_;
1696
1697    my $value;
1698    unless(open(FIN, "<$configfile")) {
1699    print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1700    return $value; # not initialised
1701    }
1702
1703    # Read the entire file at once, as one single line, then close it
1704    my $filecontents;
1705    {
1706    local $/ = undef;       
1707    $filecontents = <FIN>;
1708    }
1709    close(FIN);
1710
1711    foreach my $regex (@$propertynames) {
1712        ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1713    if($value) {
1714            $value =~ s/^\"//;     # remove any startquotes
1715        $value =~ s/\".*$//;   # remove the 1st endquotes (if any) followed by any xml
1716        last;              # found value for a matching property, break from loop
1717    }
1718    }
1719
1720    return $value;
1721}
1722
1723# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1724# given that perllib is in @INC in order to invoke this subroutine.
1725# Call as follows -- after setting up INC to include perllib and
1726# after setting up GSDLHOME and GSDLOS:
1727#
1728# require util;
1729# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1730#
1731sub setup_greenstone_env() {
1732    my ($GSDLHOME, $GSDLOS) = @_;
1733
1734    #my %env_map = ();
1735    # Get the localised ENV settings of running a localised source setup.bash
1736    # and put it into the ENV here. Need to clear GSDLHOME before running setup
1737    #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1738    my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";     
1739    if($GSDLOS =~ m/windows/i) {
1740        #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1741        $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1742    }
1743    if (!open(PIN, "$perl_command |")) {
1744        print STDERR ("Unable to execute command: $perl_command. $!\n");
1745    }
1746
1747    while (defined (my $perl_output_line = <PIN>)) {
1748        my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1749        #$env_map{$key}=$value;     
1750        $ENV{$key}=$value;
1751    }
1752    close (PIN);
1753
1754    # If any keys in $ENV don't occur in Greenstone's localised env
1755    # (stored in $env_map), delete those entries from $ENV
1756    #foreach $key (keys %ENV) {
1757    #   if(!defined $env_map{$key}) {
1758    #       print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{'$key'}\n";
1759    #       delete $ENV{$key}; # del $ENV(key, value) pair
1760    #   }
1761    #}
1762    #undef %env_map;
1763}
1764
1765sub get_perl_exec() {   
1766    my $perl_exec = $^X; # may return just "perl"
1767   
1768    if($ENV{'PERLPATH'}) {
1769        # OR: # $perl_exec = &util::filename_cat($ENV{'PERLPATH'},"perl");
1770        if($ENV{'GSDLOS'} =~ m/windows/) {
1771            $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1772        } else {
1773            $perl_exec = "$ENV{'PERLPATH'}/perl";
1774        }
1775    } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1776        # containing the full path to the current perl executable we're using
1777        $perl_exec = $Config{perlpath}; # configured path for perl
1778        if (!-e $perl_exec) { # may not point to location on this machine
1779            $perl_exec = $^X; # may return just "perl"
1780            if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1781                print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";             
1782            }
1783        }
1784    }
1785   
1786    return $perl_exec;
1787}
1788
1789# returns the path to the java command in the JRE included with GS (if any),
1790# quoted to safeguard any spaces in this path, otherwise a simple java
1791# command is returned which assumes and will try for a system java.
1792sub get_java_command {
1793    my $java = "java";
1794    if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1795                               # after running setup.bat or from GLI which also runs setup.bat
1796    my $java_bin = &util::filename_cat($ENV{'GSDLHOME'},"packages","jre","bin");
1797    if(-d $java_bin) {
1798        $java = &util::filename_cat($java_bin,"java");
1799        $java = "\"".$java."\""; # quoted to preserve spaces in path
1800    }
1801    }
1802    return $java;
1803}
1804
1805
1806# Given the qualified collection name (colgroup/collection),
1807# returns the collection and colgroup parts
1808sub get_collection_parts {
1809    # http://perldoc.perl.org/File/Basename.html
1810    # my($filename, $directories, $suffix) = fileparse($path);
1811    # "$directories contains everything up to and including the last directory separator in the $path
1812    # including the volume (if applicable). The remainder of the $path is the $filename."
1813    #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);   
1814
1815    my $qualified_collection = shift(@_);
1816
1817    # Since activate.pl can be launched from the command-line, including by a user,
1818    # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1819    # Also allow for the accidental inclusion of multiple slashes
1820    my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1821   
1822    if(!defined $collection) {
1823        $collection = $colgroup;
1824        $colgroup = "";
1825    }
1826    return ($collection, $colgroup);
1827}
1828
1829# work out the "collectdir/collection" location
1830sub resolve_collection_dir {
1831    my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1832   
1833    my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);   
1834   
1835    if (defined $collect_dir) {
1836        return &util::filename_cat($collect_dir,$colgroup, $collection);
1837    }
1838    elsif (defined($ENV{'GSDLCOLLECTDIR'})) {
1839        return $ENV{'GSDLCOLLECTDIR'};
1840    }
1841    else {
1842        if (defined $site) {
1843            return &util::filename_cat($ENV{'GSDL3HOME'},"sites",$site,"collect",$colgroup, $collection);
1844        }
1845        else {
1846            return &util::filename_cat($ENV{'GSDLHOME'},"collect",$colgroup, $collection);
1847        }
1848    }
1849}
1850
1851# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1852# a directory containing sequentially numbered images.
1853sub create_itemfile
1854{
1855    my ($output_dir, $convert_basename, $convert_to) = @_;
1856    opendir(DIR, $output_dir) || die "can't opendir $output_dir: $!"; 
1857
1858    my $page_num = "";
1859    my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1860
1861    # Sort files in the directory by page_num   
1862    sub page_number {
1863    my ($dir) = @_;
1864    my ($pagenum) =($dir =~ m/^.*[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
1865
1866    $pagenum = 1 unless defined $pagenum;
1867    return $pagenum;
1868    }
1869
1870    # sort the files in the directory in the order of page_num rather than lexically.
1871    @dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1872
1873    # work out if the numbering of the now sorted image files starts at 0 or not
1874    # by checking the number of the first _image_ file (skipping item files)
1875    my $starts_at_0 = 0;
1876    my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1877    if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1878    $starts_at_0 = 1;
1879    }
1880
1881    my $item_file = &util::filename_cat($output_dir, $convert_basename.".item");
1882    open(FILE,">$item_file");   
1883    print FILE "<PagedDocument>\n";
1884
1885    foreach my $file (@dir_files){
1886    if ($file !~ /\.item/i){
1887        $page_num = page_number($file);
1888        $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1889        print FILE "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1890    }
1891    }
1892
1893    print FILE "</PagedDocument>\n";
1894    close FILE;
1895    closedir DIR;
1896    return $item_file;
1897}
1898
18991;
Note: See TracBrowser for help on using the browser.