root/main/trunk/greenstone2/perllib/util.pm @ 25994

Revision 25994, 54.2 KB (checked in by ak19, 8 years ago)

Moving the updated create_itemfile() subroutine from pdfpstoimg.pl to util so that PDFBoxConverter can easily reuse this method too, as the PDFBoxConverter is currently being modified to convert a PDF to images when the -pagedimage_IMGTYPE flag is specified.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package util;
27
28use strict;
29
30use Encode;
31use File::Copy;
32use File::Basename;
33# Config for getting the perlpath in the recommended way, though it uses paths that are
34# hard-coded into the Config file that's generated upon configuring and compiling perl.
35# $^X works better in some cases to return the path to perl used to launch the script,
36# but if launched with plain "perl" (no full-path), that will be just what it returns.
37use Config;
38
39# removes files (but not directories)
40sub rm {
41    my (@files) = @_;
42
43    my @filefiles = ();
44
45    # make sure the files we want to delete exist
46    # and are regular files
47    foreach my $file (@files) {
48    if (!-e $file) {
49        print STDERR "util::rm $file does not exist\n";
50    } elsif ((!-f $file) && (!-l $file)) {
51        print STDERR "util::rm $file is not a regular (or symbolic) file\n";
52    } else {
53        push (@filefiles, $file);
54    }
55    }
56   
57    # remove the files
58    my $numremoved = unlink @filefiles;
59
60    # check to make sure all of them were removed
61    if ($numremoved != scalar(@filefiles)) {
62    print STDERR "util::rm Not all files were removed\n";
63    }
64}
65
66# removes files (but not directories) - can rename this to the default
67# "rm" subroutine  when debugging the deletion of individual files.
68sub rm_debug {
69    my (@files) = @_;
70    my @filefiles = ();
71
72    # make sure the files we want to delete exist
73    # and are regular files
74    foreach my $file (@files) {
75        if (!-e $file) {
76            print STDERR "util::rm $file does not exist\n";
77        } elsif ((!-f $file) && (!-l $file)) {
78            print STDERR "util::rm $file is not a regular (or symbolic) file\n";
79        } else { # debug message
80            unlink($file) or warn "Could not delete file $file: $!\n";
81        }
82    }
83}
84
85
86# recursive removal
87sub filtered_rm_r {
88    my ($files,$file_accept_re,$file_reject_re) = @_;
89
90#   my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(2);
91#   my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
92#   print STDERR "** Calling method (2): $lcfilename:$cline $cpackage->$csubr\n";
93   
94    my @files_array = (ref $files eq "ARRAY") ? @$files : ($files);
95
96    # recursively remove the files
97    foreach my $file (@files_array) {
98    $file =~ s/[\/\\]+$//; # remove trailing slashes
99   
100    if (!-e $file) {
101        print STDERR "util::filtered_rm_r $file does not exist\n";
102
103    } elsif ((-d $file) && (!-l $file)) { # don't recurse down symbolic link
104        # get the contents of this directory
105        if (!opendir (INDIR, $file)) {
106        print STDERR "util::filtered_rm_r could not open directory $file\n";
107        } else {
108        my @filedir = grep (!/^\.\.?$/, readdir (INDIR));
109        closedir (INDIR);
110               
111        # remove all the files in this directory
112        map {$_="$file/$_";} @filedir;
113        &filtered_rm_r (\@filedir,$file_accept_re,$file_reject_re);
114
115        if (!defined $file_accept_re && !defined $file_reject_re) {
116            # remove this directory
117            if (!rmdir $file) {
118            print STDERR "util::filtered_rm_r couldn't remove directory $file\n";
119            }
120        }
121        }
122    } else {
123        next if (defined $file_reject_re && ($file =~ m/$file_reject_re/));
124
125        if ((!defined $file_accept_re) || ($file =~ m/$file_accept_re/)) {
126        # remove this file 
127        &rm ($file);
128        }
129    }
130    }
131}
132
133
134# recursive removal
135sub rm_r {
136    my (@files) = @_;
137   
138    # use the more general (but reterospectively written function
139    # filtered_rm_r function()
140
141    filtered_rm_r(\@files,undef,undef); # no accept or reject expressions
142}
143
144
145
146
147# moves a file or a group of files
148sub mv {
149    my $dest = pop (@_);
150    my (@srcfiles) = @_;
151
152    # remove trailing slashes from source and destination files
153    $dest =~ s/[\\\/]+$//;
154    map {$_ =~ s/[\\\/]+$//;} @srcfiles;
155
156    # a few sanity checks
157    if (scalar (@srcfiles) == 0) {
158    print STDERR "util::mv no destination directory given\n";
159    return;
160    } elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) {
161    print STDERR "util::mv if multiple source files are given the ".
162        "destination must be a directory\n";
163    return;
164    }
165
166    # move the files
167    foreach my $file (@srcfiles) {
168    my $tempdest = $dest;
169    if (-d $tempdest) {
170        my ($filename) = $file =~ /([^\\\/]+)$/;
171        $tempdest .= "/$filename";
172    }
173    if (!-e $file) {
174        print STDERR "util::mv $file does not exist\n";
175    } else {
176        if(!rename ($file, $tempdest)) {
177        print STDERR "**** Failed to rename $file to $tempdest\n";
178        &File::Copy::copy($file, $tempdest);
179        &rm($file);
180        }
181        elsif(-e $file) { # rename (partially) succeeded) but srcfile still exists after rename
182        #print STDERR "*** srcfile $file still exists after rename to $tempdest\n";
183        if(!-e $tempdest) {
184            print STDERR "@@@@ ERROR: $tempdest does not exist\n";
185        }
186        # Sometimes the rename operation fails (as does File::Copy::move).
187        # This turns out to be because the files are hardlinked.
188        # Need to do a copy-delete in this case, however, the copy step is not necessary:
189        # the srcfile got renamed into tempdest, but srcfile itself still exists, delete it.
190        #&File::Copy::copy($file, $tempdest);
191
192        &rm($file);     
193        }
194    }
195    }
196}
197
198# Move the contents of source directory into target directory
199# (as opposed to merely replacing target dir with the src dir)
200# This can overwrite any files with duplicate names in the target
201# but other files and folders in the target will continue to exist
202sub mv_dir_contents {
203    my ($src_dir, $dest_dir) = @_;
204   
205    # Obtain listing of all files within src_dir
206    # Note that readdir lists relative paths, as well as . and ..
207    opendir(DIR, "$src_dir");
208    my @files= readdir(DIR);
209    close(DIR);
210   
211    my @full_path_files = ();
212    foreach my $file (@files) {
213        # process all except . and ..
214        unless($file eq "." || $file eq "..") {
215           
216            my $dest_subdir = &filename_cat($dest_dir, $file); # $file is still a relative path
217       
218            # construct absolute paths
219            $file = &filename_cat($src_dir, $file); # $file is now an absolute path
220           
221            # Recurse on directories which have an equivalent in target dest_dir
222            # If $file is a directory that already exists in target $dest_dir,
223            # then a simple move operation will fail (definitely on Windows).
224            if(-d $file && -d $dest_subdir) {
225                #print STDERR "**** $file is a directory also existing in target, its contents to be copied to $dest_subdir\n";
226                &mv_dir_contents($file, $dest_subdir);
227               
228                # now all content is moved across, delete empty dir in source folder
229                if(&is_dir_empty($file)) {
230                    if (!rmdir $file) {
231                        print STDERR "ERROR. util::mv_dir_contents couldn't remove directory $file\n";
232                    }
233                } else { # error
234                    print STDERR "ERROR. util::mv_dir_contents: subfolder $file still non-empty after moving contents to $dest_subdir\n";
235                }
236            } else { # process files and any directories that don't already exist with a simple move
237                push(@full_path_files, $file);
238            }           
239        }
240    }
241   
242    if(!&dir_exists($dest_dir)) { # create target toplevel folder or subfolders if they don't exist
243        &mk_dir($dest_dir);
244    }
245
246    #print STDERR "@@@@@ Copying files |".join(",", @full_path_files)."| to: $dest_dir\n";
247
248    if(@full_path_files) { # if non-empty, there's something to copy across
249        &mv(@full_path_files, $dest_dir);
250    }
251}
252
253
254# copies a file or a group of files
255sub cp {
256    my $dest = pop (@_);
257    my (@srcfiles) = @_;
258
259    # remove trailing slashes from source and destination files
260    $dest =~ s/[\\\/]+$//;
261    map {$_ =~ s/[\\\/]+$//;} @srcfiles;
262
263    # a few sanity checks
264    if (scalar (@srcfiles) == 0) {
265    print STDERR "util::cp no destination directory given\n";
266    return;
267    } elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) {
268    print STDERR "util::cp if multiple source files are given the ".
269        "destination must be a directory\n";
270    return;
271    }
272
273    # copy the files
274    foreach my $file (@srcfiles) {
275    my $tempdest = $dest;
276    if (-d $tempdest) {
277        my ($filename) = $file =~ /([^\\\/]+)$/;
278        $tempdest .= "/$filename";
279    }
280    if (!-e $file) {
281        print STDERR "util::cp $file does not exist\n";
282    } elsif (!-f $file) {
283        print STDERR "util::cp $file is not a plain file\n";
284    } else {
285        &File::Copy::copy ($file, $tempdest);
286    }
287    }
288}
289
290
291
292# recursively copies a file or group of files
293# syntax: cp_r (sourcefiles, destination directory)
294# destination must be a directory - to copy one file to
295# another use cp instead
296sub cp_r {
297    my $dest = pop (@_);
298    my (@srcfiles) = @_;
299
300    # a few sanity checks
301    if (scalar (@srcfiles) == 0) {
302    print STDERR "util::cp_r no destination directory given\n";
303    return;
304    } elsif (-f $dest) {
305    print STDERR "util::cp_r destination must be a directory\n";
306    return;
307    }
308   
309    # create destination directory if it doesn't exist already
310    if (! -d $dest) {
311    my $store_umask = umask(0002);
312    mkdir ($dest, 0777);
313    umask($store_umask);
314    }
315
316    # copy the files
317    foreach my $file (@srcfiles) {
318
319    if (!-e $file) {
320        print STDERR "util::cp_r $file does not exist\n";
321
322    } elsif (-d $file) {
323        # make the new directory
324        my ($filename) = $file =~ /([^\\\/]*)$/;
325        $dest = &util::filename_cat ($dest, $filename);
326        my $store_umask = umask(0002);
327        mkdir ($dest, 0777);
328        umask($store_umask);
329
330        # get the contents of this directory
331        if (!opendir (INDIR, $file)) {
332        print STDERR "util::cp_r could not open directory $file\n";
333        } else {
334        my @filedir = readdir (INDIR);
335        closedir (INDIR);
336        foreach my $f (@filedir) {
337            next if $f =~ /^\.\.?$/;
338            # copy all the files in this directory
339            my $ff = &util::filename_cat ($file, $f);
340            &cp_r ($ff, $dest);
341        }
342        }
343
344    } else {
345        &cp($file, $dest);
346    }
347    }
348}
349# recursively copies a file or group of files
350# syntax: cp_r (sourcefiles, destination directory)
351# destination must be a directory - to copy one file to
352# another use cp instead
353sub cp_r_nosvn {
354    my $dest = pop (@_);
355    my (@srcfiles) = @_;
356
357    # a few sanity checks
358    if (scalar (@srcfiles) == 0) {
359    print STDERR "util::cp_r no destination directory given\n";
360    return;
361    } elsif (-f $dest) {
362    print STDERR "util::cp_r destination must be a directory\n";
363    return;
364    }
365   
366    # create destination directory if it doesn't exist already
367    if (! -d $dest) {
368    my $store_umask = umask(0002);
369    mkdir ($dest, 0777);
370    umask($store_umask);
371    }
372
373    # copy the files
374    foreach my $file (@srcfiles) {
375
376    if (!-e $file) {
377        print STDERR "util::cp_r $file does not exist\n";
378
379    } elsif (-d $file) {
380        # make the new directory
381        my ($filename) = $file =~ /([^\\\/]*)$/;
382        $dest = &util::filename_cat ($dest, $filename);
383        my $store_umask = umask(0002);
384        mkdir ($dest, 0777);
385        umask($store_umask);
386
387        # get the contents of this directory
388        if (!opendir (INDIR, $file)) {
389        print STDERR "util::cp_r could not open directory $file\n";
390        } else {
391        my @filedir = readdir (INDIR);
392        closedir (INDIR);
393        foreach my $f (@filedir) {
394            next if $f =~ /^\.\.?$/;
395            next if $f =~ /^\.svn$/;
396            # copy all the files in this directory
397            my $ff = &util::filename_cat ($file, $f);
398            &cp_r ($ff, $dest);
399        }
400        }
401
402    } else {
403        &cp($file, $dest);
404    }
405    }
406}
407
408# copies a directory and its contents, excluding subdirectories, into a new directory
409sub cp_r_toplevel {
410    my $dest = pop (@_);
411    my (@srcfiles) = @_;
412
413    # a few sanity checks
414    if (scalar (@srcfiles) == 0) {
415    print STDERR "util::cp_r no destination directory given\n";
416    return;
417    } elsif (-f $dest) {
418    print STDERR "util::cp_r destination must be a directory\n";
419    return;
420    }
421   
422    # create destination directory if it doesn't exist already
423    if (! -d $dest) {
424    my $store_umask = umask(0002);
425    mkdir ($dest, 0777);
426    umask($store_umask);
427    }
428
429    # copy the files
430    foreach my $file (@srcfiles) {
431
432    if (!-e $file) {
433        print STDERR "util::cp_r $file does not exist\n";
434
435    } elsif (-d $file) {
436        # make the new directory
437        my ($filename) = $file =~ /([^\\\/]*)$/;
438        $dest = &util::filename_cat ($dest, $filename);
439        my $store_umask = umask(0002);
440        mkdir ($dest, 0777);
441        umask($store_umask);
442
443        # get the contents of this directory
444        if (!opendir (INDIR, $file)) {
445        print STDERR "util::cp_r could not open directory $file\n";
446        } else {
447        my @filedir = readdir (INDIR);
448        closedir (INDIR);
449        foreach my $f (@filedir) {
450            next if $f =~ /^\.\.?$/;
451           
452            # copy all the files in this directory, but not directories
453            my $ff = &util::filename_cat ($file, $f);
454            if (-f $ff) {
455            &cp($ff, $dest);
456            #&cp_r ($ff, $dest);
457            }
458        }
459        }
460
461    } else {
462        &cp($file, $dest);
463    }
464    }
465}
466
467sub mk_dir {
468    my ($dir) = @_;
469
470    my $store_umask = umask(0002);
471    my $mkdir_ok = mkdir ($dir, 0777);
472    umask($store_umask);
473   
474    if (!$mkdir_ok)
475    {
476    print STDERR "util::mk_dir could not create directory $dir\n";
477    return;
478    }
479}
480
481# in case anyone cares - I did some testing (using perls Benchmark module)
482# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
483# slightly faster (surprisingly) - Stefan.
484sub mk_all_dir {
485    my ($dir) = @_;
486
487    # use / for the directory separator, remove duplicate and
488    # trailing slashes
489    $dir=~s/[\\\/]+/\//g;
490    $dir=~s/[\\\/]+$//;
491
492    # make sure the cache directory exists
493    my $dirsofar = "";
494    my $first = 1;
495    foreach my $dirname (split ("/", $dir)) {
496    $dirsofar .= "/" unless $first;
497    $first = 0;
498
499    $dirsofar .= $dirname;
500
501    next if $dirname =~ /^(|[a-z]:)$/i;
502    if (!-e $dirsofar)
503        {
504        my $store_umask = umask(0002);
505        my $mkdir_ok = mkdir ($dirsofar, 0777);
506        umask($store_umask);
507        if (!$mkdir_ok)
508        {
509            print STDERR "util::mk_all_dir could not create directory $dirsofar\n";
510            return;
511        }
512        }
513    }
514}
515
516# make hard link to file if supported by OS, otherwise copy the file
517sub hard_link {
518    my ($src, $dest, $verbosity) = @_;
519
520    # remove trailing slashes from source and destination files
521    $src =~ s/[\\\/]+$//;
522    $dest =~ s/[\\\/]+$//;
523
524##    print STDERR "**** src = ", unicode::debug_unicode_string($src),"\n";
525    # a few sanity checks
526    if (-e $dest) {
527    # destination file already exists
528    return;
529    }
530    elsif (!-e $src) {
531    print STDERR "util::hard_link source file \"$src\" does not exist\n";
532    return 1;
533    }
534    elsif (-d $src) {
535    print STDERR "util::hard_link source \"$src\" is a directory\n";
536    return 1;
537    }
538
539    my $dest_dir = &File::Basename::dirname($dest);
540    mk_all_dir($dest_dir) if (!-e $dest_dir);
541
542
543    if (!link($src, $dest)) {
544    if ((!defined $verbosity) || ($verbosity>2)) {
545        print STDERR "util::hard_link: unable to create hard link. ";
546        print STDERR " Copying file: $src -> $dest\n";
547    }
548    &File::Copy::copy ($src, $dest);
549    }
550    return 0;
551}
552
553# make soft link to file if supported by OS, otherwise copy file
554sub soft_link {
555    my ($src, $dest, $ensure_paths_absolute) = @_;
556
557    # remove trailing slashes from source and destination files
558    $src =~ s/[\\\/]+$//;
559    $dest =~ s/[\\\/]+$//;
560
561    # Ensure file paths are absolute IF requested to do so
562    # Soft_linking didn't work for relative paths
563    if(defined $ensure_paths_absolute && $ensure_paths_absolute) {
564    # We need to ensure that the src file is the absolute path
565    # See http://perldoc.perl.org/File/Spec.html
566    if(!File::Spec->file_name_is_absolute( $src ))  { # it's relative
567        $src = File::Spec->rel2abs($src); # make absolute
568    }
569    # Might as well ensure that the destination file's absolute path is used
570    if(!File::Spec->file_name_is_absolute( $dest )) {
571        $dest = File::Spec->rel2abs($dest); # make absolute
572    }
573    }
574
575    # a few sanity checks
576    if (!-e $src) {
577    print STDERR "util::soft_link source file $src does not exist\n";
578    return 0;
579    }
580
581    my $dest_dir = &File::Basename::dirname($dest);
582    mk_all_dir($dest_dir) if (!-e $dest_dir);
583
584    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
585       
586    # symlink not supported on windows
587    &File::Copy::copy ($src, $dest);
588
589    } elsif (!eval {symlink($src, $dest)}) {
590    print STDERR "util::soft_link: unable to create soft link.\n";
591    return 0;
592    }
593
594    return 1;
595}
596
597# Primarily for filenames generated by processing
598# content of HTML files (which are mapped to UTF-8 internally)
599#
600# To turn this into an octet string that really exists on the file
601# system:
602# 1. don't need to do anything special for Unix-based systems
603#   (as underlying file system is byte-code)
604# 2. need to map to short DOS filenames for Windows
605
606sub utf8_to_real_filename
607{
608    my ($utf8_filename) = @_;
609
610    my $real_filename;
611
612    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
613    require Win32;
614
615    print STDERR "***** utf8 filename = $utf8_filename\n\n\n";
616
617    my $unicode_filename = decode("utf8",$utf8_filename);
618    $real_filename = Win32::GetShortPathName($unicode_filename);
619    }
620    else {
621    $real_filename = $utf8_filename;
622    }
623
624    return $real_filename;
625}
626
627
628sub fd_exists
629{
630    my $filename_full_path = shift @_;
631    my $test_op = shift @_ || "-e";
632
633    # By default tests for existance of file or directory (-e)
634    # Can be made more specific by providing second parameter (e.g. -f or -d)
635
636    my $exists = 0;
637
638    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
639    require Win32;
640    my $filename_short_path = Win32::GetShortPathName($filename_full_path);
641    if (!defined $filename_short_path) {
642        # Was probably still in UTF8 form (not what is needed on Windows)
643        my $unicode_filename_full_path = eval "decode(\"utf8\",\$filename_full_path)";
644        if (defined $unicode_filename_full_path) {
645        $filename_short_path = Win32::GetShortPathName($unicode_filename_full_path);
646        }
647    }
648    $filename_full_path = $filename_short_path;
649    }
650
651    if (defined $filename_full_path) {
652    $exists = eval "($test_op \$filename_full_path)";
653    }
654
655    return $exists;
656}
657
658sub file_exists
659{
660    my ($filename_full_path) = @_;
661
662    return fd_exists($filename_full_path,"-f");
663}
664
665sub dir_exists
666{
667    my ($filename_full_path) = @_;
668
669    return fd_exists($filename_full_path,"-d");
670}
671
672
673
674# updates a copy of a directory in some other part of the filesystem
675# verbosity settings are: 0=low, 1=normal, 2=high
676# both $fromdir and $todir should be absolute paths
677sub cachedir {
678    my ($fromdir, $todir, $verbosity) = @_;
679    $verbosity = 1 unless defined $verbosity;
680
681    # use / for the directory separator, remove duplicate and
682    # trailing slashes
683    $fromdir=~s/[\\\/]+/\//g;
684    $fromdir=~s/[\\\/]+$//;
685    $todir=~s/[\\\/]+/\//g;
686    $todir=~s/[\\\/]+$//;
687
688    &mk_all_dir ($todir);
689
690    # get the directories in ascending order
691    if (!opendir (FROMDIR, $fromdir)) {
692    print STDERR "util::cachedir could not read directory $fromdir\n";
693    return;
694    }
695    my @fromdir = grep (!/^\.\.?$/, sort(readdir (FROMDIR)));
696    closedir (FROMDIR);
697
698    if (!opendir (TODIR, $todir)) {
699    print STDERR "util::cacedir could not read directory $todir\n";
700    return;
701    }
702    my @todir = grep (!/^\.\.?$/, sort(readdir (TODIR)));
703    closedir (TODIR);
704
705    my $fromi = 0;
706    my $toi = 0;
707           
708    while ($fromi < scalar(@fromdir) || $toi < scalar(@todir)) {
709#   print "fromi: $fromi toi: $toi\n";
710
711    # see if we should delete a file/directory
712    # this should happen if the file/directory
713    # is not in the from list or if its a different
714    # size, or has an older timestamp
715    if ($toi < scalar(@todir)) {
716        if (($fromi >= scalar(@fromdir)) ||
717        ($todir[$toi] lt $fromdir[$fromi] ||
718         ($todir[$toi] eq $fromdir[$fromi] &&
719          &differentfiles("$fromdir/$fromdir[$fromi]","$todir/$todir[$toi]",
720                  $verbosity)))) {
721
722        # the files are different
723        &rm_r("$todir/$todir[$toi]");
724        splice(@todir, $toi, 1); # $toi stays the same
725
726        } elsif ($todir[$toi] eq $fromdir[$fromi]) {
727        # the files are the same
728        # if it is a directory, check its contents
729        if (-d "$todir/$todir[$toi]") {
730            &cachedir ("$fromdir/$fromdir[$fromi]",
731                   "$todir/$todir[$toi]", $verbosity);
732        }
733
734        $toi++;
735        $fromi++;
736        next;
737        }
738    }
739 
740    # see if we should insert a file/directory
741    # we should insert a file/directory if there
742    # is no tofiles left or if the tofile does not exist
743    if ($fromi < scalar(@fromdir) && ($toi >= scalar(@todir) ||
744                      $todir[$toi] gt $fromdir[$fromi])) {
745        &cp_r ("$fromdir/$fromdir[$fromi]", "$todir/$fromdir[$fromi]");
746        splice (@todir, $toi, 0, $fromdir[$fromi]);
747
748        $toi++;
749        $fromi++;
750    }
751    }
752}
753
754# this function returns -1 if either file is not found
755# assumes that $file1 and $file2 are absolute file names or
756# in the current directory
757# $file2 is allowed to be newer than $file1
758sub differentfiles {
759    my ($file1, $file2, $verbosity) = @_;
760    $verbosity = 1 unless defined $verbosity;
761
762    $file1 =~ s/\/+$//;
763    $file2 =~ s/\/+$//;
764   
765    my ($file1name) = $file1 =~ /\/([^\/]*)$/;
766    my ($file2name) = $file2 =~ /\/([^\/]*)$/;
767
768    return -1 unless (-e $file1 && -e $file2);
769    if ($file1name ne $file2name) {
770    print STDERR "filenames are not the same\n" if ($verbosity >= 2);
771    return 1;
772    }
773
774    my @file1stat = stat ($file1);
775    my @file2stat = stat ($file2);
776
777    if (-d $file1) {
778    if (! -d $file2) {
779        print STDERR "one file is a directory\n" if ($verbosity >= 2);
780        return 1;
781    }
782    return 0;
783    }
784
785    # both must be regular files
786    unless (-f $file1 && -f $file2) {
787    print STDERR "one file is not a regular file\n" if ($verbosity >= 2);
788    return 1;
789    }
790
791    # the size of the files must be the same
792    if ($file1stat[7] != $file2stat[7]) {
793    print STDERR "different sized files\n" if ($verbosity >= 2);
794    return 1;
795    }
796
797    # the second file cannot be older than the first
798    if ($file1stat[9] > $file2stat[9]) {
799    print STDERR "file is older\n" if ($verbosity >= 2);
800    return 1;
801    }
802
803    return 0;
804}
805
806
807sub get_tmp_filename
808{
809    my $file_ext = shift(@_) || undef;
810
811    my $opt_dot_file_ext = "";
812    if (defined $file_ext) {
813    if ($file_ext !~ m/\./) {
814        # no dot, so needs one added in at start
815        $opt_dot_file_ext = ".$file_ext"
816    }
817    else {
818        # allow for "extensions" such as _metadata.txt to be handled
819        # gracefully
820        $opt_dot_file_ext = $file_ext;
821    }
822    }
823
824    my $tmpdir = filename_cat($ENV{'GSDLHOME'}, "tmp");
825    &mk_all_dir ($tmpdir) unless -e $tmpdir;
826
827    my $count = 1000;
828    my $rand = int(rand $count);
829    my $full_tmp_filename = &filename_cat($tmpdir, "F$rand$opt_dot_file_ext");
830
831    while (-e $full_tmp_filename) {
832    $rand = int(rand $count);
833    $full_tmp_filename = &filename_cat($tmpdir, "F$rand$opt_dot_file_ext");
834    $count++;
835    }
836   
837    return $full_tmp_filename;
838}
839
840sub get_timestamped_tmp_folder
841{
842
843    my $tmp_dirname;
844    if(defined $ENV{'GSDLCOLLECTDIR'}) {
845    $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
846    } elsif(defined $ENV{'GSDLHOME'}) {
847    $tmp_dirname = $ENV{'GSDLHOME'};
848    } else {
849    return undef;
850    }
851
852    $tmp_dirname = &util::filename_cat($tmp_dirname, "tmp");
853    &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
854
855    # add the timestamp into the path otherwise we can run into problems
856    # if documents have the same name
857    my $timestamp = time;
858    my $time_tmp_dirname = &util::filename_cat($tmp_dirname, $timestamp);
859    $tmp_dirname = $time_tmp_dirname;
860    my $i = 1;
861    while (-e $tmp_dirname) {
862    $tmp_dirname = "$time_tmp_dirname$i";
863    $i++;
864    }
865    &util::mk_dir($tmp_dirname);
866 
867    return $tmp_dirname;
868}
869
870sub get_timestamped_tmp_filename_in_collection
871{
872
873    my ($input_filename, $output_ext) = @_;
874    # derive tmp filename from input filename
875    my ($tailname, $dirname, $suffix)
876    = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
877
878    # softlink to collection tmp dir
879    my $tmp_dirname = &util::get_timestamped_tmp_folder();
880    $tmp_dirname = $dirname unless defined $tmp_dirname;
881
882    # following two steps copied from ConvertBinaryFile
883    # do we need them?? can't use them as is, as they use plugin methods.
884
885    #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
886
887    # URLEncode this since htmls with images where the html filename is utf8 don't seem
888    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
889    # files on the filesystem.
890    #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
891    if (defined $output_ext) {
892    $output_ext = ".$output_ext"; # add the dot
893    } else {
894    $output_ext = $suffix;
895    }
896    $output_ext= lc($output_ext);
897    my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$output_ext");
898   
899    return $tmp_filename;
900}
901
902sub get_toplevel_tmp_dir
903{
904    return filename_cat($ENV{'GSDLHOME'}, "tmp");
905}
906
907
908sub filename_to_regex {
909    my $filename = shift (@_);
910
911    # need to make single backslashes double so that regex works
912    $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);   
913   
914    # note that the first part of a substitution is a regex, so RE chars need to be escaped,
915    # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
916    $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
917    $filename =~ s@\(@\\(@g; # escape brackets
918    $filename =~ s@\)@\\)@g; # escape brackets
919    $filename =~ s@\[@\\[@g; # escape brackets
920    $filename =~ s@\]@\\]@g; # escape brackets
921   
922    return $filename;
923}
924
925sub unregex_filename {
926    my $filename = shift (@_);
927
928    # need to put doubled backslashes for regex back to single
929    $filename =~ s/\\\./\./g; # remove RE syntax for .
930    $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
931    $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
932    $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
933    $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
934   
935    # \\ goes to \
936    # This is the last step in reverse mirroring the order of steps in filename_to_regex()
937    $filename =~ s/\\\\/\\/g; # remove RE syntax for \   
938    return $filename;
939}
940
941sub filename_cat {
942    my $first_file = shift(@_);
943    my (@filenames) = @_;
944
945#   Useful for debugging
946#     -- might make sense to call caller(0) rather than (1)??
947#   my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
948#   print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
949   
950    # If first_file is not null or empty, then add it back into the list
951    if (defined $first_file && $first_file =~ /\S/) {
952    unshift(@filenames, $first_file);
953    }
954
955    my $filename = join("/", @filenames);
956
957    # remove duplicate slashes and remove the last slash
958    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
959    $filename =~ s/[\\\/]+/\\/g;
960    } else {
961    $filename =~ s/[\/]+/\//g;
962    # DB: want a filename abc\de.html to remain like this
963    }
964    $filename =~ s/[\\\/]$//;
965
966    return $filename;
967}
968
969
970sub pathname_cat {
971    my $first_path = shift(@_);
972    my (@pathnames) = @_;
973
974    # If first_path is not null or empty, then add it back into the list
975    if (defined $first_path && $first_path =~ /\S/) {
976    unshift(@pathnames, $first_path);
977    }
978
979    my $join_char;
980    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
981    $join_char = ";";
982    } else {
983    $join_char = ":";
984    }
985
986    my $pathname = join($join_char, @pathnames);
987
988    # remove duplicate slashes
989    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
990    $pathname =~ s/[\\\/]+/\\/g;
991    } else {
992    $pathname =~ s/[\/]+/\//g;
993    # DB: want a pathname abc\de.html to remain like this
994    }
995
996    return $pathname;
997}
998
999
1000sub tidy_up_oid {
1001    my ($OID) = @_;
1002    if ($OID =~ /\./) {
1003    print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
1004    $OID =~ s/\.//g; #remove any periods
1005    }
1006    if ($OID =~ /^\s.*\s$/) {
1007    print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
1008    # remove starting and trailing whitespace
1009    $OID =~ s/^\s+//;
1010    $OID =~ s/\s+$//;
1011    }
1012    if ($OID =~ /^[\d]*$/) {
1013    print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
1014    $OID = "D" . $OID;
1015    }       
1016   
1017    return $OID;
1018}
1019sub envvar_prepend {
1020    my ($var,$val) = @_;
1021
1022    # do not prepend any value/path that's already in the environment variable
1023   
1024    my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
1025    if (!defined($ENV{$var})) {
1026    $ENV{$var} = "$val";
1027    }
1028    elsif($ENV{$var} !~ m/$escaped_val/) {
1029    $ENV{$var} = "$val;".$ENV{$var};
1030    }
1031}
1032
1033sub envvar_append {
1034    my ($var,$val) = @_;
1035   
1036    # do not append any value/path that's already in the environment variable
1037   
1038    my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
1039    if (!defined($ENV{$var})) {
1040    $ENV{$var} = "$val";
1041    }
1042    elsif($ENV{$var} !~ m/$escaped_val/) {
1043    $ENV{$var} .= ";$val";
1044    }
1045}
1046
1047
1048# splits a filename into a prefix and a tail extension using the tail_re, or
1049# if that fails, splits on the file_extension . (dot)
1050sub get_prefix_and_tail_by_regex {
1051
1052    my ($filename,$tail_re) = @_;
1053   
1054    my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
1055    if ((!defined $file_prefix) || (!defined $file_ext)) {
1056    ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
1057    }
1058
1059    return ($file_prefix,$file_ext);
1060}
1061
1062# get full path and file only path from a base_dir (which may be empty) and
1063# file (which may contain directories)
1064sub get_full_filenames {
1065    my ($base_dir, $file) = @_;
1066   
1067    my $filename_full_path = $file;
1068    # add on directory if present
1069    $filename_full_path = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\S/;
1070   
1071    my $filename_no_path = $file;
1072
1073    # remove directory if present
1074    $filename_no_path =~ s/^.*[\/\\]//;
1075    return ($filename_full_path, $filename_no_path);
1076}
1077
1078# returns the path of a file without the filename -- ie. the directory the file is in
1079sub filename_head {
1080    my $filename = shift(@_);
1081
1082    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1083    $filename =~ s/[^\\\\]*$//;
1084    }
1085    else {
1086    $filename =~ s/[^\\\/]*$//;
1087    }
1088
1089    return $filename;
1090}
1091
1092
1093
1094# returns 1 if filename1 and filename2 point to the same
1095# file or directory
1096sub filenames_equal {
1097    my ($filename1, $filename2) = @_;
1098
1099    # use filename_cat to clean up trailing slashes and
1100    # multiple slashes
1101    $filename1 = filename_cat ($filename1);
1102    $filename2 = filename_cat ($filename2);
1103
1104    # filenames not case sensitive on windows
1105    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1106    $filename1 =~ tr/[A-Z]/[a-z]/;
1107    $filename2 =~ tr/[A-Z]/[a-z]/;
1108    }
1109    return 1 if $filename1 eq $filename2;
1110    return 0;
1111}
1112
1113# If filename is relative to within_dir, returns the relative path of filename to that directory
1114# with slashes in the filename returned as they were in the original (absolute) filename.
1115sub filename_within_directory
1116{
1117    my ($filename,$within_dir) = @_;
1118   
1119    if ($within_dir !~ m/[\/\\]$/) {
1120    my $dirsep = &util::get_dirsep();
1121    $within_dir .= $dirsep;
1122    }
1123   
1124    $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets   
1125    if ($filename =~ m/^$within_dir(.*)$/) {
1126    $filename = $1;
1127    }
1128   
1129    return $filename;
1130}
1131
1132# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
1133# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
1134# The subpath returned will also be a URL type filename.
1135sub filename_within_directory_url_format
1136{
1137    my ($filename,$within_dir) = @_;
1138   
1139    # convert parameters only to / slashes if Windows
1140   
1141    my $filename_urlformat = &filepath_to_url_format($filename);
1142    my $within_dir_urlformat = &filepath_to_url_format($within_dir);
1143
1144    #if ($within_dir_urlformat !~ m/\/$/) {
1145        # make sure directory ends with a slash
1146        #$within_dir_urlformat .= "/";
1147    #}
1148   
1149    my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
1150   
1151    #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
1152   
1153    # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
1154    if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
1155        $filename_urlformat = $1;
1156    }
1157   
1158    return $filename_urlformat;
1159}
1160
1161# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
1162# since on Linux it doesn't represent a file separator but an escape char).
1163sub filepath_to_url_format
1164{
1165    my ($filepath) = @_;
1166    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1167        # Only need to worry about Windows, as Unix style directories already in url-format
1168        # Convert Windows style \ => /
1169        $filepath =~ s@\\@/@g;     
1170    }
1171    return $filepath;
1172}
1173
1174# regex filepaths on windows may include \\ as path separator. Convert \\ to /
1175sub filepath_regex_to_url_format
1176{
1177    my ($filepath) = @_;
1178    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1179    # Only need to worry about Windows, as Unix style directories already in url-format
1180    # Convert Windows style \\ => /
1181    $filepath =~ s@\\\\@/@g;       
1182    }
1183    return $filepath;
1184   
1185}
1186
1187# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
1188# and ignores trailing /
1189# returns (file, dirs) dirs will be empty if no subdirs
1190sub url_fileparse
1191{
1192    my ($filepath) = @_;
1193    # remove trailing /
1194    $filepath =~ s@/$@@;
1195    if ($filepath !~ m@/@) {
1196    return ($filepath, "");
1197    }
1198    my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
1199    return ($file, $dirs);
1200   
1201}
1202
1203
1204sub filename_within_collection
1205{
1206    my ($filename) = @_;
1207
1208    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
1209   
1210    if (defined $collect_dir) {
1211
1212    # if from within GSDLCOLLECTDIR, then remove directory prefix
1213    # so source_filename is realative to it.  This is done to aid
1214    # portability, i.e. the collection can be moved to somewhere
1215    # else on the file system and the archives directory will still
1216    # work.  This is needed, for example in the applet version of
1217    # GLI where GSDLHOME/collect on the server will be different to
1218    # the collect directory of the remove user.  Of course,
1219    # GSDLCOLLECTDIR subsequently needs to be put back on to turn
1220    # it back into a full pathname.
1221
1222    $filename = filename_within_directory($filename,$collect_dir);
1223    }
1224   
1225    return $filename;
1226}
1227
1228sub prettyprint_file
1229{
1230    my ($base_dir,$file,$gli) = @_;
1231
1232    my $filename_full_path = &util::filename_cat($base_dir,$file);
1233
1234    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1235    require Win32;
1236
1237    # For some reason base_dir in the form c:/a/b/c
1238    # This leads to confusion later on, so turn it back into
1239    # the more usual Windows form
1240    $base_dir =~ s/\//\\/g;
1241    my $long_base_dir = Win32::GetLongPathName($base_dir);
1242    my $long_full_path = Win32::GetLongPathName($filename_full_path);
1243
1244    $file = filename_within_directory($long_full_path,$long_base_dir);
1245    $file = encode("utf8",$file) if ($gli);
1246    }
1247
1248    return $file;
1249}
1250
1251
1252sub upgrade_if_dos_filename
1253{
1254    my ($filename_full_path,$and_encode) = @_;
1255
1256    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1257    # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
1258    # to its long (Windows) version
1259    my $long_filename = Win32::GetLongPathName($filename_full_path);
1260    if (defined $long_filename) {
1261        $filename_full_path = $long_filename;
1262    }
1263    # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
1264    $filename_full_path =~ s/^(.):/\u$1:/;
1265    if ((defined $and_encode) && ($and_encode)) {
1266        $filename_full_path = encode("utf8",$filename_full_path);
1267    }
1268    }
1269
1270    return $filename_full_path;
1271}
1272
1273
1274sub downgrade_if_dos_filename
1275{
1276    my ($filename_full_path) = @_;
1277
1278    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1279    require Win32;
1280
1281    # Ensure the given long Windows filename is in a form that can
1282    # be opened by Perl => convert it to a short DOS-like filename
1283
1284    my $short_filename = Win32::GetShortPathName($filename_full_path);
1285    if (defined $short_filename) {
1286        $filename_full_path = $short_filename;
1287    }
1288    # Make sure initial drive letter is lower-case (to fit in
1289    # with rest of Greenstone)
1290    $filename_full_path =~ s/^(.):/\u$1:/;
1291    }
1292
1293    return $filename_full_path;
1294}
1295
1296sub block_filename
1297{
1298    my ($block_hash,$filename) = @_;
1299   
1300    if ($ENV{'GSDLOS'} =~ m/^windows$/) {
1301   
1302    # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
1303    my $lower_filename = lc($filename);
1304    $block_hash->{'file_blocks'}->{$lower_filename} = 1;
1305#   my $lower_drive = $filename;
1306#   $lower_drive =~ s/^([A-Z]):/\l$1:/i;
1307   
1308#   my $upper_drive = $filename;
1309#   $upper_drive =~ s/^([A-Z]):/\u$1:/i;
1310#   
1311#   $block_hash->{'file_blocks'}->{$lower_drive} = 1;
1312#   $block_hash->{'file_blocks'}->{$upper_drive} = 1;       
1313    }
1314    else {
1315    $block_hash->{'file_blocks'}->{$filename} = 1;
1316    }
1317}
1318
1319
1320sub filename_is_absolute
1321{
1322    my ($filename) = @_;
1323
1324    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1325    return ($filename =~ m/^(\w:)?\\/);
1326    }
1327    else {
1328    return ($filename =~ m/^\//);
1329    }
1330}
1331
1332
1333## @method make_absolute()
1334#
1335#  Ensure the given file path is absolute in respect to the given base path.
1336#
1337#  @param  $base_dir A string denoting the base path the given dir must be
1338#                    absolute to.
1339#  @param  $dir The directory to be made absolute as a string. Note that the
1340#               dir may already be absolute, in which case it will remain
1341#               unchanged.
1342#  @return The now absolute form of the directory as a string.
1343#
1344#  @author John Thompson, DL Consulting Ltd.
1345#  @copy 2006 DL Consulting Ltd.
1346#
1347#used in buildcol.pl, doesn't work for all cases --kjdon
1348sub make_absolute {
1349   
1350    my ($base_dir, $dir) = @_;
1351###    print STDERR "dir = $dir\n";
1352    $dir =~ s/[\\\/]+/\//g;
1353    $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
1354    $dir =~ s|^/tmp_mnt||;
1355    1 while($dir =~ s|/[^/]*/\.\./|/|g);
1356    $dir =~ s|/[.][.]?/|/|g;
1357    $dir =~ tr|/|/|s;
1358###    print STDERR "dir = $dir\n";
1359   
1360    return $dir;
1361}
1362## make_absolute() ##
1363
1364sub get_dirsep {
1365
1366    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1367    return "\\";
1368    } else {
1369    return "\/";
1370    }
1371}
1372
1373sub get_os_dirsep {
1374
1375    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1376    return "\\\\";
1377    } else {
1378    return "\\\/";
1379    }
1380}
1381
1382sub get_re_dirsep {
1383
1384    return "\\\\|\\\/";
1385}
1386
1387
1388sub get_dirsep_tail {
1389    my ($filename) = @_;
1390   
1391    # returns last part of directory or filename
1392    # On unix e.g. a/b.d => b.d
1393    #              a/b/c => c
1394
1395    my $dirsep = get_re_dirsep();
1396    my @dirs = split (/$dirsep/, $filename);
1397    my $tail = pop @dirs;
1398
1399    # - caused problems under windows
1400    #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
1401
1402    return $tail;
1403}
1404
1405
1406# if this is running on windows we want binaries to end in
1407# .exe, otherwise they don't have to end in any extension
1408sub get_os_exe {
1409    return ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
1410    return "";
1411}
1412
1413
1414# test to see whether this is a big or little endian machine
1415sub is_little_endian
1416{
1417    # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
1418    # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
1419    # Otherwise, it's little endian
1420
1421    #return 0 if $^O =~ /^darwin$/i;
1422    #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
1423   
1424    # Going back to stating exactly whether the machine is little endian
1425    # or big endian, without any special case for Macs. Since for rata it comes
1426    # back with little endian and for shuttle with bigendian.
1427    return (ord(substr(pack("s",1), 0, 1)) == 1);
1428}
1429
1430
1431# will return the collection name if successful, "" otherwise
1432sub use_collection {
1433    my ($collection, $collectdir) = @_;
1434
1435    if (!defined $collectdir || $collectdir eq "") {
1436    $collectdir = &filename_cat ($ENV{'GSDLHOME'}, "collect");
1437    }
1438
1439    # get and check the collection
1440    if (!defined($collection) || $collection eq "") {
1441    if (defined $ENV{'GSDLCOLLECTION'}) {
1442        $collection = $ENV{'GSDLCOLLECTION'};
1443    } else {
1444        print STDOUT "No collection specified\n";
1445        return "";
1446    }
1447    }
1448   
1449    if ($collection eq "modelcol") {
1450    print STDOUT "You can't use modelcol.\n";
1451    return "";
1452    }
1453
1454    # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
1455    # are defined
1456    $ENV{'GSDLCOLLECTION'} = $collection;
1457    $ENV{'GSDLCOLLECTDIR'} = &filename_cat ($collectdir, $collection);
1458
1459    # make sure this collection exists
1460    if (!-e $ENV{'GSDLCOLLECTDIR'}) {
1461    print STDOUT "Invalid collection ($collection).\n";
1462    return "";
1463    }
1464
1465    # everything is ready to go
1466    return $collection;
1467}
1468
1469sub get_current_collection_name {
1470    return $ENV{'GSDLCOLLECTION'};
1471}
1472
1473
1474# will return the collection name if successful, "" otherwise. 
1475# Like use_collection (above) but for greenstone 3 (taking account of site level)
1476
1477sub use_site_collection {
1478    my ($site, $collection, $collectdir) = @_;
1479
1480    if (!defined $collectdir || $collectdir eq "") {
1481    die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
1482    $collectdir = &filename_cat ($ENV{'GSDL3HOME'}, "sites", $site, "collect");
1483    }
1484
1485    # collectdir explicitly set by this point (using $site variable if required).
1486    # Can call "old" gsdl2 use_collection now.
1487
1488    return use_collection($collection,$collectdir);
1489}
1490
1491
1492
1493sub locate_config_file
1494{
1495    my ($file) = @_;
1496
1497    my $locations = locate_config_files($file);
1498
1499    return shift @$locations; # returns undef if 'locations' is empty
1500}
1501
1502
1503sub locate_config_files
1504{
1505    my ($file) = @_;
1506
1507    my @locations = ();
1508
1509    if (-e $file) {
1510    # Clearly specified (most likely full filename)
1511    # No need to hunt in 'etc' directories, return value unchanged
1512    push(@locations,$file);
1513    }
1514    else {
1515    # Check for collection specific one before looking in global GSDL 'etc'
1516    if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
1517        my $test_collect_etc_filename
1518        = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $file);
1519       
1520        if (-e $test_collect_etc_filename) {
1521        push(@locations,$test_collect_etc_filename);
1522        }
1523    }
1524    my $test_main_etc_filename
1525        = &util::filename_cat($ENV{'GSDLHOME'},"etc", $file);
1526    if (-e $test_main_etc_filename) {
1527        push(@locations,$test_main_etc_filename);
1528    }
1529    }
1530
1531    return \@locations;
1532}
1533
1534
1535sub hyperlink_text
1536{
1537    my ($text) = @_;
1538   
1539    $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
1540    $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
1541
1542    return $text;
1543}
1544
1545
1546# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
1547# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
1548sub is_dir_empty
1549{
1550    my ($path) = @_;
1551    opendir DIR, $path;
1552    while(my $entry = readdir DIR) {
1553        next if($entry =~ /^\.\.?$/);
1554        closedir DIR;
1555        return 0;
1556    }
1557    closedir DIR;
1558    return 1;
1559}
1560
1561# Returns the given filename converted using either URL encoding or base64
1562# encoding, as specified by $rename_method. If the given filename has no suffix
1563# (if it is just the tailname), then $no_suffix should be some defined value.
1564# rename_method can be url, none, base64
1565sub rename_file {
1566    my ($filename, $rename_method, $no_suffix)  = @_;
1567
1568    if(!$filename) { # undefined or empty string
1569    return $filename;
1570    }
1571
1572    if (!$rename_method) {
1573    print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
1574    # Debugging information
1575    # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1576    # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
1577    $rename_method = "url";
1578    } elsif($rename_method eq "none") {
1579    return $filename; # would have already been renamed
1580    }
1581
1582    # No longer replace spaces with underscores, since underscores mess with incremental rebuild
1583    ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
1584    ###$filename =~ s/ /_/g;
1585
1586    my ($tailname,$dirname,$suffix);
1587    if($no_suffix) { # given a tailname, no suffix
1588    ($tailname,$dirname) = File::Basename::fileparse($filename);
1589    }
1590    else {
1591    ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
1592    }
1593    if (!$suffix) {
1594    $suffix = "";
1595    }
1596    else {
1597    $suffix = lc($suffix);
1598    }
1599
1600    if ($rename_method eq "url") {
1601    $tailname = &unicode::url_encode($tailname);
1602    }
1603    elsif ($rename_method eq "base64") {
1604    $tailname = &unicode::base64_encode($tailname);
1605    $tailname =~ s/\s*//sg;      # for some reason it adds spaces not just at end but also in middle
1606    }
1607
1608    $filename = "$tailname$suffix";
1609    $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
1610
1611    return $filename;
1612}
1613
1614
1615# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
1616sub rename_ldb_or_bdb_file {
1617    my ($filename_no_ext) = @_;
1618
1619    my $new_filename = "$filename_no_ext.gdb";
1620    return if (-f $new_filename); # if the file has the right extension, don't need to do anything
1621    # try ldb
1622    my $old_filename = "$filename_no_ext.ldb";
1623   
1624    if (-f $old_filename) {
1625    print STDERR "Renaming $old_filename to $new_filename\n";
1626    rename ($old_filename, $new_filename)
1627        || print STDERR "Rename failed: $!\n";
1628    return;
1629    }
1630    # try bdb
1631    $old_filename = "$filename_no_ext.bdb";
1632    if (-f $old_filename) {
1633    print STDERR "Renaming $old_filename to $new_filename\n";   
1634    rename ($old_filename, $new_filename)
1635        || print STDERR "Rename failed: $!\n";
1636    return;
1637    }
1638}
1639
1640sub os_dir() {
1641   
1642    my $gsdlarch = "";
1643    if(defined $ENV{'GSDLARCH'}) {
1644    $gsdlarch = $ENV{'GSDLARCH'};
1645    }
1646    return $ENV{'GSDLOS'}.$gsdlarch;
1647}
1648
1649# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1650# By default, /greenstone3 for GS3 or /greenstone for GS2.
1651sub get_greenstone_url_prefix() {
1652    # if already set on a previous occasion, just return that
1653    # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1654    return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1655
1656    my ($configfile, $urlprefix, $defaultUrlprefix);
1657    my @propertynames = ();
1658
1659    if($ENV{'GSDL3SRCHOME'}) {
1660    $defaultUrlprefix = "/greenstone3";
1661    $configfile = &util::filename_cat($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1662    push(@propertynames, qw/path\s*\=/);
1663    } else {
1664    $defaultUrlprefix = "/greenstone";
1665    $configfile = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1666    push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1667    }
1668
1669    $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1670
1671    if(!$urlprefix) { # no values found for URL prefix, use default values
1672    $urlprefix = $defaultUrlprefix;
1673    } else {
1674    #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1675    $urlprefix =~ s/^\///; # remove the starting slash
1676    my @dirs = split(/(\\|\/)/, $urlprefix);
1677    $urlprefix = shift(@dirs);
1678
1679    if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1680        $urlprefix = "/$urlprefix";
1681    }
1682    }
1683
1684    # set for the future
1685    $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1686#    print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1687    return $urlprefix;
1688}
1689
1690
1691# Given a config file (xml or java properties file) and a list/array of regular expressions
1692# that represent property names to match on, this function will return the value for the 1st
1693# matching property name. If the return value is undefined, no matching property was found.
1694sub extract_propvalue_from_file() {
1695    my ($configfile, $propertynames) = @_;
1696
1697    my $value;
1698    unless(open(FIN, "<$configfile")) {
1699    print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1700    return $value; # not initialised
1701    }
1702
1703    # Read the entire file at once, as one single line, then close it
1704    my $filecontents;
1705    {
1706    local $/ = undef;       
1707    $filecontents = <FIN>;
1708    }
1709    close(FIN);
1710
1711    foreach my $regex (@$propertynames) {
1712        ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1713    if($value) {
1714            $value =~ s/^\"//;     # remove any startquotes
1715        $value =~ s/\".*$//;   # remove the 1st endquotes (if any) followed by any xml
1716        last;              # found value for a matching property, break from loop
1717    }
1718    }
1719
1720    return $value;
1721}
1722
1723# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1724# given that perllib is in @INC in order to invoke this subroutine.
1725# Call as follows -- after setting up INC to include perllib and
1726# after setting up GSDLHOME and GSDLOS:
1727#
1728# require util;
1729# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1730#
1731sub setup_greenstone_env() {
1732    my ($GSDLHOME, $GSDLOS) = @_;
1733
1734    #my %env_map = ();
1735    # Get the localised ENV settings of running a localised source setup.bash
1736    # and put it into the ENV here. Need to clear GSDLHOME before running setup
1737    #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1738    my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";     
1739    if($GSDLOS =~ m/windows/i) {
1740        #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1741        $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1742    }
1743    if (!open(PIN, "$perl_command |")) {
1744        print STDERR ("Unable to execute command: $perl_command. $!\n");
1745    }
1746
1747    while (defined (my $perl_output_line = <PIN>)) {
1748        my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1749        #$env_map{$key}=$value;     
1750        $ENV{$key}=$value;
1751    }
1752    close (PIN);
1753
1754    # If any keys in $ENV don't occur in Greenstone's localised env
1755    # (stored in $env_map), delete those entries from $ENV
1756    #foreach $key (keys %ENV) {
1757    #   if(!defined $env_map{$key}) {
1758    #       print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{'$key'}\n";
1759    #       delete $ENV{$key}; # del $ENV(key, value) pair
1760    #   }
1761    #}
1762    #undef %env_map;
1763}
1764
1765sub get_perl_exec() {   
1766    my $perl_exec = $^X; # may return just "perl"
1767   
1768    if($ENV{'PERLPATH'}) {
1769        # OR: # $perl_exec = &util::filename_cat($ENV{'PERLPATH'},"perl");
1770        if($ENV{'GSDLOS'} =~ m/windows/) {
1771            $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1772        } else {
1773            $perl_exec = "$ENV{'PERLPATH'}/perl";
1774        }
1775    } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1776        # containing the full path to the current perl executable we're using
1777        $perl_exec = $Config{perlpath}; # configured path for perl
1778        if (!-e $perl_exec) { # may not point to location on this machine
1779            $perl_exec = $^X; # may return just "perl"
1780            if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1781                print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";             
1782            }
1783        }
1784    }
1785   
1786    return $perl_exec;
1787}
1788
1789# returns the path to the java command in the JRE included with GS (if any),
1790# quoted to safeguard any spaces in this path, otherwise a simple java
1791# command is returned which assumes and will try for a system java.
1792sub get_java_command {
1793    my $java = "java";
1794    if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1795                               # after running setup.bat or from GLI which also runs setup.bat
1796    my $java_bin = &util::filename_cat($ENV{'GSDLHOME'},"packages","jre","bin");
1797    if(-d $java_bin) {
1798        $java = &util::filename_cat($java_bin,"java");
1799        $java = "\"".$java."\""; # quoted to preserve spaces in path
1800    }
1801    }
1802    return $java;
1803}
1804
1805
1806# Given the qualified collection name (colgroup/collection),
1807# returns the collection and colgroup parts
1808sub get_collection_parts {
1809    # http://perldoc.perl.org/File/Basename.html
1810    # my($filename, $directories, $suffix) = fileparse($path);
1811    # "$directories contains everything up to and including the last directory separator in the $path
1812    # including the volume (if applicable). The remainder of the $path is the $filename."
1813    #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);   
1814
1815    my $qualified_collection = shift(@_);
1816
1817    # Since activate.pl can be launched from the command-line, including by a user,
1818    # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1819    # Also allow for the accidental inclusion of multiple slashes
1820    my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1821   
1822    if(!defined $collection) {
1823        $collection = $colgroup;
1824        $colgroup = "";
1825    }
1826    return ($collection, $colgroup);
1827}
1828
1829# work out the "collectdir/collection" location
1830sub resolve_collection_dir {
1831    my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1832   
1833    my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);   
1834   
1835    if (defined $collect_dir) {
1836        return &util::filename_cat($collect_dir,$colgroup, $collection);
1837    }
1838    elsif (defined($ENV{'GSDLCOLLECTDIR'})) {
1839        return $ENV{'GSDLCOLLECTDIR'};
1840    }
1841    else {
1842        if (defined $site) {
1843            return &util::filename_cat($ENV{'GSDL3HOME'},"sites",$site,"collect",$colgroup, $collection);
1844        }
1845        else {
1846            return &util::filename_cat($ENV{'GSDLHOME'},"collect",$colgroup, $collection);
1847        }
1848    }
1849}
1850
1851# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1852# a directory containing sequentially numbered images.
1853sub create_itemfile
1854{
1855    my ($output_dir, $convert_basename, $convert_to) = @_;
1856    opendir(DIR, $output_dir) || die "can't opendir $output_dir: $!"; 
1857
1858    my $page_num = "";
1859    my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1860
1861    # Sort files in the directory by page_num   
1862    sub page_number {
1863    my ($dir) = @_;
1864    my ($pagenum) =($dir =~ m/^.*[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
1865
1866    $pagenum = 1 unless defined $pagenum;
1867    return $pagenum;
1868    }
1869
1870    # sort the files in the directory in the order of page_num rather than lexically.
1871    @dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1872
1873    # work out if the numbering of the now sorted image files starts at 0 or not
1874    # by checking the number of the first _image_ file (skipping item files)
1875    my $starts_at_0 = 0;
1876    my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1877    if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1878    $starts_at_0 = 1;
1879    }
1880
1881    my $item_file = &util::filename_cat($output_dir, $convert_basename.".item");
1882    open(FILE,">$item_file");   
1883    print FILE "<PagedDocument>\n";
1884
1885    foreach my $file (@dir_files){
1886    if ($file !~ /\.item/i){
1887        $page_num = page_number($file);
1888        $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1889        print FILE "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1890    }
1891    }
1892
1893    print FILE "</PagedDocument>\n";
1894    closedir DIR;
1895    return $item_file;
1896}
1897
18981;
Note: See TracBrowser for help on using the browser.