root/main/trunk/greenstone2/perllib/util.pm @ 28375

Revision 28375, 50.1 KB (checked in by davidb, 7 years ago)

A set of changes to help Greenstone building code (perl) run under Cygwin. The test is designed to be mutually to when run natively on Windows. In effect the refined test is saying: if you're windows but not cygwin then do as you used to do for Windows, otherwise go with Unix (as Cygwin is effectively giving you a Unix like operating system to run in)

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package util;
27
28use strict;
29use FileUtils;
30
31use Encode;
32use File::Copy;
33use File::Basename;
34# Config for getting the perlpath in the recommended way, though it uses paths that are
35# hard-coded into the Config file that's generated upon configuring and compiling perl.
36# $^X works better in some cases to return the path to perl used to launch the script,
37# but if launched with plain "perl" (no full-path), that will be just what it returns.
38use Config;
39# New module for file related utility functions - intended as a
40# placeholder for an extension that allows a variety of different
41# filesystems (FTP, HTTP, SAMBA, WEBDav, HDFS etc)
42use FileUtils;
43
44if ($ENV{'GSDLOS'} =~ /^windows$/i) {
45    require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
46}
47
48# removes files (but not directories)
49sub rm {
50  warnings::warnif("deprecated", "util::rm() is deprecated, using FileUtils::removeFiles() instead");
51  return &FileUtils::removeFiles(@_);
52}
53
54# recursive removal
55sub filtered_rm_r {
56  warnings::warnif("deprecated", "util::filtered_rm_r() is deprecated, using FileUtils::removeFilesFiltered() instead");
57  return &FileUtils::removeFilesFiltered(@_);
58}
59
60# recursive removal
61sub rm_r {
62  warnings::warnif("deprecated", "util::rm_r() is deprecated, using FileUtils::removeFilesRecursive() instead");
63  return &FileUtils::removeFilesRecursive(@_);
64}
65
66# moves a file or a group of files
67sub mv {
68  warnings::warnif("deprecated", "util::mv() is deprecated, using FileUtils::moveFiles() instead");
69  return &FileUtils::moveFiles(@_);
70}
71
72# Move the contents of source directory into target directory
73# (as opposed to merely replacing target dir with the src dir)
74# This can overwrite any files with duplicate names in the target
75# but other files and folders in the target will continue to exist
76sub mv_dir_contents {
77  warnings::warnif("deprecated", "util::mv_dir_contents() is deprecated, using FileUtils::moveDirectoryContents() instead");
78  return &FileUtils::moveDirectoryContents(@_);
79}
80
81# copies a file or a group of files
82sub cp {
83  warnings::warnif("deprecated", "util::cp() is deprecated, using FileUtils::copyFiles() instead");
84  return &FileUtils::copyFiles(@_);
85}
86
87# recursively copies a file or group of files
88# syntax: cp_r (sourcefiles, destination directory)
89# destination must be a directory - to copy one file to
90# another use cp instead
91sub cp_r {
92  warnings::warnif("deprecated", "util::cp_r() is deprecated, using FileUtils::copyFilesrecursive() instead");
93  return &FileUtils::copyFilesRecursive(@_);
94}
95
96# recursively copies a file or group of files
97# syntax: cp_r (sourcefiles, destination directory)
98# destination must be a directory - to copy one file to
99# another use cp instead
100sub cp_r_nosvn {
101  warnings::warnif("deprecated", "util::cp_r_nosvn() is deprecated, using FileUtils::copyFilesRecursiveNoSVN() instead");
102  return &FileUtils::copyFilesRecursiveNoSVN(@_);
103}
104
105# copies a directory and its contents, excluding subdirectories, into a new directory
106sub cp_r_toplevel {
107  warnings::warnif("deprecated", "util::cp_r_toplevel() is deprecated, using FileUtils::recursiveCopyTopLevel() instead");
108  return &FileUtils::recursiveCopyTopLevel(@_);
109}
110
111sub mk_dir {
112  warnings::warnif("deprecated", "util::mk_dir() is deprecated, using FileUtils::makeDirectory() instead");
113  return &FileUtils::makeDirectory(@_);
114}
115
116# in case anyone cares - I did some testing (using perls Benchmark module)
117# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
118# slightly faster (surprisingly) - Stefan.
119sub mk_all_dir {
120  warnings::warnif("deprecated", "util::mk_all_dir() is deprecated, using FileUtils::makeAllDirectories() instead");
121  return &FileUtils::makeAllDirectories(@_);
122}
123
124# make hard link to file if supported by OS, otherwise copy the file
125sub hard_link {
126  warnings::warnif("deprecated", "util::hard_link() is deprecated, using FileUtils::hardLink() instead");
127  return &FileUtils::hardLink(@_);
128}
129
130# make soft link to file if supported by OS, otherwise copy file
131sub soft_link {
132  warnings::warnif("deprecated", "util::soft_link() is deprecated, using FileUtils::softLink() instead");
133  return &FileUtils::softLink(@_);
134}
135
136# Primarily for filenames generated by processing
137# content of HTML files (which are mapped to UTF-8 internally)
138#
139# To turn this into an octet string that really exists on the file
140# system:
141# 1. don't need to do anything special for Unix-based systems
142#   (as underlying file system is byte-code)
143# 2. need to map to short DOS filenames for Windows
144
145sub utf8_to_real_filename
146{
147    my ($utf8_filename) = @_;
148
149    my $real_filename;
150
151    if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
152    require Win32;
153
154    print STDERR "***** utf8 filename = $utf8_filename\n\n\n";
155
156    my $unicode_filename = decode("utf8",$utf8_filename);
157    $real_filename = Win32::GetShortPathName($unicode_filename);
158    }
159    else {
160    $real_filename = $utf8_filename;
161    }
162
163    return $real_filename;
164}
165
166sub fd_exists {
167  warnings::warnif("deprecated", "util::fd_exists() is deprecated, using FileUtils::fileTest() instead");
168  return &FileUtils::fileTest(@_);
169}
170
171sub file_exists {
172  warnings::warnif("deprecated", "util::file_exists() is deprecated, using FileUtils::fileExists() instead");
173  return &FileUtils::fileExists(@_);
174}
175
176sub dir_exists {
177  warnings::warnif("deprecated", "util::dir_exists() is deprecated, using FileUtils::directoryExists() instead");
178  return &FileUtils::directoryExists(@_);
179}
180
181# updates a copy of a directory in some other part of the filesystem
182# verbosity settings are: 0=low, 1=normal, 2=high
183# both $fromdir and $todir should be absolute paths
184sub cachedir {
185  warnings::warnif("deprecated", "util::cachedir() is deprecated, using FileUtils::synchronizeDirectories() instead");
186  return &FileUtils::synchronizeDirectories(@_);
187}
188
189# this function returns -1 if either file is not found
190# assumes that $file1 and $file2 are absolute file names or
191# in the current directory
192# $file2 is allowed to be newer than $file1
193sub differentfiles {
194  warnings::warnif("deprecated", "util::differentfiles() is deprecated, using FileUtils::differentFiles() instead");
195  return &FileUtils::differentFiles(@_);
196}
197
198
199sub get_tmp_filename
200{
201    my $file_ext = shift(@_) || undef;
202
203    my $opt_dot_file_ext = "";
204    if (defined $file_ext) {
205    if ($file_ext !~ m/\./) {
206        # no dot, so needs one added in at start
207        $opt_dot_file_ext = ".$file_ext"
208    }
209    else {
210        # allow for "extensions" such as _metadata.txt to be handled
211        # gracefully
212        $opt_dot_file_ext = $file_ext;
213    }
214    }
215
216    my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
217    &FileUtils::makeAllDirectories ($tmpdir) unless -e $tmpdir;
218
219    my $count = 1000;
220    my $rand = int(rand $count);
221    my $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
222
223    while (-e $full_tmp_filename) {
224    $rand = int(rand $count);
225    $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
226    $count++;
227    }
228   
229    return $full_tmp_filename;
230}
231
232# These 2 are "static" variables used by the get_timestamped_tmp_folder() subroutine below and
233# belong with that function. They help ensure the timestamped tmp folders generated are unique.
234my $previous_timestamp = undef;
235my $previous_timestamp_f = 0; # frequency
236
237sub get_timestamped_tmp_folder
238{
239   
240    my $tmp_dirname;
241    if(defined $ENV{'GSDLCOLLECTDIR'}) {
242    $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
243    } elsif(defined $ENV{'GSDLHOME'}) {
244    $tmp_dirname = $ENV{'GSDLHOME'};
245    } else {
246    return undef;
247    }
248
249    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "tmp");
250    &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
251
252    # add the timestamp into the path otherwise we can run into problems
253    # if documents have the same name
254    my $timestamp = time;   
255   
256    if (!defined $previous_timestamp || ($timestamp > $previous_timestamp)) {
257        $previous_timestamp_f = 0;
258        $previous_timestamp = $timestamp;
259    } else {
260        $previous_timestamp_f++;
261    }
262   
263    my $time_tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, $timestamp);
264    $tmp_dirname = $time_tmp_dirname;   
265    my $i = $previous_timestamp_f;
266   
267    if($previous_timestamp_f > 0) {
268        $tmp_dirname = $time_tmp_dirname."_".$i;
269        $i++;
270    }
271    while (-e $tmp_dirname) {
272    $tmp_dirname = $time_tmp_dirname."_".$i;
273    $i++;
274    }
275    &FileUtils::makeDirectory($tmp_dirname);
276
277    return $tmp_dirname;
278}
279
280sub get_timestamped_tmp_filename_in_collection
281{
282
283    my ($input_filename, $output_ext) = @_;
284    # derive tmp filename from input filename
285    my ($tailname, $dirname, $suffix)
286    = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
287
288    # softlink to collection tmp dir
289    my $tmp_dirname = &util::get_timestamped_tmp_folder();
290    $tmp_dirname = $dirname unless defined $tmp_dirname;
291
292    # following two steps copied from ConvertBinaryFile
293    # do we need them?? can't use them as is, as they use plugin methods.
294
295    #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
296
297    # URLEncode this since htmls with images where the html filename is utf8 don't seem
298    # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
299    # files on the filesystem.
300    #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
301    if (defined $output_ext) {
302    $output_ext = ".$output_ext"; # add the dot
303    } else {
304    $output_ext = $suffix;
305    }
306    $output_ext= lc($output_ext);
307    my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$output_ext");
308   
309    return $tmp_filename;
310}
311
312sub get_toplevel_tmp_dir
313{
314    return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
315}
316
317
318sub filename_to_regex {
319    my $filename = shift (@_);
320
321    # need to make single backslashes double so that regex works
322    $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);   
323   
324    # note that the first part of a substitution is a regex, so RE chars need to be escaped,
325    # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
326    $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
327    $filename =~ s@\(@\\(@g; # escape brackets
328    $filename =~ s@\)@\\)@g; # escape brackets
329    $filename =~ s@\[@\\[@g; # escape brackets
330    $filename =~ s@\]@\\]@g; # escape brackets
331   
332    return $filename;
333}
334
335sub unregex_filename {
336    my $filename = shift (@_);
337
338    # need to put doubled backslashes for regex back to single
339    $filename =~ s/\\\./\./g; # remove RE syntax for .
340    $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
341    $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
342    $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
343    $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
344   
345    # \\ goes to \
346    # This is the last step in reverse mirroring the order of steps in filename_to_regex()
347    $filename =~ s/\\\\/\\/g; # remove RE syntax for \   
348    return $filename;
349}
350
351sub filename_cat {
352  # I've disabled this warning for now, as every Greenstone perl
353  # script seems to make use of this function and so you drown in a
354  # sea of deprecated warnings [jmt12]
355#  warnings::warnif("deprecated", "util::filename_cat() is deprecated, using FileUtils::filenameConcatenate() instead");
356  return &FileUtils::filenameConcatenate(@_);
357}
358
359
360sub pathname_cat {
361    my $first_path = shift(@_);
362    my (@pathnames) = @_;
363
364    # If first_path is not null or empty, then add it back into the list
365    if (defined $first_path && $first_path =~ /\S/) {
366    unshift(@pathnames, $first_path);
367    }
368
369    my $join_char;
370    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
371    $join_char = ";";
372    } else {
373    $join_char = ":";
374    }
375
376    my $pathname = join($join_char, @pathnames);
377
378    # remove duplicate slashes
379    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
380    $pathname =~ s/[\\\/]+/\\/g;
381    } else {
382    $pathname =~ s/[\/]+/\//g;
383    # DB: want a pathname abc\de.html to remain like this
384    }
385
386    return $pathname;
387}
388
389
390sub tidy_up_oid {
391    my ($OID) = @_;
392    if ($OID =~ /\./) {
393    print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
394    $OID =~ s/\.//g; #remove any periods
395    }
396    if ($OID =~ /^\s.*\s$/) {
397    print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
398    # remove starting and trailing whitespace
399    $OID =~ s/^\s+//;
400    $OID =~ s/\s+$//;
401    }
402    if ($OID =~ /^[\d]*$/) {
403    print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
404    $OID = "D" . $OID;
405    }       
406   
407    return $OID;
408}
409
410sub envvar_prepend {
411    my ($var,$val) = @_;
412
413    # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
414##    my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
415
416    # Rewritten above to make ":" the default (Windows is the special
417    # case, anything else 'unusual' such as Solaris etc is Unix)
418    my $pathsep = (defined $ENV{'GSDLOS'} && (($ENV{'GSDLOS'} =~ m/windows/) && ($^O ne "cygwin"))) ? ";" : ":";
419
420    # do not prepend any value/path that's already in the environment variable
421   
422    my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
423    if (!defined($ENV{$var})) {
424    $ENV{$var} = "$val";
425    }
426    elsif($ENV{$var} !~ m/$escaped_val/) {
427    $ENV{$var} = "$val".$pathsep.$ENV{$var};
428    }
429}
430
431sub envvar_append {
432    my ($var,$val) = @_;
433
434    # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
435    my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
436   
437    # do not append any value/path that's already in the environment variable
438
439    my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
440    if (!defined($ENV{$var})) {
441    $ENV{$var} = "$val";
442    }
443    elsif($ENV{$var} !~ m/$escaped_val/) {
444    $ENV{$var} = $ENV{$var}.$pathsep."$val";
445    }
446}
447
448
449# splits a filename into a prefix and a tail extension using the tail_re, or
450# if that fails, splits on the file_extension . (dot)
451sub get_prefix_and_tail_by_regex {
452
453    my ($filename,$tail_re) = @_;
454   
455    my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
456    if ((!defined $file_prefix) || (!defined $file_ext)) {
457    ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
458    }
459
460    return ($file_prefix,$file_ext);
461}
462
463# get full path and file only path from a base_dir (which may be empty) and
464# file (which may contain directories)
465sub get_full_filenames {
466    my ($base_dir, $file) = @_;
467   
468#    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
469#    my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
470#    print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
471
472
473    my $filename_full_path = $file;
474    # add on directory if present
475    $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\S/;
476   
477    my $filename_no_path = $file;
478
479    # remove directory if present
480    $filename_no_path =~ s/^.*[\/\\]//;
481    return ($filename_full_path, $filename_no_path);
482}
483
484# returns the path of a file without the filename -- ie. the directory the file is in
485sub filename_head {
486    my $filename = shift(@_);
487
488    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
489    $filename =~ s/[^\\\\]*$//;
490    }
491    else {
492    $filename =~ s/[^\\\/]*$//;
493    }
494
495    return $filename;
496}
497
498
499
500# returns 1 if filename1 and filename2 point to the same
501# file or directory
502sub filenames_equal {
503    my ($filename1, $filename2) = @_;
504
505    # use filename_cat to clean up trailing slashes and
506    # multiple slashes
507    $filename1 = &FileUtils::filenameConcatenate($filename1);
508    $filename2 = &FileUtils::filenameConcatenate($filename2);
509
510    # filenames not case sensitive on windows
511    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
512    $filename1 =~ tr/[A-Z]/[a-z]/;
513    $filename2 =~ tr/[A-Z]/[a-z]/;
514    }
515    return 1 if $filename1 eq $filename2;
516    return 0;
517}
518
519# If filename is relative to within_dir, returns the relative path of filename to that directory
520# with slashes in the filename returned as they were in the original (absolute) filename.
521sub filename_within_directory
522{
523    my ($filename,$within_dir) = @_;
524   
525    if ($within_dir !~ m/[\/\\]$/) {
526    my $dirsep = &util::get_dirsep();
527    $within_dir .= $dirsep;
528    }
529   
530    $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets   
531    if ($filename =~ m/^$within_dir(.*)$/) {
532    $filename = $1;
533    }
534   
535    return $filename;
536}
537
538# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
539# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
540# The subpath returned will also be a URL type filename.
541sub filename_within_directory_url_format
542{
543    my ($filename,$within_dir) = @_;
544   
545    # convert parameters only to / slashes if Windows
546   
547    my $filename_urlformat = &filepath_to_url_format($filename);
548    my $within_dir_urlformat = &filepath_to_url_format($within_dir);
549
550    #if ($within_dir_urlformat !~ m/\/$/) {
551        # make sure directory ends with a slash
552        #$within_dir_urlformat .= "/";
553    #}
554   
555    my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
556   
557    #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
558   
559    # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
560    if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
561        $filename_urlformat = $1;
562    }
563   
564    return $filename_urlformat;
565}
566
567# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
568# since on Linux it doesn't represent a file separator but an escape char).
569sub filepath_to_url_format
570{
571    my ($filepath) = @_;
572    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
573        # Only need to worry about Windows, as Unix style directories already in url-format
574        # Convert Windows style \ => /
575        $filepath =~ s@\\@/@g;     
576    }
577    return $filepath;
578}
579
580# regex filepaths on windows may include \\ as path separator. Convert \\ to /
581sub filepath_regex_to_url_format
582{
583    my ($filepath) = @_;
584    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
585    # Only need to worry about Windows, as Unix style directories already in url-format
586    # Convert Windows style \\ => /
587    $filepath =~ s@\\\\@/@g;       
588    }
589    return $filepath;
590   
591}
592
593# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
594# and ignores trailing /
595# returns (file, dirs) dirs will be empty if no subdirs
596sub url_fileparse
597{
598    my ($filepath) = @_;
599    # remove trailing /
600    $filepath =~ s@/$@@;
601    if ($filepath !~ m@/@) {
602    return ($filepath, "");
603    }
604    my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
605    return ($file, $dirs);
606   
607}
608
609
610sub filename_within_collection
611{
612    my ($filename) = @_;
613
614    my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
615   
616    if (defined $collect_dir) {
617
618    # if from within GSDLCOLLECTDIR, then remove directory prefix
619    # so source_filename is realative to it.  This is done to aid
620    # portability, i.e. the collection can be moved to somewhere
621    # else on the file system and the archives directory will still
622    # work.  This is needed, for example in the applet version of
623    # GLI where GSDLHOME/collect on the server will be different to
624    # the collect directory of the remove user.  Of course,
625    # GSDLCOLLECTDIR subsequently needs to be put back on to turn
626    # it back into a full pathname.
627
628    $filename = filename_within_directory($filename,$collect_dir);
629    }
630   
631    return $filename;
632}
633
634sub prettyprint_file
635{
636    my ($base_dir,$file,$gli) = @_;
637
638    my $filename_full_path = &FileUtils::filenameConcatenate($base_dir,$file);
639
640    if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
641    require Win32;
642
643    # For some reason base_dir in the form c:/a/b/c
644    # This leads to confusion later on, so turn it back into
645    # the more usual Windows form
646    $base_dir =~ s/\//\\/g;
647    my $long_base_dir = Win32::GetLongPathName($base_dir);
648    my $long_full_path = Win32::GetLongPathName($filename_full_path);
649
650    $file = filename_within_directory($long_full_path,$long_base_dir);
651    $file = encode("utf8",$file) if ($gli);
652    }
653
654    return $file;
655}
656
657
658sub upgrade_if_dos_filename
659{
660    my ($filename_full_path,$and_encode) = @_;
661
662    if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
663    # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
664    # to its long (Windows) version
665    my $long_filename = Win32::GetLongPathName($filename_full_path);
666    if (defined $long_filename) {
667        $filename_full_path = $long_filename;
668    }
669    # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
670    $filename_full_path =~ s/^(.):/\u$1:/;
671    if ((defined $and_encode) && ($and_encode)) {
672        $filename_full_path = encode("utf8",$filename_full_path);
673    }
674    }
675
676    return $filename_full_path;
677}
678
679
680sub downgrade_if_dos_filename
681{
682    my ($filename_full_path) = @_;
683
684    if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
685    require Win32;
686
687    # Ensure the given long Windows filename is in a form that can
688    # be opened by Perl => convert it to a short DOS-like filename
689
690    my $short_filename = Win32::GetShortPathName($filename_full_path);
691    if (defined $short_filename) {
692        $filename_full_path = $short_filename;
693    }
694    # Make sure initial drive letter is lower-case (to fit in
695    # with rest of Greenstone)
696    $filename_full_path =~ s/^(.):/\u$1:/;
697    }
698
699    return $filename_full_path;
700}
701
702sub block_filename
703{
704    my ($block_hash,$filename) = @_;
705   
706    if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
707   
708    # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
709    my $lower_filename = lc($filename);
710    $block_hash->{'file_blocks'}->{$lower_filename} = 1;
711#   my $lower_drive = $filename;
712#   $lower_drive =~ s/^([A-Z]):/\l$1:/i;
713   
714#   my $upper_drive = $filename;
715#   $upper_drive =~ s/^([A-Z]):/\u$1:/i;
716#   
717#   $block_hash->{'file_blocks'}->{$lower_drive} = 1;
718#   $block_hash->{'file_blocks'}->{$upper_drive} = 1;       
719    }
720    else {
721    $block_hash->{'file_blocks'}->{$filename} = 1;
722    }
723}
724
725
726sub filename_is_absolute
727{
728  warnings::warnif("deprecated", "util::filename_is_absolute() is deprecated, using FileUtils::isFilenameAbsolute() instead");
729  return &FileUtils::isFilenameAbsolute(@_);
730}
731
732
733## @method make_absolute()
734#
735#  Ensure the given file path is absolute in respect to the given base path.
736#
737#  @param  $base_dir A string denoting the base path the given dir must be
738#                    absolute to.
739#  @param  $dir The directory to be made absolute as a string. Note that the
740#               dir may already be absolute, in which case it will remain
741#               unchanged.
742#  @return The now absolute form of the directory as a string.
743#
744#  @author John Thompson, DL Consulting Ltd.
745#  @copy 2006 DL Consulting Ltd.
746#
747#used in buildcol.pl, doesn't work for all cases --kjdon
748sub make_absolute {
749   
750    my ($base_dir, $dir) = @_;
751###    print STDERR "dir = $dir\n";
752    $dir =~ s/[\\\/]+/\//g;
753    $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
754    $dir =~ s|^/tmp_mnt||;
755    1 while($dir =~ s|/[^/]*/\.\./|/|g);
756    $dir =~ s|/[.][.]?/|/|g;
757    $dir =~ tr|/|/|s;
758###    print STDERR "dir = $dir\n";
759   
760    return $dir;
761}
762## make_absolute() ##
763
764sub get_dirsep {
765
766    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
767    return "\\";
768    } else {
769    return "\/";
770    }
771}
772
773sub get_os_dirsep {
774
775    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
776    return "\\\\";
777    } else {
778    return "\\\/";
779    }
780}
781
782sub get_re_dirsep {
783
784    return "\\\\|\\\/";
785}
786
787
788sub get_dirsep_tail {
789    my ($filename) = @_;
790   
791    # returns last part of directory or filename
792    # On unix e.g. a/b.d => b.d
793    #              a/b/c => c
794
795    my $dirsep = get_re_dirsep();
796    my @dirs = split (/$dirsep/, $filename);
797    my $tail = pop @dirs;
798
799    # - caused problems under windows
800    #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
801
802    return $tail;
803}
804
805
806# if this is running on windows we want binaries to end in
807# .exe, otherwise they don't have to end in any extension
808sub get_os_exe {
809    return ".exe" if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin"));
810    return "";
811}
812
813
814# test to see whether this is a big or little endian machine
815sub is_little_endian
816{
817    # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
818    # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
819    # Otherwise, it's little endian
820
821    #return 0 if $^O =~ /^darwin$/i;
822    #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
823   
824    # Going back to stating exactly whether the machine is little endian
825    # or big endian, without any special case for Macs. Since for rata it comes
826    # back with little endian and for shuttle with bigendian.
827    return (ord(substr(pack("s",1), 0, 1)) == 1);
828}
829
830
831# will return the collection name if successful, "" otherwise
832sub use_collection {
833    my ($collection, $collectdir) = @_;
834
835    if (!defined $collectdir || $collectdir eq "") {
836    $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect");
837    }
838
839    if (!defined $ENV{'GREENSTONEHOME'}) { # for GS3, would have been defined in use_site_collection, to GSDL3HOME
840     $ENV{'GREENSTONEHOME'} = $ENV{'GSDLHOME'};
841    }
842
843    # get and check the collection
844    if (!defined($collection) || $collection eq "") {
845    if (defined $ENV{'GSDLCOLLECTION'}) {
846        $collection = $ENV{'GSDLCOLLECTION'};
847    } else {
848        print STDOUT "No collection specified\n";
849        return "";
850    }
851    }
852   
853    if ($collection eq "modelcol") {
854    print STDOUT "You can't use modelcol.\n";
855    return "";
856    }
857
858    # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
859    # are defined
860    $ENV{'GSDLCOLLECTION'} = $collection;
861    $ENV{'GSDLCOLLECTHOME'} = $collectdir;
862    $ENV{'GSDLCOLLECTDIR'} = &FileUtils::filenameConcatenate($collectdir, $collection);
863
864    # make sure this collection exists
865    if (!-e $ENV{'GSDLCOLLECTDIR'}) {
866    print STDOUT "Invalid collection ($collection).\n";
867    return "";
868    }
869
870    # everything is ready to go
871    return $collection;
872}
873
874sub get_current_collection_name {
875    return $ENV{'GSDLCOLLECTION'};
876}
877
878
879# will return the collection name if successful, "" otherwise. 
880# Like use_collection (above) but for greenstone 3 (taking account of site level)
881
882sub use_site_collection {
883    my ($site, $collection, $collectdir) = @_;
884
885    if (!defined $collectdir || $collectdir eq "") {
886    die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
887    $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, "sites", $site, "collect");
888    }
889
890    if (defined $ENV{'GSDL3HOME'}) {
891    $ENV{'GREENSTONEHOME'} = $ENV{'GSDL3HOME'};     
892    $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
893    } elsif (defined $ENV{'GSDL3SRCHOME'}) {
894    $ENV{'GREENSTONEHOME'} = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "web"); 
895    $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
896    } else {
897    print STDERR "*** util::use_site_collection(). Warning: Neither GSDL3HOME nor GSDL3SRCHOME set.\n";
898    }
899
900    # collectdir explicitly set by this point (using $site variable if required).
901    # Can call "old" gsdl2 use_collection now.
902
903    return use_collection($collection,$collectdir);
904}
905
906
907
908sub locate_config_file
909{
910    my ($file) = @_;
911
912    my $locations = locate_config_files($file);
913
914    return shift @$locations; # returns undef if 'locations' is empty
915}
916
917
918sub locate_config_files
919{
920    my ($file) = @_;
921
922    my @locations = ();
923
924    if (-e $file) {
925    # Clearly specified (most likely full filename)
926    # No need to hunt in 'etc' directories, return value unchanged
927    push(@locations,$file);
928    }
929    else {
930    # Check for collection specific one before looking in global GSDL 'etc'
931    if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
932        my $test_collect_etc_filename
933        = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"etc", $file);
934       
935        if (-e $test_collect_etc_filename) {
936        push(@locations,$test_collect_etc_filename);
937        }
938    }
939    my $test_main_etc_filename
940        = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"etc", $file);
941    if (-e $test_main_etc_filename) {
942        push(@locations,$test_main_etc_filename);
943    }
944    }
945
946    return \@locations;
947}
948
949
950sub hyperlink_text
951{
952    my ($text) = @_;
953   
954    $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
955    $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
956
957    return $text;
958}
959
960
961# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
962# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
963sub is_dir_empty {
964  warnings::warnif("deprecated", "util::is_dir_empty() is deprecated, using FileUtils::isDirectoryEmpty() instead");
965  return &FileUtils::isDirectoryEmpty(@_);
966}
967
968# Returns the given filename converted using either URL encoding or base64
969# encoding, as specified by $rename_method. If the given filename has no suffix
970# (if it is just the tailname), then $no_suffix should be some defined value.
971# rename_method can be url, none, base64
972sub rename_file {
973    my ($filename, $rename_method, $no_suffix)  = @_;
974
975    if(!$filename) { # undefined or empty string
976    return $filename;
977    }
978
979    if (!$rename_method) {
980    print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
981    # Debugging information
982    # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
983    # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
984    $rename_method = "url";
985    } elsif($rename_method eq "none") {
986    return $filename; # would have already been renamed
987    }
988
989    # No longer replace spaces with underscores, since underscores mess with incremental rebuild
990    ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
991    ###$filename =~ s/ /_/g;
992
993    my ($tailname,$dirname,$suffix);
994    if($no_suffix) { # given a tailname, no suffix
995    ($tailname,$dirname) = File::Basename::fileparse($filename);
996    }
997    else {
998    ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
999    }
1000    if (!$suffix) {
1001    $suffix = "";
1002    }
1003    # This breaks GLI matching extracted metadata to files in Enrich panel, as
1004    # original is eg .JPG while gsdlsourcefilename ends up .jpg
1005    # Not sure why it was done in first place...
1006    #else {
1007    #$suffix = lc($suffix);
1008    #}
1009
1010    if ($rename_method eq "url") {
1011    $tailname = &unicode::url_encode($tailname);
1012    }
1013    elsif ($rename_method eq "base64") {
1014    $tailname = &unicode::base64_encode($tailname);
1015    $tailname =~ s/\s*//sg;      # for some reason it adds spaces not just at end but also in middle
1016    }
1017
1018    $filename = "$tailname$suffix";
1019    $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
1020
1021    return $filename;
1022}
1023
1024
1025# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
1026sub rename_ldb_or_bdb_file {
1027    my ($filename_no_ext) = @_;
1028
1029    my $new_filename = "$filename_no_ext.gdb";
1030    return if (-f $new_filename); # if the file has the right extension, don't need to do anything
1031    # try ldb
1032    my $old_filename = "$filename_no_ext.ldb";
1033   
1034    if (-f $old_filename) {
1035    print STDERR "Renaming $old_filename to $new_filename\n";
1036    rename ($old_filename, $new_filename)
1037        || print STDERR "Rename failed: $!\n";
1038    return;
1039    }
1040    # try bdb
1041    $old_filename = "$filename_no_ext.bdb";
1042    if (-f $old_filename) {
1043    print STDERR "Renaming $old_filename to $new_filename\n";   
1044    rename ($old_filename, $new_filename)
1045        || print STDERR "Rename failed: $!\n";
1046    return;
1047    }
1048}
1049
1050sub os_dir() {
1051   
1052    my $gsdlarch = "";
1053    if(defined $ENV{'GSDLARCH'}) {
1054    $gsdlarch = $ENV{'GSDLARCH'};
1055    }
1056    return $ENV{'GSDLOS'}.$gsdlarch;
1057}
1058
1059# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1060# By default, /greenstone3 for GS3 or /greenstone for GS2.
1061sub get_greenstone_url_prefix() {
1062    # if already set on a previous occasion, just return that
1063    # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1064    return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1065
1066    my ($configfile, $urlprefix, $defaultUrlprefix);
1067    my @propertynames = ();
1068
1069    if($ENV{'GSDL3SRCHOME'}) {
1070    $defaultUrlprefix = "/greenstone3";
1071    $configfile = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1072    push(@propertynames, qw/path\s*\=/);
1073    } else {
1074    $defaultUrlprefix = "/greenstone";
1075    $configfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1076    push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1077    }
1078
1079    $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1080
1081    if(!$urlprefix) { # no values found for URL prefix, use default values
1082    $urlprefix = $defaultUrlprefix;
1083    } else {
1084    #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1085    $urlprefix =~ s/^\///; # remove the starting slash
1086    my @dirs = split(/(\\|\/)/, $urlprefix);
1087    $urlprefix = shift(@dirs);
1088
1089    if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1090        $urlprefix = "/$urlprefix";
1091    }
1092    }
1093
1094    # set for the future
1095    $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1096#    print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1097    return $urlprefix;
1098}
1099
1100
1101# Given a config file (xml or java properties file) and a list/array of regular expressions
1102# that represent property names to match on, this function will return the value for the 1st
1103# matching property name. If the return value is undefined, no matching property was found.
1104sub extract_propvalue_from_file() {
1105    my ($configfile, $propertynames) = @_;
1106
1107    my $value;
1108    unless(open(FIN, "<$configfile")) {
1109    print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1110    return $value; # not initialised
1111    }
1112
1113    # Read the entire file at once, as one single line, then close it
1114    my $filecontents;
1115    {
1116    local $/ = undef;       
1117    $filecontents = <FIN>;
1118    }
1119    close(FIN);
1120
1121    foreach my $regex (@$propertynames) {
1122        ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1123    if($value) {
1124            $value =~ s/^\"//;     # remove any startquotes
1125        $value =~ s/\".*$//;   # remove the 1st endquotes (if any) followed by any xml
1126        last;              # found value for a matching property, break from loop
1127    }
1128    }
1129
1130    return $value;
1131}
1132
1133# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1134# given that perllib is in @INC in order to invoke this subroutine.
1135# Call as follows -- after setting up INC to include perllib and
1136# after setting up GSDLHOME and GSDLOS:
1137#
1138# require util;
1139# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1140#
1141sub setup_greenstone_env() {
1142    my ($GSDLHOME, $GSDLOS) = @_;
1143
1144    #my %env_map = ();
1145    # Get the localised ENV settings of running a localised source setup.bash
1146    # and put it into the ENV here. Need to clear GSDLHOME before running setup
1147    #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1148    my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";     
1149    if (($GSDLOS =~ m/windows/i) && ($^O ne "cygwin"))  {
1150        #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1151        $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1152    }
1153    if (!open(PIN, "$perl_command |")) {
1154        print STDERR ("Unable to execute command: $perl_command. $!\n");
1155    }
1156
1157    while (defined (my $perl_output_line = <PIN>)) {
1158        my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1159        #$env_map{$key}=$value;     
1160        $ENV{$key}=$value;
1161    }
1162    close (PIN);
1163
1164    # If any keys in $ENV don't occur in Greenstone's localised env
1165    # (stored in $env_map), delete those entries from $ENV
1166    #foreach $key (keys %ENV) {
1167    #   if(!defined $env_map{$key}) {
1168    #       print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{'$key'}\n";
1169    #       delete $ENV{$key}; # del $ENV(key, value) pair
1170    #   }
1171    #}
1172    #undef %env_map;
1173}
1174
1175sub get_perl_exec() {   
1176    my $perl_exec = $^X; # may return just "perl"
1177   
1178    if($ENV{'PERLPATH'}) {
1179        # OR: # $perl_exec = &FileUtils::filenameConcatenate($ENV{'PERLPATH'},"perl");
1180        if (($ENV{'GSDLOS'} =~ m/windows/) && ($^O ne "cygwin")) {
1181            $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1182        } else {
1183            $perl_exec = "$ENV{'PERLPATH'}/perl";
1184        }
1185    } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1186        # containing the full path to the current perl executable we're using
1187        $perl_exec = $Config{perlpath}; # configured path for perl
1188        if (!-e $perl_exec) { # may not point to location on this machine
1189            $perl_exec = $^X; # may return just "perl"
1190            if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1191                print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";             
1192            }
1193        }
1194    }
1195   
1196    return $perl_exec;
1197}
1198
1199# returns the path to the java command in the JRE included with GS (if any),
1200# quoted to safeguard any spaces in this path, otherwise a simple java
1201# command is returned which assumes and will try for a system java.
1202sub get_java_command {
1203    my $java = "java";
1204    if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1205                               # after running setup.bat or from GLI which also runs setup.bat
1206    my $java_bin = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"packages","jre","bin");
1207    if(-d $java_bin) {
1208        $java = &FileUtils::filenameConcatenate($java_bin,"java");
1209        $java = "\"".$java."\""; # quoted to preserve spaces in path
1210    }
1211    }
1212    return $java;
1213}
1214
1215
1216# Given the qualified collection name (colgroup/collection),
1217# returns the collection and colgroup parts
1218sub get_collection_parts {
1219    # http://perldoc.perl.org/File/Basename.html
1220    # my($filename, $directories, $suffix) = fileparse($path);
1221    # "$directories contains everything up to and including the last directory separator in the $path
1222    # including the volume (if applicable). The remainder of the $path is the $filename."
1223    #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);   
1224
1225    my $qualified_collection = shift(@_);
1226
1227    # Since activate.pl can be launched from the command-line, including by a user,
1228    # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1229    # Also allow for the accidental inclusion of multiple slashes
1230    my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1231   
1232    if(!defined $collection) {
1233        $collection = $colgroup;
1234        $colgroup = "";
1235    }
1236    return ($collection, $colgroup);
1237}
1238
1239# work out the "collectdir/collection" location
1240sub resolve_collection_dir {
1241    my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1242   
1243    if (defined $ENV{'GSDLCOLLECTDIR'}) { # a predefined collection dir exists
1244        return $ENV{'GSDLCOLLECTDIR'};
1245    }
1246
1247    my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);   
1248   
1249    if (!defined $collect_dir || !$collect_dir) { # if undefined or empty string
1250        $collect_dir = &util::get_working_collect_dir($site);
1251    }
1252
1253    return &FileUtils::filenameConcatenate($collect_dir,$colgroup,$collection);
1254}
1255
1256# work out the full path to "collect" of this greenstone 2/3 installation
1257sub get_working_collect_dir {
1258    my ($site) = @_;   
1259
1260    if (defined $ENV{'GSDLCOLLECTHOME'}) { # a predefined collect dir exists
1261    return $ENV{'GSDLCOLLECTHOME'};
1262    }
1263
1264    if (defined $site && $site) { # site non-empty, so get default collect dir for GS3
1265   
1266    if (defined $ENV{'GSDL3HOME'}) {
1267        return &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'},"sites",$site,"collect"); # web folder
1268    }
1269    elsif (defined $ENV{'GSDL3SRCHOME'}) {
1270        return &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'},"web","sites",$site,"collect");
1271    }
1272    }
1273
1274    elsif (defined $ENV{'SITEHOME'}) {
1275    return &FileUtils::filenameConcatenate($ENV{'SITEHOME'},"collect");
1276    }
1277   
1278    else { # get default collect dir for GS2
1279    return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
1280    }
1281}
1282
1283sub is_abs_path_any_os {
1284    my ($path) = @_;
1285
1286    # We can have filenames in our DBs that were produced on other OS, so this method exists
1287    # to help identify absolute paths in such cases.
1288
1289    return 1 if($path =~ m@^/@); # full paths begin with forward slash on linux/mac
1290    return 1 if($path =~ m@^([a-zA-Z]\:|\\)@); # full paths begin with drive letter colon for Win or \ for volume, http://stackoverflow.com/questions/13011013/get-only-volume-name-from-filepath
1291
1292    return 0;
1293}
1294
1295
1296# This subroutine is for improving portability of Greenstone collections from one OS to another,
1297# to be used to convert absolute paths going into db files into paths with placeholders instead.
1298# This sub works with util::get_common_gs_paths and takes a path to a greenstone file and, if it's
1299# an absolute path, then it will replace the longest matching greenstone-path prefix of the given
1300# path with a placeholder to match.
1301# The Greenstone-path prefixes that can be matched are the following common Greenstone paths:
1302# the path to the current (specific) collection, the path to the general GS collect directory,
1303# the path to the site directory if GS3, else the path to the GSDLHOME/GSDL3HOME folder.
1304# The longest matching prefix will be replaced with the equivalent placeholder:
1305# @THISCOLLECTPATH@, else @COLLECTHOME@, else @SITEHOME@, else @GSDLHOME@.
1306sub abspath_to_placeholders {
1307    my $path = shift(@_); # path to convert from absolute to one with placeholders
1308    my $opt_long_or_short_winfilenames = shift(@_) || "short"; # whether we want to force use of long file names even on windows, default uses short
1309
1310    return $path unless is_abs_path_any_os($path); # path is relative
1311
1312    if ($opt_long_or_short_winfilenames eq "long") {
1313    $path = &util::upgrade_if_dos_filename($path); # will only do something on windows
1314    }
1315   
1316    # now we know we're dealing with absolute paths and have to replace gs prefixes with placeholders
1317    my @gs_paths = ($ENV{'GSDLCOLLECTDIR'}, $ENV{'GSDLCOLLECTHOME'}, $ENV{'SITEHOME'}, $ENV{'GREENSTONEHOME'}); # list in this order: from longest to shortest path
1318
1319    my %placeholder_map = ($ENV{'GREENSTONEHOME'} => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1320               $ENV{'GSDLCOLLECTHOME'} => '@COLLECTHOME@',
1321               $ENV{'GSDLCOLLECTDIR'} => '@THISCOLLECTPATH@'
1322    );
1323    $placeholder_map{$ENV{'SITEHOME'}} = '@SITEHOME@' if defined $ENV{'SITEHOME'};
1324
1325    $path = &util::_abspath_to_placeholders($path, \@gs_paths, \%placeholder_map);
1326
1327    if ($ENV{'GSDLOS'} =~ /^windows$/i && $opt_long_or_short_winfilenames eq "short") {
1328    # for windows need to look for matches on short file names too
1329    # matched paths are again to be replaced with the usual placeholders
1330
1331    my $gsdlcollectdir = &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTDIR'});
1332    my $gsdlcollecthome = &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTHOME'});
1333    my $sitehome = (defined $ENV{'SITEHOME'}) ? &util::downgrade_if_dos_filename($ENV{'SITEHOME'}) : undef;
1334    my $greenstonehome =  &util::downgrade_if_dos_filename($ENV{'GREENSTONEHOME'});
1335
1336    @gs_paths = ($gsdlcollectdir, $gsdlcollecthome, $sitehome, $greenstonehome); # order matters
1337
1338    %placeholder_map = ($greenstonehome => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1339                $gsdlcollecthome => '@COLLECTHOME@',
1340                $gsdlcollectdir => '@THISCOLLECTPATH@'
1341        );
1342    $placeholder_map{$sitehome} = '@SITEHOME@' if defined $sitehome;
1343
1344    $path = &util::_abspath_to_placeholders($path, \@gs_paths, \%placeholder_map);
1345    }
1346
1347    return $path;
1348}
1349
1350sub _abspath_to_placeholders {
1351    my ($path, $gs_paths_ref, $placeholder_map_ref) = @_;
1352
1353    # The sequence of elements in @gs_paths matters   
1354    # Need to loop starting from the *longest* matching path (the path to the specific collection)
1355    # to the shortest matching path (the path to gsdlhome/gsdl3home folder):
1356
1357    foreach my $gs_path (@$gs_paths_ref) {
1358    next if(!defined $gs_path); # site undefined for GS2
1359
1360    my $re_path =  &util::filename_to_regex($gs_path); # escape for regex
1361
1362    if($path =~ m/^$re_path/i) { # case sensitive or not for OS?
1363
1364        my $placeholder = $placeholder_map_ref->{$gs_path}; # get the placeholder to replace the matched path with
1365
1366        $path =~ s/^$re_path/$placeholder/; #case sensitive or not?
1367        #$path =~ s/^[\\\/]//; # remove gs_path's trailing separator left behind at the start of the path
1368        last; # done
1369    }
1370    }
1371   
1372    return $path;
1373}
1374
1375# Function that does the reverse of the util::abspath_to_placeholders subroutine
1376# Once again, call this with the values returned from util::get_common_gs_paths
1377sub placeholders_to_abspath {
1378    my $path = shift(@_); # path that can contain placeholders to convert to resolved absolute path
1379    my $opt_long_or_short_winfilenames = shift(@_) || "short"; # whether we want to force use of long file names even on windows, default uses short
1380
1381    return $path if($path !~ m/@/); # path contains no placeholders
1382   
1383    # replace placeholders with gs prefixes
1384    my @placeholders = ('@THISCOLLECTPATH@', '@COLLECTHOME@', '@SITEHOME@', '@GSDLHOME@'); # order of paths not crucial in this case,
1385                       # but listed here from longest to shortest once placeholders are have been resolved
1386
1387    # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1388    my %placeholder_to_gspath_map;
1389    if ($ENV{'GSDLOS'} =~ /^windows$/i && $opt_long_or_short_winfilenames eq "short") {
1390    # always replace placeholders with short file names of the absolute paths on windows?
1391    %placeholder_to_gspath_map = ('@GSDLHOME@' => &util::downgrade_if_dos_filename($ENV{'GREENSTONEHOME'}),
1392                     '@COLLECTHOME@' => &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTHOME'}),
1393                     '@THISCOLLECTPATH@' => &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTDIR'})
1394    );
1395    $placeholder_to_gspath_map{'@SITEHOME@'} =  &util::downgrade_if_dos_filename($ENV{'SITEHOME'}) if defined $ENV{'SITEHOME'};
1396    } else {
1397    %placeholder_to_gspath_map = ('@GSDLHOME@' => $ENV{'GREENSTONEHOME'},
1398                      '@SITEHOME@' => $ENV{'SITEHOME'}, # can be undef
1399                      '@COLLECTHOME@' => $ENV{'GSDLCOLLECTHOME'},
1400                      '@THISCOLLECTPATH@' => $ENV{'GSDLCOLLECTDIR'}
1401        ); # $placeholder_to_gspath_map{'@SITEHOME@'} = $ENV{'SITEHOME'} if defined $ENV{'SITEHOME'};
1402    }
1403
1404    foreach my $placeholder (@placeholders) {
1405    my $gs_path = $placeholder_to_gspath_map{$placeholder};
1406
1407    next if(!defined $gs_path); # sitehome for GS2 is undefined
1408
1409    if($path =~ m/^$placeholder/) {
1410        $path =~ s/^$placeholder/$gs_path/;
1411        last; # done
1412    }
1413    }
1414   
1415    return $path;
1416}
1417
1418# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1419# a directory containing sequentially numbered images.
1420sub create_itemfile
1421{
1422    my ($output_dir, $convert_basename, $convert_to) = @_;
1423    my $page_num = "";
1424
1425    opendir(DIR, $output_dir) || die "can't opendir $output_dir: $!";
1426    my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1427    closedir DIR;
1428
1429    # Sort files in the directory by page_num
1430    sub page_number {
1431    my ($dir) = @_;
1432    my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
1433#   my ($pagenum) =($dir =~ m/(\d+)(\.(jpg|gif|png))?$/i); # this works but is not as safe/strict about input filepatterns as the above
1434
1435    $pagenum = 1 unless defined $pagenum;
1436    return $pagenum;
1437    }
1438
1439    # sort the files in the directory in the order of page_num rather than lexically.
1440    @dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1441
1442    # work out if the numbering of the now sorted image files starts at 0 or not
1443    # by checking the number of the first _image_ file (skipping item files)
1444    my $starts_at_0 = 0;
1445    my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1446    if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1447    $starts_at_0 = 1;
1448    }
1449
1450    my $item_file = &FileUtils::filenameConcatenate($output_dir, $convert_basename.".item");
1451    my $item_fh;
1452    &FileUtils::openFileHandle($item_file, 'w', \$item_fh);
1453    print $item_fh "<PagedDocument>\n";
1454
1455    foreach my $file (@dir_files){
1456    if ($file !~ /\.item/i){
1457        $page_num = page_number($file);
1458        $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1459        print $item_fh "   <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1460    }
1461    }
1462
1463    print $item_fh "</PagedDocument>\n";
1464    &FileUtils::closeFileHandle($item_file, \$item_fh);
1465    return $item_file;
1466}
1467
1468
1469## @function augmentINC()
1470#
1471#  Prepend a path (if it exists) onto INC but only if it isn't already in INC
1472#  @param $new_path The path to add
1473#  @author jmt12
1474#
1475sub augmentINC
1476{
1477  my ($new_path) = @_;
1478  my $did_add_path = 0;
1479  # might need to be replaced with FileUtils::directoryExists() call eventually
1480  if (-d $new_path)
1481  {
1482    my $did_find_path = 0;
1483    foreach my $existing_path (@INC)
1484    {
1485      if ($existing_path eq $new_path)
1486      {
1487        $did_find_path = 1;
1488        last;
1489      }
1490    }
1491    if (!$did_find_path)
1492    {
1493      unshift(@INC, $new_path);
1494      $did_add_path = 1;
1495    }
1496  }
1497  return $did_add_path;
1498}
1499## augmentINC()
1500
1501
15021;
Note: See TracBrowser for help on using the browser.