source: main/trunk/greenstone2/perllib/util.pm@ 28375

Last change on this file since 28375 was 28375, checked in by davidb, 11 years ago

A set of changes to help Greenstone building code (perl) run under Cygwin. The test is designed to be mutually to when run natively on Windows. In effect the refined test is saying: if you're windows but not cygwin then do as you used to do for Windows, otherwise go with Unix (as Cygwin is effectively giving you a Unix like operating system to run in)

  • Property svn:keywords set to Author Date Id Revision
File size: 50.1 KB
Line 
1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package util;
27
28use strict;
29use FileUtils;
30
31use Encode;
32use File::Copy;
33use File::Basename;
34# Config for getting the perlpath in the recommended way, though it uses paths that are
35# hard-coded into the Config file that's generated upon configuring and compiling perl.
36# $^X works better in some cases to return the path to perl used to launch the script,
37# but if launched with plain "perl" (no full-path), that will be just what it returns.
38use Config;
39# New module for file related utility functions - intended as a
40# placeholder for an extension that allows a variety of different
41# filesystems (FTP, HTTP, SAMBA, WEBDav, HDFS etc)
42use FileUtils;
43
44if ($ENV{'GSDLOS'} =~ /^windows$/i) {
45 require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
46}
47
48# removes files (but not directories)
49sub rm {
50 warnings::warnif("deprecated", "util::rm() is deprecated, using FileUtils::removeFiles() instead");
51 return &FileUtils::removeFiles(@_);
52}
53
54# recursive removal
55sub filtered_rm_r {
56 warnings::warnif("deprecated", "util::filtered_rm_r() is deprecated, using FileUtils::removeFilesFiltered() instead");
57 return &FileUtils::removeFilesFiltered(@_);
58}
59
60# recursive removal
61sub rm_r {
62 warnings::warnif("deprecated", "util::rm_r() is deprecated, using FileUtils::removeFilesRecursive() instead");
63 return &FileUtils::removeFilesRecursive(@_);
64}
65
66# moves a file or a group of files
67sub mv {
68 warnings::warnif("deprecated", "util::mv() is deprecated, using FileUtils::moveFiles() instead");
69 return &FileUtils::moveFiles(@_);
70}
71
72# Move the contents of source directory into target directory
73# (as opposed to merely replacing target dir with the src dir)
74# This can overwrite any files with duplicate names in the target
75# but other files and folders in the target will continue to exist
76sub mv_dir_contents {
77 warnings::warnif("deprecated", "util::mv_dir_contents() is deprecated, using FileUtils::moveDirectoryContents() instead");
78 return &FileUtils::moveDirectoryContents(@_);
79}
80
81# copies a file or a group of files
82sub cp {
83 warnings::warnif("deprecated", "util::cp() is deprecated, using FileUtils::copyFiles() instead");
84 return &FileUtils::copyFiles(@_);
85}
86
87# recursively copies a file or group of files
88# syntax: cp_r (sourcefiles, destination directory)
89# destination must be a directory - to copy one file to
90# another use cp instead
91sub cp_r {
92 warnings::warnif("deprecated", "util::cp_r() is deprecated, using FileUtils::copyFilesrecursive() instead");
93 return &FileUtils::copyFilesRecursive(@_);
94}
95
96# recursively copies a file or group of files
97# syntax: cp_r (sourcefiles, destination directory)
98# destination must be a directory - to copy one file to
99# another use cp instead
100sub cp_r_nosvn {
101 warnings::warnif("deprecated", "util::cp_r_nosvn() is deprecated, using FileUtils::copyFilesRecursiveNoSVN() instead");
102 return &FileUtils::copyFilesRecursiveNoSVN(@_);
103}
104
105# copies a directory and its contents, excluding subdirectories, into a new directory
106sub cp_r_toplevel {
107 warnings::warnif("deprecated", "util::cp_r_toplevel() is deprecated, using FileUtils::recursiveCopyTopLevel() instead");
108 return &FileUtils::recursiveCopyTopLevel(@_);
109}
110
111sub mk_dir {
112 warnings::warnif("deprecated", "util::mk_dir() is deprecated, using FileUtils::makeDirectory() instead");
113 return &FileUtils::makeDirectory(@_);
114}
115
116# in case anyone cares - I did some testing (using perls Benchmark module)
117# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
118# slightly faster (surprisingly) - Stefan.
119sub mk_all_dir {
120 warnings::warnif("deprecated", "util::mk_all_dir() is deprecated, using FileUtils::makeAllDirectories() instead");
121 return &FileUtils::makeAllDirectories(@_);
122}
123
124# make hard link to file if supported by OS, otherwise copy the file
125sub hard_link {
126 warnings::warnif("deprecated", "util::hard_link() is deprecated, using FileUtils::hardLink() instead");
127 return &FileUtils::hardLink(@_);
128}
129
130# make soft link to file if supported by OS, otherwise copy file
131sub soft_link {
132 warnings::warnif("deprecated", "util::soft_link() is deprecated, using FileUtils::softLink() instead");
133 return &FileUtils::softLink(@_);
134}
135
136# Primarily for filenames generated by processing
137# content of HTML files (which are mapped to UTF-8 internally)
138#
139# To turn this into an octet string that really exists on the file
140# system:
141# 1. don't need to do anything special for Unix-based systems
142# (as underlying file system is byte-code)
143# 2. need to map to short DOS filenames for Windows
144
145sub utf8_to_real_filename
146{
147 my ($utf8_filename) = @_;
148
149 my $real_filename;
150
151 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
152 require Win32;
153
154 print STDERR "***** utf8 filename = $utf8_filename\n\n\n";
155
156 my $unicode_filename = decode("utf8",$utf8_filename);
157 $real_filename = Win32::GetShortPathName($unicode_filename);
158 }
159 else {
160 $real_filename = $utf8_filename;
161 }
162
163 return $real_filename;
164}
165
166sub fd_exists {
167 warnings::warnif("deprecated", "util::fd_exists() is deprecated, using FileUtils::fileTest() instead");
168 return &FileUtils::fileTest(@_);
169}
170
171sub file_exists {
172 warnings::warnif("deprecated", "util::file_exists() is deprecated, using FileUtils::fileExists() instead");
173 return &FileUtils::fileExists(@_);
174}
175
176sub dir_exists {
177 warnings::warnif("deprecated", "util::dir_exists() is deprecated, using FileUtils::directoryExists() instead");
178 return &FileUtils::directoryExists(@_);
179}
180
181# updates a copy of a directory in some other part of the filesystem
182# verbosity settings are: 0=low, 1=normal, 2=high
183# both $fromdir and $todir should be absolute paths
184sub cachedir {
185 warnings::warnif("deprecated", "util::cachedir() is deprecated, using FileUtils::synchronizeDirectories() instead");
186 return &FileUtils::synchronizeDirectories(@_);
187}
188
189# this function returns -1 if either file is not found
190# assumes that $file1 and $file2 are absolute file names or
191# in the current directory
192# $file2 is allowed to be newer than $file1
193sub differentfiles {
194 warnings::warnif("deprecated", "util::differentfiles() is deprecated, using FileUtils::differentFiles() instead");
195 return &FileUtils::differentFiles(@_);
196}
197
198
199sub get_tmp_filename
200{
201 my $file_ext = shift(@_) || undef;
202
203 my $opt_dot_file_ext = "";
204 if (defined $file_ext) {
205 if ($file_ext !~ m/\./) {
206 # no dot, so needs one added in at start
207 $opt_dot_file_ext = ".$file_ext"
208 }
209 else {
210 # allow for "extensions" such as _metadata.txt to be handled
211 # gracefully
212 $opt_dot_file_ext = $file_ext;
213 }
214 }
215
216 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
217 &FileUtils::makeAllDirectories ($tmpdir) unless -e $tmpdir;
218
219 my $count = 1000;
220 my $rand = int(rand $count);
221 my $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
222
223 while (-e $full_tmp_filename) {
224 $rand = int(rand $count);
225 $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
226 $count++;
227 }
228
229 return $full_tmp_filename;
230}
231
232# These 2 are "static" variables used by the get_timestamped_tmp_folder() subroutine below and
233# belong with that function. They help ensure the timestamped tmp folders generated are unique.
234my $previous_timestamp = undef;
235my $previous_timestamp_f = 0; # frequency
236
237sub get_timestamped_tmp_folder
238{
239
240 my $tmp_dirname;
241 if(defined $ENV{'GSDLCOLLECTDIR'}) {
242 $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
243 } elsif(defined $ENV{'GSDLHOME'}) {
244 $tmp_dirname = $ENV{'GSDLHOME'};
245 } else {
246 return undef;
247 }
248
249 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "tmp");
250 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
251
252 # add the timestamp into the path otherwise we can run into problems
253 # if documents have the same name
254 my $timestamp = time;
255
256 if (!defined $previous_timestamp || ($timestamp > $previous_timestamp)) {
257 $previous_timestamp_f = 0;
258 $previous_timestamp = $timestamp;
259 } else {
260 $previous_timestamp_f++;
261 }
262
263 my $time_tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, $timestamp);
264 $tmp_dirname = $time_tmp_dirname;
265 my $i = $previous_timestamp_f;
266
267 if($previous_timestamp_f > 0) {
268 $tmp_dirname = $time_tmp_dirname."_".$i;
269 $i++;
270 }
271 while (-e $tmp_dirname) {
272 $tmp_dirname = $time_tmp_dirname."_".$i;
273 $i++;
274 }
275 &FileUtils::makeDirectory($tmp_dirname);
276
277 return $tmp_dirname;
278}
279
280sub get_timestamped_tmp_filename_in_collection
281{
282
283 my ($input_filename, $output_ext) = @_;
284 # derive tmp filename from input filename
285 my ($tailname, $dirname, $suffix)
286 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
287
288 # softlink to collection tmp dir
289 my $tmp_dirname = &util::get_timestamped_tmp_folder();
290 $tmp_dirname = $dirname unless defined $tmp_dirname;
291
292 # following two steps copied from ConvertBinaryFile
293 # do we need them?? can't use them as is, as they use plugin methods.
294
295 #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
296
297 # URLEncode this since htmls with images where the html filename is utf8 don't seem
298 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
299 # files on the filesystem.
300 #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
301 if (defined $output_ext) {
302 $output_ext = ".$output_ext"; # add the dot
303 } else {
304 $output_ext = $suffix;
305 }
306 $output_ext= lc($output_ext);
307 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$output_ext");
308
309 return $tmp_filename;
310}
311
312sub get_toplevel_tmp_dir
313{
314 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
315}
316
317
318sub filename_to_regex {
319 my $filename = shift (@_);
320
321 # need to make single backslashes double so that regex works
322 $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);
323
324 # note that the first part of a substitution is a regex, so RE chars need to be escaped,
325 # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
326 $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
327 $filename =~ s@\(@\\(@g; # escape brackets
328 $filename =~ s@\)@\\)@g; # escape brackets
329 $filename =~ s@\[@\\[@g; # escape brackets
330 $filename =~ s@\]@\\]@g; # escape brackets
331
332 return $filename;
333}
334
335sub unregex_filename {
336 my $filename = shift (@_);
337
338 # need to put doubled backslashes for regex back to single
339 $filename =~ s/\\\./\./g; # remove RE syntax for .
340 $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
341 $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
342 $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
343 $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
344
345 # \\ goes to \
346 # This is the last step in reverse mirroring the order of steps in filename_to_regex()
347 $filename =~ s/\\\\/\\/g; # remove RE syntax for \
348 return $filename;
349}
350
351sub filename_cat {
352 # I've disabled this warning for now, as every Greenstone perl
353 # script seems to make use of this function and so you drown in a
354 # sea of deprecated warnings [jmt12]
355# warnings::warnif("deprecated", "util::filename_cat() is deprecated, using FileUtils::filenameConcatenate() instead");
356 return &FileUtils::filenameConcatenate(@_);
357}
358
359
360sub pathname_cat {
361 my $first_path = shift(@_);
362 my (@pathnames) = @_;
363
364 # If first_path is not null or empty, then add it back into the list
365 if (defined $first_path && $first_path =~ /\S/) {
366 unshift(@pathnames, $first_path);
367 }
368
369 my $join_char;
370 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
371 $join_char = ";";
372 } else {
373 $join_char = ":";
374 }
375
376 my $pathname = join($join_char, @pathnames);
377
378 # remove duplicate slashes
379 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
380 $pathname =~ s/[\\\/]+/\\/g;
381 } else {
382 $pathname =~ s/[\/]+/\//g;
383 # DB: want a pathname abc\de.html to remain like this
384 }
385
386 return $pathname;
387}
388
389
390sub tidy_up_oid {
391 my ($OID) = @_;
392 if ($OID =~ /\./) {
393 print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
394 $OID =~ s/\.//g; #remove any periods
395 }
396 if ($OID =~ /^\s.*\s$/) {
397 print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
398 # remove starting and trailing whitespace
399 $OID =~ s/^\s+//;
400 $OID =~ s/\s+$//;
401 }
402 if ($OID =~ /^[\d]*$/) {
403 print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
404 $OID = "D" . $OID;
405 }
406
407 return $OID;
408}
409
410sub envvar_prepend {
411 my ($var,$val) = @_;
412
413 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
414## my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
415
416 # Rewritten above to make ":" the default (Windows is the special
417 # case, anything else 'unusual' such as Solaris etc is Unix)
418 my $pathsep = (defined $ENV{'GSDLOS'} && (($ENV{'GSDLOS'} =~ m/windows/) && ($^O ne "cygwin"))) ? ";" : ":";
419
420 # do not prepend any value/path that's already in the environment variable
421
422 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
423 if (!defined($ENV{$var})) {
424 $ENV{$var} = "$val";
425 }
426 elsif($ENV{$var} !~ m/$escaped_val/) {
427 $ENV{$var} = "$val".$pathsep.$ENV{$var};
428 }
429}
430
431sub envvar_append {
432 my ($var,$val) = @_;
433
434 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
435 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
436
437 # do not append any value/path that's already in the environment variable
438
439 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
440 if (!defined($ENV{$var})) {
441 $ENV{$var} = "$val";
442 }
443 elsif($ENV{$var} !~ m/$escaped_val/) {
444 $ENV{$var} = $ENV{$var}.$pathsep."$val";
445 }
446}
447
448
449# splits a filename into a prefix and a tail extension using the tail_re, or
450# if that fails, splits on the file_extension . (dot)
451sub get_prefix_and_tail_by_regex {
452
453 my ($filename,$tail_re) = @_;
454
455 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
456 if ((!defined $file_prefix) || (!defined $file_ext)) {
457 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
458 }
459
460 return ($file_prefix,$file_ext);
461}
462
463# get full path and file only path from a base_dir (which may be empty) and
464# file (which may contain directories)
465sub get_full_filenames {
466 my ($base_dir, $file) = @_;
467
468# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
469# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
470# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
471
472
473 my $filename_full_path = $file;
474 # add on directory if present
475 $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\S/;
476
477 my $filename_no_path = $file;
478
479 # remove directory if present
480 $filename_no_path =~ s/^.*[\/\\]//;
481 return ($filename_full_path, $filename_no_path);
482}
483
484# returns the path of a file without the filename -- ie. the directory the file is in
485sub filename_head {
486 my $filename = shift(@_);
487
488 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
489 $filename =~ s/[^\\\\]*$//;
490 }
491 else {
492 $filename =~ s/[^\\\/]*$//;
493 }
494
495 return $filename;
496}
497
498
499
500# returns 1 if filename1 and filename2 point to the same
501# file or directory
502sub filenames_equal {
503 my ($filename1, $filename2) = @_;
504
505 # use filename_cat to clean up trailing slashes and
506 # multiple slashes
507 $filename1 = &FileUtils::filenameConcatenate($filename1);
508 $filename2 = &FileUtils::filenameConcatenate($filename2);
509
510 # filenames not case sensitive on windows
511 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
512 $filename1 =~ tr/[A-Z]/[a-z]/;
513 $filename2 =~ tr/[A-Z]/[a-z]/;
514 }
515 return 1 if $filename1 eq $filename2;
516 return 0;
517}
518
519# If filename is relative to within_dir, returns the relative path of filename to that directory
520# with slashes in the filename returned as they were in the original (absolute) filename.
521sub filename_within_directory
522{
523 my ($filename,$within_dir) = @_;
524
525 if ($within_dir !~ m/[\/\\]$/) {
526 my $dirsep = &util::get_dirsep();
527 $within_dir .= $dirsep;
528 }
529
530 $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets
531 if ($filename =~ m/^$within_dir(.*)$/) {
532 $filename = $1;
533 }
534
535 return $filename;
536}
537
538# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
539# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
540# The subpath returned will also be a URL type filename.
541sub filename_within_directory_url_format
542{
543 my ($filename,$within_dir) = @_;
544
545 # convert parameters only to / slashes if Windows
546
547 my $filename_urlformat = &filepath_to_url_format($filename);
548 my $within_dir_urlformat = &filepath_to_url_format($within_dir);
549
550 #if ($within_dir_urlformat !~ m/\/$/) {
551 # make sure directory ends with a slash
552 #$within_dir_urlformat .= "/";
553 #}
554
555 my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
556
557 #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
558
559 # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
560 if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
561 $filename_urlformat = $1;
562 }
563
564 return $filename_urlformat;
565}
566
567# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
568# since on Linux it doesn't represent a file separator but an escape char).
569sub filepath_to_url_format
570{
571 my ($filepath) = @_;
572 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
573 # Only need to worry about Windows, as Unix style directories already in url-format
574 # Convert Windows style \ => /
575 $filepath =~ s@\\@/@g;
576 }
577 return $filepath;
578}
579
580# regex filepaths on windows may include \\ as path separator. Convert \\ to /
581sub filepath_regex_to_url_format
582{
583 my ($filepath) = @_;
584 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
585 # Only need to worry about Windows, as Unix style directories already in url-format
586 # Convert Windows style \\ => /
587 $filepath =~ s@\\\\@/@g;
588 }
589 return $filepath;
590
591}
592
593# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
594# and ignores trailing /
595# returns (file, dirs) dirs will be empty if no subdirs
596sub url_fileparse
597{
598 my ($filepath) = @_;
599 # remove trailing /
600 $filepath =~ s@/$@@;
601 if ($filepath !~ m@/@) {
602 return ($filepath, "");
603 }
604 my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
605 return ($file, $dirs);
606
607}
608
609
610sub filename_within_collection
611{
612 my ($filename) = @_;
613
614 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
615
616 if (defined $collect_dir) {
617
618 # if from within GSDLCOLLECTDIR, then remove directory prefix
619 # so source_filename is realative to it. This is done to aid
620 # portability, i.e. the collection can be moved to somewhere
621 # else on the file system and the archives directory will still
622 # work. This is needed, for example in the applet version of
623 # GLI where GSDLHOME/collect on the server will be different to
624 # the collect directory of the remove user. Of course,
625 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
626 # it back into a full pathname.
627
628 $filename = filename_within_directory($filename,$collect_dir);
629 }
630
631 return $filename;
632}
633
634sub prettyprint_file
635{
636 my ($base_dir,$file,$gli) = @_;
637
638 my $filename_full_path = &FileUtils::filenameConcatenate($base_dir,$file);
639
640 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
641 require Win32;
642
643 # For some reason base_dir in the form c:/a/b/c
644 # This leads to confusion later on, so turn it back into
645 # the more usual Windows form
646 $base_dir =~ s/\//\\/g;
647 my $long_base_dir = Win32::GetLongPathName($base_dir);
648 my $long_full_path = Win32::GetLongPathName($filename_full_path);
649
650 $file = filename_within_directory($long_full_path,$long_base_dir);
651 $file = encode("utf8",$file) if ($gli);
652 }
653
654 return $file;
655}
656
657
658sub upgrade_if_dos_filename
659{
660 my ($filename_full_path,$and_encode) = @_;
661
662 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
663 # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
664 # to its long (Windows) version
665 my $long_filename = Win32::GetLongPathName($filename_full_path);
666 if (defined $long_filename) {
667 $filename_full_path = $long_filename;
668 }
669 # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
670 $filename_full_path =~ s/^(.):/\u$1:/;
671 if ((defined $and_encode) && ($and_encode)) {
672 $filename_full_path = encode("utf8",$filename_full_path);
673 }
674 }
675
676 return $filename_full_path;
677}
678
679
680sub downgrade_if_dos_filename
681{
682 my ($filename_full_path) = @_;
683
684 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
685 require Win32;
686
687 # Ensure the given long Windows filename is in a form that can
688 # be opened by Perl => convert it to a short DOS-like filename
689
690 my $short_filename = Win32::GetShortPathName($filename_full_path);
691 if (defined $short_filename) {
692 $filename_full_path = $short_filename;
693 }
694 # Make sure initial drive letter is lower-case (to fit in
695 # with rest of Greenstone)
696 $filename_full_path =~ s/^(.):/\u$1:/;
697 }
698
699 return $filename_full_path;
700}
701
702sub block_filename
703{
704 my ($block_hash,$filename) = @_;
705
706 if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
707
708 # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
709 my $lower_filename = lc($filename);
710 $block_hash->{'file_blocks'}->{$lower_filename} = 1;
711# my $lower_drive = $filename;
712# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
713
714# my $upper_drive = $filename;
715# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
716#
717# $block_hash->{'file_blocks'}->{$lower_drive} = 1;
718# $block_hash->{'file_blocks'}->{$upper_drive} = 1;
719 }
720 else {
721 $block_hash->{'file_blocks'}->{$filename} = 1;
722 }
723}
724
725
726sub filename_is_absolute
727{
728 warnings::warnif("deprecated", "util::filename_is_absolute() is deprecated, using FileUtils::isFilenameAbsolute() instead");
729 return &FileUtils::isFilenameAbsolute(@_);
730}
731
732
733## @method make_absolute()
734#
735# Ensure the given file path is absolute in respect to the given base path.
736#
737# @param $base_dir A string denoting the base path the given dir must be
738# absolute to.
739# @param $dir The directory to be made absolute as a string. Note that the
740# dir may already be absolute, in which case it will remain
741# unchanged.
742# @return The now absolute form of the directory as a string.
743#
744# @author John Thompson, DL Consulting Ltd.
745# @copy 2006 DL Consulting Ltd.
746#
747#used in buildcol.pl, doesn't work for all cases --kjdon
748sub make_absolute {
749
750 my ($base_dir, $dir) = @_;
751### print STDERR "dir = $dir\n";
752 $dir =~ s/[\\\/]+/\//g;
753 $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
754 $dir =~ s|^/tmp_mnt||;
755 1 while($dir =~ s|/[^/]*/\.\./|/|g);
756 $dir =~ s|/[.][.]?/|/|g;
757 $dir =~ tr|/|/|s;
758### print STDERR "dir = $dir\n";
759
760 return $dir;
761}
762## make_absolute() ##
763
764sub get_dirsep {
765
766 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
767 return "\\";
768 } else {
769 return "\/";
770 }
771}
772
773sub get_os_dirsep {
774
775 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
776 return "\\\\";
777 } else {
778 return "\\\/";
779 }
780}
781
782sub get_re_dirsep {
783
784 return "\\\\|\\\/";
785}
786
787
788sub get_dirsep_tail {
789 my ($filename) = @_;
790
791 # returns last part of directory or filename
792 # On unix e.g. a/b.d => b.d
793 # a/b/c => c
794
795 my $dirsep = get_re_dirsep();
796 my @dirs = split (/$dirsep/, $filename);
797 my $tail = pop @dirs;
798
799 # - caused problems under windows
800 #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
801
802 return $tail;
803}
804
805
806# if this is running on windows we want binaries to end in
807# .exe, otherwise they don't have to end in any extension
808sub get_os_exe {
809 return ".exe" if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin"));
810 return "";
811}
812
813
814# test to see whether this is a big or little endian machine
815sub is_little_endian
816{
817 # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
818 # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
819 # Otherwise, it's little endian
820
821 #return 0 if $^O =~ /^darwin$/i;
822 #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
823
824 # Going back to stating exactly whether the machine is little endian
825 # or big endian, without any special case for Macs. Since for rata it comes
826 # back with little endian and for shuttle with bigendian.
827 return (ord(substr(pack("s",1), 0, 1)) == 1);
828}
829
830
831# will return the collection name if successful, "" otherwise
832sub use_collection {
833 my ($collection, $collectdir) = @_;
834
835 if (!defined $collectdir || $collectdir eq "") {
836 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect");
837 }
838
839 if (!defined $ENV{'GREENSTONEHOME'}) { # for GS3, would have been defined in use_site_collection, to GSDL3HOME
840 $ENV{'GREENSTONEHOME'} = $ENV{'GSDLHOME'};
841 }
842
843 # get and check the collection
844 if (!defined($collection) || $collection eq "") {
845 if (defined $ENV{'GSDLCOLLECTION'}) {
846 $collection = $ENV{'GSDLCOLLECTION'};
847 } else {
848 print STDOUT "No collection specified\n";
849 return "";
850 }
851 }
852
853 if ($collection eq "modelcol") {
854 print STDOUT "You can't use modelcol.\n";
855 return "";
856 }
857
858 # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
859 # are defined
860 $ENV{'GSDLCOLLECTION'} = $collection;
861 $ENV{'GSDLCOLLECTHOME'} = $collectdir;
862 $ENV{'GSDLCOLLECTDIR'} = &FileUtils::filenameConcatenate($collectdir, $collection);
863
864 # make sure this collection exists
865 if (!-e $ENV{'GSDLCOLLECTDIR'}) {
866 print STDOUT "Invalid collection ($collection).\n";
867 return "";
868 }
869
870 # everything is ready to go
871 return $collection;
872}
873
874sub get_current_collection_name {
875 return $ENV{'GSDLCOLLECTION'};
876}
877
878
879# will return the collection name if successful, "" otherwise.
880# Like use_collection (above) but for greenstone 3 (taking account of site level)
881
882sub use_site_collection {
883 my ($site, $collection, $collectdir) = @_;
884
885 if (!defined $collectdir || $collectdir eq "") {
886 die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
887 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, "sites", $site, "collect");
888 }
889
890 if (defined $ENV{'GSDL3HOME'}) {
891 $ENV{'GREENSTONEHOME'} = $ENV{'GSDL3HOME'};
892 $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
893 } elsif (defined $ENV{'GSDL3SRCHOME'}) {
894 $ENV{'GREENSTONEHOME'} = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "web");
895 $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
896 } else {
897 print STDERR "*** util::use_site_collection(). Warning: Neither GSDL3HOME nor GSDL3SRCHOME set.\n";
898 }
899
900 # collectdir explicitly set by this point (using $site variable if required).
901 # Can call "old" gsdl2 use_collection now.
902
903 return use_collection($collection,$collectdir);
904}
905
906
907
908sub locate_config_file
909{
910 my ($file) = @_;
911
912 my $locations = locate_config_files($file);
913
914 return shift @$locations; # returns undef if 'locations' is empty
915}
916
917
918sub locate_config_files
919{
920 my ($file) = @_;
921
922 my @locations = ();
923
924 if (-e $file) {
925 # Clearly specified (most likely full filename)
926 # No need to hunt in 'etc' directories, return value unchanged
927 push(@locations,$file);
928 }
929 else {
930 # Check for collection specific one before looking in global GSDL 'etc'
931 if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
932 my $test_collect_etc_filename
933 = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"etc", $file);
934
935 if (-e $test_collect_etc_filename) {
936 push(@locations,$test_collect_etc_filename);
937 }
938 }
939 my $test_main_etc_filename
940 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"etc", $file);
941 if (-e $test_main_etc_filename) {
942 push(@locations,$test_main_etc_filename);
943 }
944 }
945
946 return \@locations;
947}
948
949
950sub hyperlink_text
951{
952 my ($text) = @_;
953
954 $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
955 $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
956
957 return $text;
958}
959
960
961# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
962# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
963sub is_dir_empty {
964 warnings::warnif("deprecated", "util::is_dir_empty() is deprecated, using FileUtils::isDirectoryEmpty() instead");
965 return &FileUtils::isDirectoryEmpty(@_);
966}
967
968# Returns the given filename converted using either URL encoding or base64
969# encoding, as specified by $rename_method. If the given filename has no suffix
970# (if it is just the tailname), then $no_suffix should be some defined value.
971# rename_method can be url, none, base64
972sub rename_file {
973 my ($filename, $rename_method, $no_suffix) = @_;
974
975 if(!$filename) { # undefined or empty string
976 return $filename;
977 }
978
979 if (!$rename_method) {
980 print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
981 # Debugging information
982 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
983 # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
984 $rename_method = "url";
985 } elsif($rename_method eq "none") {
986 return $filename; # would have already been renamed
987 }
988
989 # No longer replace spaces with underscores, since underscores mess with incremental rebuild
990 ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
991 ###$filename =~ s/ /_/g;
992
993 my ($tailname,$dirname,$suffix);
994 if($no_suffix) { # given a tailname, no suffix
995 ($tailname,$dirname) = File::Basename::fileparse($filename);
996 }
997 else {
998 ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
999 }
1000 if (!$suffix) {
1001 $suffix = "";
1002 }
1003 # This breaks GLI matching extracted metadata to files in Enrich panel, as
1004 # original is eg .JPG while gsdlsourcefilename ends up .jpg
1005 # Not sure why it was done in first place...
1006 #else {
1007 #$suffix = lc($suffix);
1008 #}
1009
1010 if ($rename_method eq "url") {
1011 $tailname = &unicode::url_encode($tailname);
1012 }
1013 elsif ($rename_method eq "base64") {
1014 $tailname = &unicode::base64_encode($tailname);
1015 $tailname =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
1016 }
1017
1018 $filename = "$tailname$suffix";
1019 $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
1020
1021 return $filename;
1022}
1023
1024
1025# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
1026sub rename_ldb_or_bdb_file {
1027 my ($filename_no_ext) = @_;
1028
1029 my $new_filename = "$filename_no_ext.gdb";
1030 return if (-f $new_filename); # if the file has the right extension, don't need to do anything
1031 # try ldb
1032 my $old_filename = "$filename_no_ext.ldb";
1033
1034 if (-f $old_filename) {
1035 print STDERR "Renaming $old_filename to $new_filename\n";
1036 rename ($old_filename, $new_filename)
1037 || print STDERR "Rename failed: $!\n";
1038 return;
1039 }
1040 # try bdb
1041 $old_filename = "$filename_no_ext.bdb";
1042 if (-f $old_filename) {
1043 print STDERR "Renaming $old_filename to $new_filename\n";
1044 rename ($old_filename, $new_filename)
1045 || print STDERR "Rename failed: $!\n";
1046 return;
1047 }
1048}
1049
1050sub os_dir() {
1051
1052 my $gsdlarch = "";
1053 if(defined $ENV{'GSDLARCH'}) {
1054 $gsdlarch = $ENV{'GSDLARCH'};
1055 }
1056 return $ENV{'GSDLOS'}.$gsdlarch;
1057}
1058
1059# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1060# By default, /greenstone3 for GS3 or /greenstone for GS2.
1061sub get_greenstone_url_prefix() {
1062 # if already set on a previous occasion, just return that
1063 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1064 return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1065
1066 my ($configfile, $urlprefix, $defaultUrlprefix);
1067 my @propertynames = ();
1068
1069 if($ENV{'GSDL3SRCHOME'}) {
1070 $defaultUrlprefix = "/greenstone3";
1071 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1072 push(@propertynames, qw/path\s*\=/);
1073 } else {
1074 $defaultUrlprefix = "/greenstone";
1075 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1076 push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1077 }
1078
1079 $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1080
1081 if(!$urlprefix) { # no values found for URL prefix, use default values
1082 $urlprefix = $defaultUrlprefix;
1083 } else {
1084 #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1085 $urlprefix =~ s/^\///; # remove the starting slash
1086 my @dirs = split(/(\\|\/)/, $urlprefix);
1087 $urlprefix = shift(@dirs);
1088
1089 if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1090 $urlprefix = "/$urlprefix";
1091 }
1092 }
1093
1094 # set for the future
1095 $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1096# print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1097 return $urlprefix;
1098}
1099
1100
1101# Given a config file (xml or java properties file) and a list/array of regular expressions
1102# that represent property names to match on, this function will return the value for the 1st
1103# matching property name. If the return value is undefined, no matching property was found.
1104sub extract_propvalue_from_file() {
1105 my ($configfile, $propertynames) = @_;
1106
1107 my $value;
1108 unless(open(FIN, "<$configfile")) {
1109 print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1110 return $value; # not initialised
1111 }
1112
1113 # Read the entire file at once, as one single line, then close it
1114 my $filecontents;
1115 {
1116 local $/ = undef;
1117 $filecontents = <FIN>;
1118 }
1119 close(FIN);
1120
1121 foreach my $regex (@$propertynames) {
1122 ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1123 if($value) {
1124 $value =~ s/^\"//; # remove any startquotes
1125 $value =~ s/\".*$//; # remove the 1st endquotes (if any) followed by any xml
1126 last; # found value for a matching property, break from loop
1127 }
1128 }
1129
1130 return $value;
1131}
1132
1133# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1134# given that perllib is in @INC in order to invoke this subroutine.
1135# Call as follows -- after setting up INC to include perllib and
1136# after setting up GSDLHOME and GSDLOS:
1137#
1138# require util;
1139# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1140#
1141sub setup_greenstone_env() {
1142 my ($GSDLHOME, $GSDLOS) = @_;
1143
1144 #my %env_map = ();
1145 # Get the localised ENV settings of running a localised source setup.bash
1146 # and put it into the ENV here. Need to clear GSDLHOME before running setup
1147 #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1148 my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";
1149 if (($GSDLOS =~ m/windows/i) && ($^O ne "cygwin")) {
1150 #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1151 $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1152 }
1153 if (!open(PIN, "$perl_command |")) {
1154 print STDERR ("Unable to execute command: $perl_command. $!\n");
1155 }
1156
1157 while (defined (my $perl_output_line = <PIN>)) {
1158 my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1159 #$env_map{$key}=$value;
1160 $ENV{$key}=$value;
1161 }
1162 close (PIN);
1163
1164 # If any keys in $ENV don't occur in Greenstone's localised env
1165 # (stored in $env_map), delete those entries from $ENV
1166 #foreach $key (keys %ENV) {
1167 # if(!defined $env_map{$key}) {
1168 # print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{'$key'}\n";
1169 # delete $ENV{$key}; # del $ENV(key, value) pair
1170 # }
1171 #}
1172 #undef %env_map;
1173}
1174
1175sub get_perl_exec() {
1176 my $perl_exec = $^X; # may return just "perl"
1177
1178 if($ENV{'PERLPATH'}) {
1179 # OR: # $perl_exec = &FileUtils::filenameConcatenate($ENV{'PERLPATH'},"perl");
1180 if (($ENV{'GSDLOS'} =~ m/windows/) && ($^O ne "cygwin")) {
1181 $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1182 } else {
1183 $perl_exec = "$ENV{'PERLPATH'}/perl";
1184 }
1185 } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1186 # containing the full path to the current perl executable we're using
1187 $perl_exec = $Config{perlpath}; # configured path for perl
1188 if (!-e $perl_exec) { # may not point to location on this machine
1189 $perl_exec = $^X; # may return just "perl"
1190 if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1191 print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";
1192 }
1193 }
1194 }
1195
1196 return $perl_exec;
1197}
1198
1199# returns the path to the java command in the JRE included with GS (if any),
1200# quoted to safeguard any spaces in this path, otherwise a simple java
1201# command is returned which assumes and will try for a system java.
1202sub get_java_command {
1203 my $java = "java";
1204 if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1205 # after running setup.bat or from GLI which also runs setup.bat
1206 my $java_bin = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"packages","jre","bin");
1207 if(-d $java_bin) {
1208 $java = &FileUtils::filenameConcatenate($java_bin,"java");
1209 $java = "\"".$java."\""; # quoted to preserve spaces in path
1210 }
1211 }
1212 return $java;
1213}
1214
1215
1216# Given the qualified collection name (colgroup/collection),
1217# returns the collection and colgroup parts
1218sub get_collection_parts {
1219 # http://perldoc.perl.org/File/Basename.html
1220 # my($filename, $directories, $suffix) = fileparse($path);
1221 # "$directories contains everything up to and including the last directory separator in the $path
1222 # including the volume (if applicable). The remainder of the $path is the $filename."
1223 #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);
1224
1225 my $qualified_collection = shift(@_);
1226
1227 # Since activate.pl can be launched from the command-line, including by a user,
1228 # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1229 # Also allow for the accidental inclusion of multiple slashes
1230 my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1231
1232 if(!defined $collection) {
1233 $collection = $colgroup;
1234 $colgroup = "";
1235 }
1236 return ($collection, $colgroup);
1237}
1238
1239# work out the "collectdir/collection" location
1240sub resolve_collection_dir {
1241 my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1242
1243 if (defined $ENV{'GSDLCOLLECTDIR'}) { # a predefined collection dir exists
1244 return $ENV{'GSDLCOLLECTDIR'};
1245 }
1246
1247 my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);
1248
1249 if (!defined $collect_dir || !$collect_dir) { # if undefined or empty string
1250 $collect_dir = &util::get_working_collect_dir($site);
1251 }
1252
1253 return &FileUtils::filenameConcatenate($collect_dir,$colgroup,$collection);
1254}
1255
1256# work out the full path to "collect" of this greenstone 2/3 installation
1257sub get_working_collect_dir {
1258 my ($site) = @_;
1259
1260 if (defined $ENV{'GSDLCOLLECTHOME'}) { # a predefined collect dir exists
1261 return $ENV{'GSDLCOLLECTHOME'};
1262 }
1263
1264 if (defined $site && $site) { # site non-empty, so get default collect dir for GS3
1265
1266 if (defined $ENV{'GSDL3HOME'}) {
1267 return &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'},"sites",$site,"collect"); # web folder
1268 }
1269 elsif (defined $ENV{'GSDL3SRCHOME'}) {
1270 return &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'},"web","sites",$site,"collect");
1271 }
1272 }
1273
1274 elsif (defined $ENV{'SITEHOME'}) {
1275 return &FileUtils::filenameConcatenate($ENV{'SITEHOME'},"collect");
1276 }
1277
1278 else { # get default collect dir for GS2
1279 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
1280 }
1281}
1282
1283sub is_abs_path_any_os {
1284 my ($path) = @_;
1285
1286 # We can have filenames in our DBs that were produced on other OS, so this method exists
1287 # to help identify absolute paths in such cases.
1288
1289 return 1 if($path =~ m@^/@); # full paths begin with forward slash on linux/mac
1290 return 1 if($path =~ m@^([a-zA-Z]\:|\\)@); # full paths begin with drive letter colon for Win or \ for volume, http://stackoverflow.com/questions/13011013/get-only-volume-name-from-filepath
1291
1292 return 0;
1293}
1294
1295
1296# This subroutine is for improving portability of Greenstone collections from one OS to another,
1297# to be used to convert absolute paths going into db files into paths with placeholders instead.
1298# This sub works with util::get_common_gs_paths and takes a path to a greenstone file and, if it's
1299# an absolute path, then it will replace the longest matching greenstone-path prefix of the given
1300# path with a placeholder to match.
1301# The Greenstone-path prefixes that can be matched are the following common Greenstone paths:
1302# the path to the current (specific) collection, the path to the general GS collect directory,
1303# the path to the site directory if GS3, else the path to the GSDLHOME/GSDL3HOME folder.
1304# The longest matching prefix will be replaced with the equivalent placeholder:
1305# @THISCOLLECTPATH@, else @COLLECTHOME@, else @SITEHOME@, else @GSDLHOME@.
1306sub abspath_to_placeholders {
1307 my $path = shift(@_); # path to convert from absolute to one with placeholders
1308 my $opt_long_or_short_winfilenames = shift(@_) || "short"; # whether we want to force use of long file names even on windows, default uses short
1309
1310 return $path unless is_abs_path_any_os($path); # path is relative
1311
1312 if ($opt_long_or_short_winfilenames eq "long") {
1313 $path = &util::upgrade_if_dos_filename($path); # will only do something on windows
1314 }
1315
1316 # now we know we're dealing with absolute paths and have to replace gs prefixes with placeholders
1317 my @gs_paths = ($ENV{'GSDLCOLLECTDIR'}, $ENV{'GSDLCOLLECTHOME'}, $ENV{'SITEHOME'}, $ENV{'GREENSTONEHOME'}); # list in this order: from longest to shortest path
1318
1319 my %placeholder_map = ($ENV{'GREENSTONEHOME'} => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1320 $ENV{'GSDLCOLLECTHOME'} => '@COLLECTHOME@',
1321 $ENV{'GSDLCOLLECTDIR'} => '@THISCOLLECTPATH@'
1322 );
1323 $placeholder_map{$ENV{'SITEHOME'}} = '@SITEHOME@' if defined $ENV{'SITEHOME'};
1324
1325 $path = &util::_abspath_to_placeholders($path, \@gs_paths, \%placeholder_map);
1326
1327 if ($ENV{'GSDLOS'} =~ /^windows$/i && $opt_long_or_short_winfilenames eq "short") {
1328 # for windows need to look for matches on short file names too
1329 # matched paths are again to be replaced with the usual placeholders
1330
1331 my $gsdlcollectdir = &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTDIR'});
1332 my $gsdlcollecthome = &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTHOME'});
1333 my $sitehome = (defined $ENV{'SITEHOME'}) ? &util::downgrade_if_dos_filename($ENV{'SITEHOME'}) : undef;
1334 my $greenstonehome = &util::downgrade_if_dos_filename($ENV{'GREENSTONEHOME'});
1335
1336 @gs_paths = ($gsdlcollectdir, $gsdlcollecthome, $sitehome, $greenstonehome); # order matters
1337
1338 %placeholder_map = ($greenstonehome => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1339 $gsdlcollecthome => '@COLLECTHOME@',
1340 $gsdlcollectdir => '@THISCOLLECTPATH@'
1341 );
1342 $placeholder_map{$sitehome} = '@SITEHOME@' if defined $sitehome;
1343
1344 $path = &util::_abspath_to_placeholders($path, \@gs_paths, \%placeholder_map);
1345 }
1346
1347 return $path;
1348}
1349
1350sub _abspath_to_placeholders {
1351 my ($path, $gs_paths_ref, $placeholder_map_ref) = @_;
1352
1353 # The sequence of elements in @gs_paths matters
1354 # Need to loop starting from the *longest* matching path (the path to the specific collection)
1355 # to the shortest matching path (the path to gsdlhome/gsdl3home folder):
1356
1357 foreach my $gs_path (@$gs_paths_ref) {
1358 next if(!defined $gs_path); # site undefined for GS2
1359
1360 my $re_path = &util::filename_to_regex($gs_path); # escape for regex
1361
1362 if($path =~ m/^$re_path/i) { # case sensitive or not for OS?
1363
1364 my $placeholder = $placeholder_map_ref->{$gs_path}; # get the placeholder to replace the matched path with
1365
1366 $path =~ s/^$re_path/$placeholder/; #case sensitive or not?
1367 #$path =~ s/^[\\\/]//; # remove gs_path's trailing separator left behind at the start of the path
1368 last; # done
1369 }
1370 }
1371
1372 return $path;
1373}
1374
1375# Function that does the reverse of the util::abspath_to_placeholders subroutine
1376# Once again, call this with the values returned from util::get_common_gs_paths
1377sub placeholders_to_abspath {
1378 my $path = shift(@_); # path that can contain placeholders to convert to resolved absolute path
1379 my $opt_long_or_short_winfilenames = shift(@_) || "short"; # whether we want to force use of long file names even on windows, default uses short
1380
1381 return $path if($path !~ m/@/); # path contains no placeholders
1382
1383 # replace placeholders with gs prefixes
1384 my @placeholders = ('@THISCOLLECTPATH@', '@COLLECTHOME@', '@SITEHOME@', '@GSDLHOME@'); # order of paths not crucial in this case,
1385 # but listed here from longest to shortest once placeholders are have been resolved
1386
1387 # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1388 my %placeholder_to_gspath_map;
1389 if ($ENV{'GSDLOS'} =~ /^windows$/i && $opt_long_or_short_winfilenames eq "short") {
1390 # always replace placeholders with short file names of the absolute paths on windows?
1391 %placeholder_to_gspath_map = ('@GSDLHOME@' => &util::downgrade_if_dos_filename($ENV{'GREENSTONEHOME'}),
1392 '@COLLECTHOME@' => &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTHOME'}),
1393 '@THISCOLLECTPATH@' => &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTDIR'})
1394 );
1395 $placeholder_to_gspath_map{'@SITEHOME@'} = &util::downgrade_if_dos_filename($ENV{'SITEHOME'}) if defined $ENV{'SITEHOME'};
1396 } else {
1397 %placeholder_to_gspath_map = ('@GSDLHOME@' => $ENV{'GREENSTONEHOME'},
1398 '@SITEHOME@' => $ENV{'SITEHOME'}, # can be undef
1399 '@COLLECTHOME@' => $ENV{'GSDLCOLLECTHOME'},
1400 '@THISCOLLECTPATH@' => $ENV{'GSDLCOLLECTDIR'}
1401 ); # $placeholder_to_gspath_map{'@SITEHOME@'} = $ENV{'SITEHOME'} if defined $ENV{'SITEHOME'};
1402 }
1403
1404 foreach my $placeholder (@placeholders) {
1405 my $gs_path = $placeholder_to_gspath_map{$placeholder};
1406
1407 next if(!defined $gs_path); # sitehome for GS2 is undefined
1408
1409 if($path =~ m/^$placeholder/) {
1410 $path =~ s/^$placeholder/$gs_path/;
1411 last; # done
1412 }
1413 }
1414
1415 return $path;
1416}
1417
1418# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1419# a directory containing sequentially numbered images.
1420sub create_itemfile
1421{
1422 my ($output_dir, $convert_basename, $convert_to) = @_;
1423 my $page_num = "";
1424
1425 opendir(DIR, $output_dir) || die "can't opendir $output_dir: $!";
1426 my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1427 closedir DIR;
1428
1429 # Sort files in the directory by page_num
1430 sub page_number {
1431 my ($dir) = @_;
1432 my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
1433# my ($pagenum) =($dir =~ m/(\d+)(\.(jpg|gif|png))?$/i); # this works but is not as safe/strict about input filepatterns as the above
1434
1435 $pagenum = 1 unless defined $pagenum;
1436 return $pagenum;
1437 }
1438
1439 # sort the files in the directory in the order of page_num rather than lexically.
1440 @dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1441
1442 # work out if the numbering of the now sorted image files starts at 0 or not
1443 # by checking the number of the first _image_ file (skipping item files)
1444 my $starts_at_0 = 0;
1445 my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1446 if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1447 $starts_at_0 = 1;
1448 }
1449
1450 my $item_file = &FileUtils::filenameConcatenate($output_dir, $convert_basename.".item");
1451 my $item_fh;
1452 &FileUtils::openFileHandle($item_file, 'w', \$item_fh);
1453 print $item_fh "<PagedDocument>\n";
1454
1455 foreach my $file (@dir_files){
1456 if ($file !~ /\.item/i){
1457 $page_num = page_number($file);
1458 $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1459 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1460 }
1461 }
1462
1463 print $item_fh "</PagedDocument>\n";
1464 &FileUtils::closeFileHandle($item_file, \$item_fh);
1465 return $item_file;
1466}
1467
1468
1469## @function augmentINC()
1470#
1471# Prepend a path (if it exists) onto INC but only if it isn't already in INC
1472# @param $new_path The path to add
1473# @author jmt12
1474#
1475sub augmentINC
1476{
1477 my ($new_path) = @_;
1478 my $did_add_path = 0;
1479 # might need to be replaced with FileUtils::directoryExists() call eventually
1480 if (-d $new_path)
1481 {
1482 my $did_find_path = 0;
1483 foreach my $existing_path (@INC)
1484 {
1485 if ($existing_path eq $new_path)
1486 {
1487 $did_find_path = 1;
1488 last;
1489 }
1490 }
1491 if (!$did_find_path)
1492 {
1493 unshift(@INC, $new_path);
1494 $did_add_path = 1;
1495 }
1496 }
1497 return $did_add_path;
1498}
1499## augmentINC()
1500
1501
15021;
Note: See TracBrowser for help on using the repository browser.