source: main/trunk/greenstone2/perllib/util.pm@ 28604

Last change on this file since 28604 was 28604, checked in by ak19, 10 years ago

Found some issues when wanting to add in the CDS-ISIS tutorial collection into the model-collections now that 64 bit linux can handled ISIS collections. Windows carriage return linefeed needs to be processed the same way as linux linefeed, else on Windows the ISIS file will not get parsed the same way and also result in extra newline characters, which shows up when doing diffcol.

  • Property svn:keywords set to Author Date Id Revision
File size: 58.6 KB
Line 
1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package util;
27
28use strict;
29use FileUtils;
30
31use Encode;
32use File::Copy;
33use File::Basename;
34# Config for getting the perlpath in the recommended way, though it uses paths that are
35# hard-coded into the Config file that's generated upon configuring and compiling perl.
36# $^X works better in some cases to return the path to perl used to launch the script,
37# but if launched with plain "perl" (no full-path), that will be just what it returns.
38use Config;
39# New module for file related utility functions - intended as a
40# placeholder for an extension that allows a variety of different
41# filesystems (FTP, HTTP, SAMBA, WEBDav, HDFS etc)
42use FileUtils;
43
44if ($ENV{'GSDLOS'} =~ /^windows$/i) {
45 require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
46}
47
48# removes files (but not directories)
49sub rm {
50 warnings::warnif("deprecated", "util::rm() is deprecated, using FileUtils::removeFiles() instead");
51 return &FileUtils::removeFiles(@_);
52}
53
54# recursive removal
55sub filtered_rm_r {
56 warnings::warnif("deprecated", "util::filtered_rm_r() is deprecated, using FileUtils::removeFilesFiltered() instead");
57 return &FileUtils::removeFilesFiltered(@_);
58}
59
60# recursive removal
61sub rm_r {
62 warnings::warnif("deprecated", "util::rm_r() is deprecated, using FileUtils::removeFilesRecursive() instead");
63 return &FileUtils::removeFilesRecursive(@_);
64}
65
66# moves a file or a group of files
67sub mv {
68 warnings::warnif("deprecated", "util::mv() is deprecated, using FileUtils::moveFiles() instead");
69 return &FileUtils::moveFiles(@_);
70}
71
72# Move the contents of source directory into target directory
73# (as opposed to merely replacing target dir with the src dir)
74# This can overwrite any files with duplicate names in the target
75# but other files and folders in the target will continue to exist
76sub mv_dir_contents {
77 warnings::warnif("deprecated", "util::mv_dir_contents() is deprecated, using FileUtils::moveDirectoryContents() instead");
78 return &FileUtils::moveDirectoryContents(@_);
79}
80
81# copies a file or a group of files
82sub cp {
83 warnings::warnif("deprecated", "util::cp() is deprecated, using FileUtils::copyFiles() instead");
84 return &FileUtils::copyFiles(@_);
85}
86
87# recursively copies a file or group of files
88# syntax: cp_r (sourcefiles, destination directory)
89# destination must be a directory - to copy one file to
90# another use cp instead
91sub cp_r {
92 warnings::warnif("deprecated", "util::cp_r() is deprecated, using FileUtils::copyFilesrecursive() instead");
93 return &FileUtils::copyFilesRecursive(@_);
94}
95
96# recursively copies a file or group of files
97# syntax: cp_r (sourcefiles, destination directory)
98# destination must be a directory - to copy one file to
99# another use cp instead
100sub cp_r_nosvn {
101 warnings::warnif("deprecated", "util::cp_r_nosvn() is deprecated, using FileUtils::copyFilesRecursiveNoSVN() instead");
102 return &FileUtils::copyFilesRecursiveNoSVN(@_);
103}
104
105# copies a directory and its contents, excluding subdirectories, into a new directory
106sub cp_r_toplevel {
107 warnings::warnif("deprecated", "util::cp_r_toplevel() is deprecated, using FileUtils::recursiveCopyTopLevel() instead");
108 return &FileUtils::recursiveCopyTopLevel(@_);
109}
110
111sub mk_dir {
112 warnings::warnif("deprecated", "util::mk_dir() is deprecated, using FileUtils::makeDirectory() instead");
113 return &FileUtils::makeDirectory(@_);
114}
115
116# in case anyone cares - I did some testing (using perls Benchmark module)
117# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
118# slightly faster (surprisingly) - Stefan.
119sub mk_all_dir {
120 warnings::warnif("deprecated", "util::mk_all_dir() is deprecated, using FileUtils::makeAllDirectories() instead");
121 return &FileUtils::makeAllDirectories(@_);
122}
123
124# make hard link to file if supported by OS, otherwise copy the file
125sub hard_link {
126 warnings::warnif("deprecated", "util::hard_link() is deprecated, using FileUtils::hardLink() instead");
127 return &FileUtils::hardLink(@_);
128}
129
130# make soft link to file if supported by OS, otherwise copy file
131sub soft_link {
132 warnings::warnif("deprecated", "util::soft_link() is deprecated, using FileUtils::softLink() instead");
133 return &FileUtils::softLink(@_);
134}
135
136# Primarily for filenames generated by processing
137# content of HTML files (which are mapped to UTF-8 internally)
138#
139# To turn this into an octet string that really exists on the file
140# system:
141# 1. don't need to do anything special for Unix-based systems
142# (as underlying file system is byte-code)
143# 2. need to map to short DOS filenames for Windows
144
145sub utf8_to_real_filename
146{
147 my ($utf8_filename) = @_;
148
149 my $real_filename;
150
151 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
152 require Win32;
153
154 print STDERR "***** utf8 filename = $utf8_filename\n\n\n";
155
156 my $unicode_filename = decode("utf8",$utf8_filename);
157 $real_filename = Win32::GetShortPathName($unicode_filename);
158 }
159 else {
160 $real_filename = $utf8_filename;
161 }
162
163 return $real_filename;
164}
165
166sub fd_exists {
167 warnings::warnif("deprecated", "util::fd_exists() is deprecated, using FileUtils::fileTest() instead");
168 return &FileUtils::fileTest(@_);
169}
170
171sub file_exists {
172 warnings::warnif("deprecated", "util::file_exists() is deprecated, using FileUtils::fileExists() instead");
173 return &FileUtils::fileExists(@_);
174}
175
176sub dir_exists {
177 warnings::warnif("deprecated", "util::dir_exists() is deprecated, using FileUtils::directoryExists() instead");
178 return &FileUtils::directoryExists(@_);
179}
180
181# updates a copy of a directory in some other part of the filesystem
182# verbosity settings are: 0=low, 1=normal, 2=high
183# both $fromdir and $todir should be absolute paths
184sub cachedir {
185 warnings::warnif("deprecated", "util::cachedir() is deprecated, using FileUtils::synchronizeDirectories() instead");
186 return &FileUtils::synchronizeDirectories(@_);
187}
188
189# this function returns -1 if either file is not found
190# assumes that $file1 and $file2 are absolute file names or
191# in the current directory
192# $file2 is allowed to be newer than $file1
193sub differentfiles {
194 warnings::warnif("deprecated", "util::differentfiles() is deprecated, using FileUtils::differentFiles() instead");
195 return &FileUtils::differentFiles(@_);
196}
197
198
199sub get_tmp_filename
200{
201 my $file_ext = shift(@_) || undef;
202
203 my $opt_dot_file_ext = "";
204 if (defined $file_ext) {
205 if ($file_ext !~ m/\./) {
206 # no dot, so needs one added in at start
207 $opt_dot_file_ext = ".$file_ext"
208 }
209 else {
210 # allow for "extensions" such as _metadata.txt to be handled
211 # gracefully
212 $opt_dot_file_ext = $file_ext;
213 }
214 }
215
216 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
217 &FileUtils::makeAllDirectories ($tmpdir) unless -e $tmpdir;
218
219 my $count = 1000;
220 my $rand = int(rand $count);
221 my $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
222
223 while (-e $full_tmp_filename) {
224 $rand = int(rand $count);
225 $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
226 $count++;
227 }
228
229 return $full_tmp_filename;
230}
231
232# These 2 are "static" variables used by the get_timestamped_tmp_folder() subroutine below and
233# belong with that function. They help ensure the timestamped tmp folders generated are unique.
234my $previous_timestamp = undef;
235my $previous_timestamp_f = 0; # frequency
236
237sub get_timestamped_tmp_folder
238{
239
240 my $tmp_dirname;
241 if(defined $ENV{'GSDLCOLLECTDIR'}) {
242 $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
243 } elsif(defined $ENV{'GSDLHOME'}) {
244 $tmp_dirname = $ENV{'GSDLHOME'};
245 } else {
246 return undef;
247 }
248
249 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "tmp");
250 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
251
252 # add the timestamp into the path otherwise we can run into problems
253 # if documents have the same name
254 my $timestamp = time;
255
256 if (!defined $previous_timestamp || ($timestamp > $previous_timestamp)) {
257 $previous_timestamp_f = 0;
258 $previous_timestamp = $timestamp;
259 } else {
260 $previous_timestamp_f++;
261 }
262
263 my $time_tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, $timestamp);
264 $tmp_dirname = $time_tmp_dirname;
265 my $i = $previous_timestamp_f;
266
267 if($previous_timestamp_f > 0) {
268 $tmp_dirname = $time_tmp_dirname."_".$i;
269 $i++;
270 }
271 while (-e $tmp_dirname) {
272 $tmp_dirname = $time_tmp_dirname."_".$i;
273 $i++;
274 }
275 &FileUtils::makeDirectory($tmp_dirname);
276
277 return $tmp_dirname;
278}
279
280sub get_timestamped_tmp_filename_in_collection
281{
282
283 my ($input_filename, $output_ext) = @_;
284 # derive tmp filename from input filename
285 my ($tailname, $dirname, $suffix)
286 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
287
288 # softlink to collection tmp dir
289 my $tmp_dirname = &util::get_timestamped_tmp_folder();
290 $tmp_dirname = $dirname unless defined $tmp_dirname;
291
292 # following two steps copied from ConvertBinaryFile
293 # do we need them?? can't use them as is, as they use plugin methods.
294
295 #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
296
297 # URLEncode this since htmls with images where the html filename is utf8 don't seem
298 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
299 # files on the filesystem.
300 #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
301 if (defined $output_ext) {
302 $output_ext = ".$output_ext"; # add the dot
303 } else {
304 $output_ext = $suffix;
305 }
306 $output_ext= lc($output_ext);
307 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$output_ext");
308
309 return $tmp_filename;
310}
311
312sub get_toplevel_tmp_dir
313{
314 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
315}
316
317
318sub get_collectlevel_tmp_dir
319{
320 my $tmp_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
321 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
322
323 return $tmp_dirname;
324}
325
326
327sub filename_to_regex {
328 my $filename = shift (@_);
329
330 # need to make single backslashes double so that regex works
331 $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);
332
333 # note that the first part of a substitution is a regex, so RE chars need to be escaped,
334 # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
335 $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
336 $filename =~ s@\(@\\(@g; # escape brackets
337 $filename =~ s@\)@\\)@g; # escape brackets
338 $filename =~ s@\[@\\[@g; # escape brackets
339 $filename =~ s@\]@\\]@g; # escape brackets
340
341 return $filename;
342}
343
344sub unregex_filename {
345 my $filename = shift (@_);
346
347 # need to put doubled backslashes for regex back to single
348 $filename =~ s/\\\./\./g; # remove RE syntax for .
349 $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
350 $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
351 $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
352 $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
353
354 # \\ goes to \
355 # This is the last step in reverse mirroring the order of steps in filename_to_regex()
356 $filename =~ s/\\\\/\\/g; # remove RE syntax for \
357 return $filename;
358}
359
360sub filename_cat {
361 # I've disabled this warning for now, as every Greenstone perl
362 # script seems to make use of this function and so you drown in a
363 # sea of deprecated warnings [jmt12]
364# warnings::warnif("deprecated", "util::filename_cat() is deprecated, using FileUtils::filenameConcatenate() instead");
365 return &FileUtils::filenameConcatenate(@_);
366}
367
368
369sub _pathname_cat {
370 my $join_char = shift(@_);
371 my $first_path = shift(@_);
372 my (@pathnames) = @_;
373
374 # If first_path is not null or empty, then add it back into the list
375 if (defined $first_path && $first_path =~ /\S/) {
376 unshift(@pathnames, $first_path);
377 }
378
379 my $pathname = join($join_char, @pathnames);
380
381 # remove duplicate slashes
382 if ($join_char eq ";") {
383 $pathname =~ s/[\\\/]+/\\/g;
384 if ($^O eq "cygwin") {
385 # Once we've collapsed muliple (potentialy > 2) slashes
386 # For cygwin, actually want things double-backslahed
387 $pathname =~ s/\\/\\\\/g;
388 }
389
390 } else {
391 $pathname =~ s/[\/]+/\//g;
392 # DB: want a pathname abc\de.html to remain like this
393 }
394
395 return $pathname;
396}
397
398
399sub pathname_cat {
400 my (@pathnames) = @_;
401
402 my $join_char;
403 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
404 $join_char = ";";
405 } else {
406 $join_char = ":";
407 }
408 return _pathname_cat($join_char,@pathnames);
409}
410
411
412sub javapathname_cat {
413 my (@pathnames) = @_;
414
415 my $join_char;
416
417 # Unlike pathname_cat() above, not interested if running in a Cygwin environment
418 # This is because the java we run is actually a native Windows executable
419
420 if (($ENV{'GSDLOS'} =~ /^windows$/i)) {
421 $join_char = ";";
422 } else {
423 $join_char = ":";
424 }
425 return _pathname_cat($join_char,@pathnames);
426}
427
428
429sub makeFilenameJavaCygwinCompatible
430{
431 my ($java_filename) = @_;
432
433 if ($^O eq "cygwin") {
434 # To be used with a Java program, but under Cygwin
435 # Because the java binary that is native to Windows, need to
436 # convert the Cygwin paths (i.e. Unix style) to be Windows
437 # compatible
438
439 $java_filename = `cygpath -wp "$java_filename"`;
440 chomp($java_filename);
441 $java_filename =~ s%\\%\\\\%g;
442 }
443
444 return $java_filename;
445}
446
447sub tidy_up_oid {
448 my ($OID) = @_;
449 if ($OID =~ /\./) {
450 print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
451 $OID =~ s/\.//g; #remove any periods
452 }
453 if ($OID =~ /^\s.*\s$/) {
454 print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
455 # remove starting and trailing whitespace
456 $OID =~ s/^\s+//;
457 $OID =~ s/\s+$//;
458 }
459 if ($OID =~ /^[\d]*$/) {
460 print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
461 $OID = "D" . $OID;
462 }
463
464 return $OID;
465}
466
467sub envvar_prepend {
468 my ($var,$val) = @_;
469
470 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
471## my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
472
473 # Rewritten above to make ":" the default (Windows is the special
474 # case, anything else 'unusual' such as Solaris etc is Unix)
475 my $pathsep = (defined $ENV{'GSDLOS'} && (($ENV{'GSDLOS'} =~ m/windows/) && ($^O ne "cygwin"))) ? ";" : ":";
476
477 # do not prepend any value/path that's already in the environment variable
478
479 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
480 if (!defined($ENV{$var})) {
481 $ENV{$var} = "$val";
482 }
483 elsif($ENV{$var} !~ m/$escaped_val/) {
484 $ENV{$var} = "$val".$pathsep.$ENV{$var};
485 }
486}
487
488sub envvar_append {
489 my ($var,$val) = @_;
490
491 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
492 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
493
494 # do not append any value/path that's already in the environment variable
495
496 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
497 if (!defined($ENV{$var})) {
498 $ENV{$var} = "$val";
499 }
500 elsif($ENV{$var} !~ m/$escaped_val/) {
501 $ENV{$var} = $ENV{$var}.$pathsep."$val";
502 }
503}
504
505
506
507# splits a filename into a prefix and a tail extension using the tail_re, or
508# if that fails, splits on the file_extension . (dot)
509sub get_prefix_and_tail_by_regex {
510
511 my ($filename,$tail_re) = @_;
512
513 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
514 if ((!defined $file_prefix) || (!defined $file_ext)) {
515 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
516 }
517
518 return ($file_prefix,$file_ext);
519}
520
521# get full path and file only path from a base_dir (which may be empty) and
522# file (which may contain directories)
523sub get_full_filenames {
524 my ($base_dir, $file) = @_;
525
526# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
527# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
528# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
529
530
531 my $filename_full_path = $file;
532 # add on directory if present
533 $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\S/;
534
535 my $filename_no_path = $file;
536
537 # remove directory if present
538 $filename_no_path =~ s/^.*[\/\\]//;
539 return ($filename_full_path, $filename_no_path);
540}
541
542# returns the path of a file without the filename -- ie. the directory the file is in
543sub filename_head {
544 my $filename = shift(@_);
545
546 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
547 $filename =~ s/[^\\\\]*$//;
548 }
549 else {
550 $filename =~ s/[^\\\/]*$//;
551 }
552
553 return $filename;
554}
555
556
557
558# returns 1 if filename1 and filename2 point to the same
559# file or directory
560sub filenames_equal {
561 my ($filename1, $filename2) = @_;
562
563 # use filename_cat to clean up trailing slashes and
564 # multiple slashes
565 $filename1 = &FileUtils::filenameConcatenate($filename1);
566 $filename2 = &FileUtils::filenameConcatenate($filename2);
567
568 # filenames not case sensitive on windows
569 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
570 $filename1 =~ tr/[A-Z]/[a-z]/;
571 $filename2 =~ tr/[A-Z]/[a-z]/;
572 }
573 return 1 if $filename1 eq $filename2;
574 return 0;
575}
576
577# If filename is relative to within_dir, returns the relative path of filename to that directory
578# with slashes in the filename returned as they were in the original (absolute) filename.
579sub filename_within_directory
580{
581 my ($filename,$within_dir) = @_;
582
583 if ($within_dir !~ m/[\/\\]$/) {
584 my $dirsep = &util::get_dirsep();
585 $within_dir .= $dirsep;
586 }
587
588 $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets
589 if ($filename =~ m/^$within_dir(.*)$/) {
590 $filename = $1;
591 }
592
593 return $filename;
594}
595
596# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
597# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
598# The subpath returned will also be a URL type filename.
599sub filename_within_directory_url_format
600{
601 my ($filename,$within_dir) = @_;
602
603 # convert parameters only to / slashes if Windows
604
605 my $filename_urlformat = &filepath_to_url_format($filename);
606 my $within_dir_urlformat = &filepath_to_url_format($within_dir);
607
608 #if ($within_dir_urlformat !~ m/\/$/) {
609 # make sure directory ends with a slash
610 #$within_dir_urlformat .= "/";
611 #}
612
613 my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
614
615 #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
616
617 # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
618 if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
619 $filename_urlformat = $1;
620 }
621
622 return $filename_urlformat;
623}
624
625# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
626# since on Linux it doesn't represent a file separator but an escape char).
627sub filepath_to_url_format
628{
629 my ($filepath) = @_;
630 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
631 # Only need to worry about Windows, as Unix style directories already in url-format
632 # Convert Windows style \ => /
633 $filepath =~ s@\\@/@g;
634 }
635 return $filepath;
636}
637
638# regex filepaths on windows may include \\ as path separator. Convert \\ to /
639sub filepath_regex_to_url_format
640{
641 my ($filepath) = @_;
642 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
643 # Only need to worry about Windows, as Unix style directories already in url-format
644 # Convert Windows style \\ => /
645 $filepath =~ s@\\\\@/@g;
646 }
647 return $filepath;
648
649}
650
651# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
652# and ignores trailing /
653# returns (file, dirs) dirs will be empty if no subdirs
654sub url_fileparse
655{
656 my ($filepath) = @_;
657 # remove trailing /
658 $filepath =~ s@/$@@;
659 if ($filepath !~ m@/@) {
660 return ($filepath, "");
661 }
662 my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
663 return ($file, $dirs);
664
665}
666
667
668sub filename_within_collection
669{
670 my ($filename) = @_;
671
672 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
673
674 if (defined $collect_dir) {
675
676 # if from within GSDLCOLLECTDIR, then remove directory prefix
677 # so source_filename is realative to it. This is done to aid
678 # portability, i.e. the collection can be moved to somewhere
679 # else on the file system and the archives directory will still
680 # work. This is needed, for example in the applet version of
681 # GLI where GSDLHOME/collect on the server will be different to
682 # the collect directory of the remove user. Of course,
683 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
684 # it back into a full pathname.
685
686 $filename = filename_within_directory($filename,$collect_dir);
687 }
688
689 return $filename;
690}
691
692sub prettyprint_file
693{
694 my ($base_dir,$file,$gli) = @_;
695
696 my $filename_full_path = &FileUtils::filenameConcatenate($base_dir,$file);
697
698 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
699 require Win32;
700
701 # For some reason base_dir in the form c:/a/b/c
702 # This leads to confusion later on, so turn it back into
703 # the more usual Windows form
704 $base_dir =~ s/\//\\/g;
705 my $long_base_dir = Win32::GetLongPathName($base_dir);
706 my $long_full_path = Win32::GetLongPathName($filename_full_path);
707
708 $file = filename_within_directory($long_full_path,$long_base_dir);
709 $file = encode("utf8",$file) if ($gli);
710 }
711
712 return $file;
713}
714
715
716sub upgrade_if_dos_filename
717{
718 my ($filename_full_path,$and_encode) = @_;
719
720 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
721 # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
722 # to its long (Windows) version
723 my $long_filename = Win32::GetLongPathName($filename_full_path);
724 if (defined $long_filename) {
725 $filename_full_path = $long_filename;
726 }
727 # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
728 $filename_full_path =~ s/^(.):/\u$1:/;
729 if ((defined $and_encode) && ($and_encode)) {
730 $filename_full_path = encode("utf8",$filename_full_path);
731 }
732 }
733
734 return $filename_full_path;
735}
736
737
738sub downgrade_if_dos_filename
739{
740 my ($filename_full_path) = @_;
741
742 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
743 require Win32;
744
745 # Ensure the given long Windows filename is in a form that can
746 # be opened by Perl => convert it to a short DOS-like filename
747
748 my $short_filename = Win32::GetShortPathName($filename_full_path);
749 if (defined $short_filename) {
750 $filename_full_path = $short_filename;
751 }
752 # Make sure initial drive letter is lower-case (to fit in
753 # with rest of Greenstone)
754 $filename_full_path =~ s/^(.):/\u$1:/;
755 }
756
757 return $filename_full_path;
758}
759
760sub block_filename
761{
762 my ($block_hash,$filename) = @_;
763
764 if (($ENV{'GSDLOS'} =~ m/^windows$/) && ($^O ne "cygwin")) {
765
766 # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
767 my $lower_filename = lc($filename);
768 $block_hash->{'file_blocks'}->{$lower_filename} = 1;
769# my $lower_drive = $filename;
770# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
771
772# my $upper_drive = $filename;
773# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
774#
775# $block_hash->{'file_blocks'}->{$lower_drive} = 1;
776# $block_hash->{'file_blocks'}->{$upper_drive} = 1;
777 }
778 else {
779 $block_hash->{'file_blocks'}->{$filename} = 1;
780 }
781}
782
783
784sub filename_is_absolute
785{
786 warnings::warnif("deprecated", "util::filename_is_absolute() is deprecated, using FileUtils::isFilenameAbsolute() instead");
787 return &FileUtils::isFilenameAbsolute(@_);
788}
789
790
791## @method make_absolute()
792#
793# Ensure the given file path is absolute in respect to the given base path.
794#
795# @param $base_dir A string denoting the base path the given dir must be
796# absolute to.
797# @param $dir The directory to be made absolute as a string. Note that the
798# dir may already be absolute, in which case it will remain
799# unchanged.
800# @return The now absolute form of the directory as a string.
801#
802# @author John Thompson, DL Consulting Ltd.
803# @copy 2006 DL Consulting Ltd.
804#
805#used in buildcol.pl, doesn't work for all cases --kjdon
806sub make_absolute {
807
808 my ($base_dir, $dir) = @_;
809### print STDERR "dir = $dir\n";
810 $dir =~ s/[\\\/]+/\//g;
811 $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
812 $dir =~ s|^/tmp_mnt||;
813 1 while($dir =~ s|/[^/]*/\.\./|/|g);
814 $dir =~ s|/[.][.]?/|/|g;
815 $dir =~ tr|/|/|s;
816### print STDERR "dir = $dir\n";
817
818 return $dir;
819}
820## make_absolute() ##
821
822sub get_dirsep {
823
824 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
825 return "\\";
826 } else {
827 return "\/";
828 }
829}
830
831sub get_os_dirsep {
832
833 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
834 return "\\\\";
835 } else {
836 return "\\\/";
837 }
838}
839
840sub get_re_dirsep {
841
842 return "\\\\|\\\/";
843}
844
845
846sub get_dirsep_tail {
847 my ($filename) = @_;
848
849 # returns last part of directory or filename
850 # On unix e.g. a/b.d => b.d
851 # a/b/c => c
852
853 my $dirsep = get_re_dirsep();
854 my @dirs = split (/$dirsep/, $filename);
855 my $tail = pop @dirs;
856
857 # - caused problems under windows
858 #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
859
860 return $tail;
861}
862
863
864# if this is running on windows we want binaries to end in
865# .exe, otherwise they don't have to end in any extension
866sub get_os_exe {
867 return ".exe" if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin"));
868 return "";
869}
870
871
872# test to see whether this is a big or little endian machine
873sub is_little_endian
874{
875 # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
876 # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
877 # Otherwise, it's little endian
878
879 #return 0 if $^O =~ /^darwin$/i;
880 #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
881
882 # Going back to stating exactly whether the machine is little endian
883 # or big endian, without any special case for Macs. Since for rata it comes
884 # back with little endian and for shuttle with bigendian.
885 return (ord(substr(pack("s",1), 0, 1)) == 1);
886}
887
888
889# will return the collection name if successful, "" otherwise
890sub use_collection {
891 my ($collection, $collectdir) = @_;
892
893 if (!defined $collectdir || $collectdir eq "") {
894 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect");
895 }
896
897 if (!defined $ENV{'GREENSTONEHOME'}) { # for GS3, would have been defined in use_site_collection, to GSDL3HOME
898 $ENV{'GREENSTONEHOME'} = $ENV{'GSDLHOME'};
899 }
900
901 # get and check the collection
902 if (!defined($collection) || $collection eq "") {
903 if (defined $ENV{'GSDLCOLLECTION'}) {
904 $collection = $ENV{'GSDLCOLLECTION'};
905 } else {
906 print STDOUT "No collection specified\n";
907 return "";
908 }
909 }
910
911 if ($collection eq "modelcol") {
912 print STDOUT "You can't use modelcol.\n";
913 return "";
914 }
915
916 # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
917 # are defined
918 $ENV{'GSDLCOLLECTION'} = $collection;
919 $ENV{'GSDLCOLLECTHOME'} = $collectdir;
920 $ENV{'GSDLCOLLECTDIR'} = &FileUtils::filenameConcatenate($collectdir, $collection);
921
922 # make sure this collection exists
923 if (!-e $ENV{'GSDLCOLLECTDIR'}) {
924 print STDOUT "Invalid collection ($collection).\n";
925 return "";
926 }
927
928 # everything is ready to go
929 return $collection;
930}
931
932sub get_current_collection_name {
933 return $ENV{'GSDLCOLLECTION'};
934}
935
936
937# will return the collection name if successful, "" otherwise.
938# Like use_collection (above) but for greenstone 3 (taking account of site level)
939
940sub use_site_collection {
941 my ($site, $collection, $collectdir) = @_;
942
943 if (!defined $collectdir || $collectdir eq "") {
944 die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
945 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, "sites", $site, "collect");
946 }
947
948 if (defined $ENV{'GSDL3HOME'}) {
949 $ENV{'GREENSTONEHOME'} = $ENV{'GSDL3HOME'};
950 $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
951 } elsif (defined $ENV{'GSDL3SRCHOME'}) {
952 $ENV{'GREENSTONEHOME'} = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "web");
953 $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
954 } else {
955 print STDERR "*** util::use_site_collection(). Warning: Neither GSDL3HOME nor GSDL3SRCHOME set.\n";
956 }
957
958 # collectdir explicitly set by this point (using $site variable if required).
959 # Can call "old" gsdl2 use_collection now.
960
961 return use_collection($collection,$collectdir);
962}
963
964
965
966sub locate_config_file
967{
968 my ($file) = @_;
969
970 my $locations = locate_config_files($file);
971
972 return shift @$locations; # returns undef if 'locations' is empty
973}
974
975
976sub locate_config_files
977{
978 my ($file) = @_;
979
980 my @locations = ();
981
982 if (-e $file) {
983 # Clearly specified (most likely full filename)
984 # No need to hunt in 'etc' directories, return value unchanged
985 push(@locations,$file);
986 }
987 else {
988 # Check for collection specific one before looking in global GSDL 'etc'
989 if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
990 my $test_collect_etc_filename
991 = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"etc", $file);
992
993 if (-e $test_collect_etc_filename) {
994 push(@locations,$test_collect_etc_filename);
995 }
996 }
997 my $test_main_etc_filename
998 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"etc", $file);
999 if (-e $test_main_etc_filename) {
1000 push(@locations,$test_main_etc_filename);
1001 }
1002 }
1003
1004 return \@locations;
1005}
1006
1007
1008sub hyperlink_text
1009{
1010 my ($text) = @_;
1011
1012 $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
1013 $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
1014
1015 return $text;
1016}
1017
1018
1019# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
1020# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
1021sub is_dir_empty {
1022 warnings::warnif("deprecated", "util::is_dir_empty() is deprecated, using FileUtils::isDirectoryEmpty() instead");
1023 return &FileUtils::isDirectoryEmpty(@_);
1024}
1025
1026# Returns the given filename converted using either URL encoding or base64
1027# encoding, as specified by $rename_method. If the given filename has no suffix
1028# (if it is just the tailname), then $no_suffix should be some defined value.
1029# rename_method can be url, none, base64
1030sub rename_file {
1031 my ($filename, $rename_method, $no_suffix) = @_;
1032
1033 if(!$filename) { # undefined or empty string
1034 return $filename;
1035 }
1036
1037 if (!$rename_method) {
1038 print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
1039 # Debugging information
1040 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1041 # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
1042 $rename_method = "url";
1043 } elsif($rename_method eq "none") {
1044 return $filename; # would have already been renamed
1045 }
1046
1047 # No longer replace spaces with underscores, since underscores mess with incremental rebuild
1048 ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
1049 ###$filename =~ s/ /_/g;
1050
1051 my ($tailname,$dirname,$suffix);
1052 if($no_suffix) { # given a tailname, no suffix
1053 ($tailname,$dirname) = File::Basename::fileparse($filename);
1054 }
1055 else {
1056 ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
1057 }
1058 if (!$suffix) {
1059 $suffix = "";
1060 }
1061 # This breaks GLI matching extracted metadata to files in Enrich panel, as
1062 # original is eg .JPG while gsdlsourcefilename ends up .jpg
1063 # Not sure why it was done in first place...
1064 #else {
1065 #$suffix = lc($suffix);
1066 #}
1067
1068 if ($rename_method eq "url") {
1069 $tailname = &unicode::url_encode($tailname);
1070 }
1071 elsif ($rename_method eq "base64") {
1072 $tailname = &unicode::base64_encode($tailname);
1073 $tailname =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
1074 }
1075
1076 $filename = "$tailname$suffix";
1077 $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
1078
1079 return $filename;
1080}
1081
1082
1083# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
1084sub rename_ldb_or_bdb_file {
1085 my ($filename_no_ext) = @_;
1086
1087 my $new_filename = "$filename_no_ext.gdb";
1088 return if (-f $new_filename); # if the file has the right extension, don't need to do anything
1089 # try ldb
1090 my $old_filename = "$filename_no_ext.ldb";
1091
1092 if (-f $old_filename) {
1093 print STDERR "Renaming $old_filename to $new_filename\n";
1094 rename ($old_filename, $new_filename)
1095 || print STDERR "Rename failed: $!\n";
1096 return;
1097 }
1098 # try bdb
1099 $old_filename = "$filename_no_ext.bdb";
1100 if (-f $old_filename) {
1101 print STDERR "Renaming $old_filename to $new_filename\n";
1102 rename ($old_filename, $new_filename)
1103 || print STDERR "Rename failed: $!\n";
1104 return;
1105 }
1106}
1107
1108sub os_dir() {
1109
1110 my $gsdlarch = "";
1111 if(defined $ENV{'GSDLARCH'}) {
1112 $gsdlarch = $ENV{'GSDLARCH'};
1113 }
1114 return $ENV{'GSDLOS'}.$gsdlarch;
1115}
1116
1117# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1118# By default, /greenstone3 for GS3 or /greenstone for GS2.
1119sub get_greenstone_url_prefix() {
1120 # if already set on a previous occasion, just return that
1121 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1122 return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1123
1124 my ($configfile, $urlprefix, $defaultUrlprefix);
1125 my @propertynames = ();
1126
1127 if($ENV{'GSDL3SRCHOME'}) {
1128 $defaultUrlprefix = "/greenstone3";
1129 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1130 push(@propertynames, qw/path\s*\=/);
1131 } else {
1132 $defaultUrlprefix = "/greenstone";
1133 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1134 push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1135 }
1136
1137 $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1138
1139 if(!$urlprefix) { # no values found for URL prefix, use default values
1140 $urlprefix = $defaultUrlprefix;
1141 } else {
1142 #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1143 $urlprefix =~ s/^\///; # remove the starting slash
1144 my @dirs = split(/(\\|\/)/, $urlprefix);
1145 $urlprefix = shift(@dirs);
1146
1147 if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1148 $urlprefix = "/$urlprefix";
1149 }
1150 }
1151
1152 # set for the future
1153 $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1154# print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1155 return $urlprefix;
1156}
1157
1158
1159
1160#
1161# The following comaes from activate.pl
1162#
1163# Designed to work with a server included with GS.
1164# - For GS3, we ask ant for the library URL.
1165# - For GS2, we derive the URL from the llssite.cfg file.
1166
1167sub get_full_greenstone_url_prefix
1168{
1169 # if already set on a previous occasion, just return that
1170 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1171 return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1172
1173 my $gs_mode = ($ENV{'GSDL3SRCHOME'}) ? "gs3" : "gs2";
1174
1175 my $url = undef;
1176
1177 if($gs_mode eq "gs2") {
1178 my $llssite_cfg = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "llssite.cfg");
1179
1180 if(-f $llssite_cfg) {
1181 # check llssite.cfg for line with url property
1182 # for server.exe also need to use portnumber and enterlib properties
1183
1184 # Read in the entire contents of the file in one hit
1185 if (!open (FIN, $llssite_cfg)) {
1186 print STDERR "util::get_library_URL() failed to open $llssite_cfg ($!)\n";
1187 return undef;
1188 }
1189
1190 my $contents;
1191 sysread(FIN, $contents, -s FIN);
1192 close(FIN);
1193
1194 my @lines = split(/[\n\r]+/, $contents); # split on carriage-returns and/or linefeeds
1195 my $enterlib = "";
1196 my $portnumber = ""; # will remain empty (implicit port 80) unless it's specifically been assigned
1197
1198 foreach my $line (@lines) {
1199 if($line =~ m/^url=(.*)$/) {
1200 $url = $1;
1201 } elsif($line =~ m/^enterlib=(.*)$/) {
1202 $enterlib = $1;
1203 } elsif($line =~ m/^portnumber=(.*)$/) {
1204 $portnumber = $1;
1205 }
1206 }
1207
1208 if(!$url) {
1209 return undef;
1210 }
1211 elsif($url eq "URL_pending") { # library is not running
1212 # do not process url=URL_pending in the file, since for server.exe
1213 # this just means the Enter Library button hasn't been pressed yet
1214 $url = undef;
1215 }
1216 else {
1217 # In the case of server.exe, need to do extra work to get the proper URL
1218 # But first, need to know whether we're indeed dealing with server.exe:
1219
1220 # compare the URL's domain to the full URL
1221 # E.g. for http://localhost:8383/greenstone3/cgi-bin, the domain is localhost:8383
1222 my $uri = URI->new( $url );
1223 my $host = $uri->host;
1224 #print STDERR "@@@@@ host: $host\n";
1225 if($url =~ m/http:\/\/$host(\/)?$/) {
1226 #if($url !~ m/http:\/\/$host:$portnumber(\/)?/ || $url =~ m/http:\/\/$host(\/)?$/) {
1227 # (if the URL does not contain the portnumber, OR if the port is implicitly 80 and)
1228 # If the domain with http:// prefix is completely the same as the URL, assume server.exe
1229 # then the actual URL is the result of suffixing the port and enterlib properties in llssite.cfg
1230 $url = $url.":".$portnumber.$enterlib;
1231 } # else, apache web server
1232
1233 }
1234 }
1235 } elsif($gs_mode eq "gs3") {
1236 # Either check build.properties for tomcat.server, tomcat.port and app.name (and default servlet name).
1237 # app.name is stored in app.path by build.xml. Need to move app.name in build.properties from build.xml
1238
1239 # Or, run the new target get-default-servlet-url
1240 # the output can look like:
1241 #
1242 # Buildfile: build.xml
1243 # [echo] os.name: Windows Vista
1244 #
1245 # get-default-servlet-url:
1246 # [echo] http://localhost:8383/greenstone3/library
1247 # BUILD SUCCESSFUL
1248 # Total time: 0 seconds
1249
1250 #my $output = qx/ant get-default-servlet-url/; # backtick operator, to get STDOUT (else 2>&1)
1251 # - see http://stackoverflow.com/questions/799968/whats-the-difference-between-perls-backticks-system-and-exec
1252
1253 # The get-default-servlet-url ant target can be run from anywhere by specifying the
1254 # location of GS3's ant build.xml buildfile. Activate.pl can be run from anywhere for GS3
1255 # GSDL3SRCHOME will be set for GS3 by gs3-setup.sh, a step that would have been necessary
1256 # to run the activate.pl script in the first place
1257
1258 my $full_build_xml = &FileUtils::javaFilenameConcatenate($ENV{'GSDL3SRCHOME'},"build.xml");
1259
1260 my $perl_command = "ant -buildfile \"$full_build_xml\" get-default-servlet-url";
1261
1262 if (open(PIN, "$perl_command |")) {
1263 while (defined (my $perl_output_line = <PIN>)) {
1264
1265 if($perl_output_line =~ m@http:\/\/(\S*)@) { # grab all the non-whitespace chars
1266 $url="http://".$1;
1267 }
1268 }
1269 close(PIN);
1270 } else {
1271 print STDERR "util::get_library_URL() failed to run $perl_command to work out library URL for $gs_mode\n";
1272 }
1273 }
1274
1275 # either the url is still undef or it is now set
1276 #print STDERR "\n@@@@@ final URL:|$url|\n" if $url;
1277 #print STDERR "\n@@@@@ URL still undef\n" if !$url;
1278
1279 $ENV{'FULL_GREENSTONE_URL_PREFIX'} = $url;
1280
1281 return $url;
1282}
1283
1284
1285# Given a config file (xml or java properties file) and a list/array of regular expressions
1286# that represent property names to match on, this function will return the value for the 1st
1287# matching property name. If the return value is undefined, no matching property was found.
1288sub extract_propvalue_from_file() {
1289 my ($configfile, $propertynames) = @_;
1290
1291 my $value;
1292 unless(open(FIN, "<$configfile")) {
1293 print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1294 return $value; # not initialised
1295 }
1296
1297 # Read the entire file at once, as one single line, then close it
1298 my $filecontents;
1299 {
1300 local $/ = undef;
1301 $filecontents = <FIN>;
1302 }
1303 close(FIN);
1304
1305 foreach my $regex (@$propertynames) {
1306 ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1307 if($value) {
1308 $value =~ s/^\"//; # remove any startquotes
1309 $value =~ s/\".*$//; # remove the 1st endquotes (if any) followed by any xml
1310 last; # found value for a matching property, break from loop
1311 }
1312 }
1313
1314 return $value;
1315}
1316
1317# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1318# given that perllib is in @INC in order to invoke this subroutine.
1319# Call as follows -- after setting up INC to include perllib and
1320# after setting up GSDLHOME and GSDLOS:
1321#
1322# require util;
1323# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1324#
1325sub setup_greenstone_env() {
1326 my ($GSDLHOME, $GSDLOS) = @_;
1327
1328 #my %env_map = ();
1329 # Get the localised ENV settings of running a localised source setup.bash
1330 # and put it into the ENV here. Need to clear GSDLHOME before running setup
1331 #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1332 my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";
1333 if (($GSDLOS =~ m/windows/i) && ($^O ne "cygwin")) {
1334 #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1335 $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1336 }
1337 if (!open(PIN, "$perl_command |")) {
1338 print STDERR ("Unable to execute command: $perl_command. $!\n");
1339 }
1340
1341 while (defined (my $perl_output_line = <PIN>)) {
1342 my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1343 #$env_map{$key}=$value;
1344 $ENV{$key}=$value;
1345 }
1346 close (PIN);
1347
1348 # If any keys in $ENV don't occur in Greenstone's localised env
1349 # (stored in $env_map), delete those entries from $ENV
1350 #foreach $key (keys %ENV) {
1351 # if(!defined $env_map{$key}) {
1352 # print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{'$key'}\n";
1353 # delete $ENV{$key}; # del $ENV(key, value) pair
1354 # }
1355 #}
1356 #undef %env_map;
1357}
1358
1359sub get_perl_exec() {
1360 my $perl_exec = $^X; # may return just "perl"
1361
1362 if($ENV{'PERLPATH'}) {
1363 # OR: # $perl_exec = &FileUtils::filenameConcatenate($ENV{'PERLPATH'},"perl");
1364 if (($ENV{'GSDLOS'} =~ m/windows/) && ($^O ne "cygwin")) {
1365 $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1366 } else {
1367 $perl_exec = "$ENV{'PERLPATH'}/perl";
1368 }
1369 } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1370 # containing the full path to the current perl executable we're using
1371 $perl_exec = $Config{perlpath}; # configured path for perl
1372 if (!-e $perl_exec) { # may not point to location on this machine
1373 $perl_exec = $^X; # may return just "perl"
1374 if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1375 print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";
1376 }
1377 }
1378 }
1379
1380 return $perl_exec;
1381}
1382
1383# returns the path to the java command in the JRE included with GS (if any),
1384# quoted to safeguard any spaces in this path, otherwise a simple java
1385# command is returned which assumes and will try for a system java.
1386sub get_java_command {
1387 my $java = "java";
1388 if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1389 # after running setup.bat or from GLI which also runs setup.bat
1390 my $java_bin = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"packages","jre","bin");
1391 if(-d $java_bin) {
1392 $java = &FileUtils::filenameConcatenate($java_bin,"java");
1393 $java = "\"".$java."\""; # quoted to preserve spaces in path
1394 }
1395 }
1396 return $java;
1397}
1398
1399
1400# Given the qualified collection name (colgroup/collection),
1401# returns the collection and colgroup parts
1402sub get_collection_parts {
1403 # http://perldoc.perl.org/File/Basename.html
1404 # my($filename, $directories, $suffix) = fileparse($path);
1405 # "$directories contains everything up to and including the last directory separator in the $path
1406 # including the volume (if applicable). The remainder of the $path is the $filename."
1407 #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);
1408
1409 my $qualified_collection = shift(@_);
1410
1411 # Since activate.pl can be launched from the command-line, including by a user,
1412 # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1413 # Also allow for the accidental inclusion of multiple slashes
1414 my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1415
1416 if(!defined $collection) {
1417 $collection = $colgroup;
1418 $colgroup = "";
1419 }
1420 return ($collection, $colgroup);
1421}
1422
1423# work out the "collectdir/collection" location
1424sub resolve_collection_dir {
1425 my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1426
1427 if (defined $ENV{'GSDLCOLLECTDIR'}) { # a predefined collection dir exists
1428 return $ENV{'GSDLCOLLECTDIR'};
1429 }
1430
1431 my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);
1432
1433 if (!defined $collect_dir || !$collect_dir) { # if undefined or empty string
1434 $collect_dir = &util::get_working_collect_dir($site);
1435 }
1436
1437 return &FileUtils::filenameConcatenate($collect_dir,$colgroup,$collection);
1438}
1439
1440# work out the full path to "collect" of this greenstone 2/3 installation
1441sub get_working_collect_dir {
1442 my ($site) = @_;
1443
1444 if (defined $ENV{'GSDLCOLLECTHOME'}) { # a predefined collect dir exists
1445 return $ENV{'GSDLCOLLECTHOME'};
1446 }
1447
1448 if (defined $site && $site) { # site non-empty, so get default collect dir for GS3
1449
1450 if (defined $ENV{'GSDL3HOME'}) {
1451 return &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'},"sites",$site,"collect"); # web folder
1452 }
1453 elsif (defined $ENV{'GSDL3SRCHOME'}) {
1454 return &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'},"web","sites",$site,"collect");
1455 }
1456 }
1457
1458 elsif (defined $ENV{'SITEHOME'}) {
1459 return &FileUtils::filenameConcatenate($ENV{'SITEHOME'},"collect");
1460 }
1461
1462 else { # get default collect dir for GS2
1463 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
1464 }
1465}
1466
1467sub is_abs_path_any_os {
1468 my ($path) = @_;
1469
1470 # We can have filenames in our DBs that were produced on other OS, so this method exists
1471 # to help identify absolute paths in such cases.
1472
1473 return 1 if($path =~ m@^/@); # full paths begin with forward slash on linux/mac
1474 return 1 if($path =~ m@^([a-zA-Z]\:|\\)@); # full paths begin with drive letter colon for Win or \ for volume, http://stackoverflow.com/questions/13011013/get-only-volume-name-from-filepath
1475
1476 return 0;
1477}
1478
1479
1480# This subroutine is for improving portability of Greenstone collections from one OS to another,
1481# to be used to convert absolute paths going into db files into paths with placeholders instead.
1482# This sub works with util::get_common_gs_paths and takes a path to a greenstone file and, if it's
1483# an absolute path, then it will replace the longest matching greenstone-path prefix of the given
1484# path with a placeholder to match.
1485# The Greenstone-path prefixes that can be matched are the following common Greenstone paths:
1486# the path to the current (specific) collection, the path to the general GS collect directory,
1487# the path to the site directory if GS3, else the path to the GSDLHOME/GSDL3HOME folder.
1488# The longest matching prefix will be replaced with the equivalent placeholder:
1489# @THISCOLLECTPATH@, else @COLLECTHOME@, else @SITEHOME@, else @GSDLHOME@.
1490sub abspath_to_placeholders {
1491 my $path = shift(@_); # path to convert from absolute to one with placeholders
1492 my $opt_long_or_short_winfilenames = shift(@_) || "short"; # whether we want to force use of long file names even on windows, default uses short
1493
1494 return $path unless is_abs_path_any_os($path); # path is relative
1495
1496 if ($opt_long_or_short_winfilenames eq "long") {
1497 $path = &util::upgrade_if_dos_filename($path); # will only do something on windows
1498 }
1499
1500 # now we know we're dealing with absolute paths and have to replace gs prefixes with placeholders
1501 my @gs_paths = ($ENV{'GSDLCOLLECTDIR'}, $ENV{'GSDLCOLLECTHOME'}, $ENV{'SITEHOME'}, $ENV{'GREENSTONEHOME'}); # list in this order: from longest to shortest path
1502
1503 my %placeholder_map = ($ENV{'GREENSTONEHOME'} => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1504 $ENV{'GSDLCOLLECTHOME'} => '@COLLECTHOME@',
1505 $ENV{'GSDLCOLLECTDIR'} => '@THISCOLLECTPATH@'
1506 );
1507 $placeholder_map{$ENV{'SITEHOME'}} = '@SITEHOME@' if defined $ENV{'SITEHOME'};
1508
1509 $path = &util::_abspath_to_placeholders($path, \@gs_paths, \%placeholder_map);
1510
1511 if ($ENV{'GSDLOS'} =~ /^windows$/i && $opt_long_or_short_winfilenames eq "short") {
1512 # for windows need to look for matches on short file names too
1513 # matched paths are again to be replaced with the usual placeholders
1514
1515 my $gsdlcollectdir = &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTDIR'});
1516 my $gsdlcollecthome = &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTHOME'});
1517 my $sitehome = (defined $ENV{'SITEHOME'}) ? &util::downgrade_if_dos_filename($ENV{'SITEHOME'}) : undef;
1518 my $greenstonehome = &util::downgrade_if_dos_filename($ENV{'GREENSTONEHOME'});
1519
1520 @gs_paths = ($gsdlcollectdir, $gsdlcollecthome, $sitehome, $greenstonehome); # order matters
1521
1522 %placeholder_map = ($greenstonehome => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1523 $gsdlcollecthome => '@COLLECTHOME@',
1524 $gsdlcollectdir => '@THISCOLLECTPATH@'
1525 );
1526 $placeholder_map{$sitehome} = '@SITEHOME@' if defined $sitehome;
1527
1528 $path = &util::_abspath_to_placeholders($path, \@gs_paths, \%placeholder_map);
1529 }
1530
1531 return $path;
1532}
1533
1534sub _abspath_to_placeholders {
1535 my ($path, $gs_paths_ref, $placeholder_map_ref) = @_;
1536
1537 # The sequence of elements in @gs_paths matters
1538 # Need to loop starting from the *longest* matching path (the path to the specific collection)
1539 # to the shortest matching path (the path to gsdlhome/gsdl3home folder):
1540
1541 foreach my $gs_path (@$gs_paths_ref) {
1542 next if(!defined $gs_path); # site undefined for GS2
1543
1544 my $re_path = &util::filename_to_regex($gs_path); # escape for regex
1545
1546 if($path =~ m/^$re_path/i) { # case sensitive or not for OS?
1547
1548 my $placeholder = $placeholder_map_ref->{$gs_path}; # get the placeholder to replace the matched path with
1549
1550 $path =~ s/^$re_path/$placeholder/; #case sensitive or not?
1551 #$path =~ s/^[\\\/]//; # remove gs_path's trailing separator left behind at the start of the path
1552 # lowercase file extension, This is needed when shortfilenames are used, as case affects alphetical ordering, which affects diffcol
1553 $path =~ s/\.([A-Z]+)$/".".lc($1)/e;
1554 last; # done
1555 }
1556 }
1557
1558 return $path;
1559}
1560
1561# Function that does the reverse of the util::abspath_to_placeholders subroutine
1562# Once again, call this with the values returned from util::get_common_gs_paths
1563sub placeholders_to_abspath {
1564 my $path = shift(@_); # path that can contain placeholders to convert to resolved absolute path
1565 my $opt_long_or_short_winfilenames = shift(@_) || "short"; # whether we want to force use of long file names even on windows, default uses short
1566
1567 return $path if($path !~ m/@/); # path contains no placeholders
1568
1569 # replace placeholders with gs prefixes
1570 my @placeholders = ('@THISCOLLECTPATH@', '@COLLECTHOME@', '@SITEHOME@', '@GSDLHOME@'); # order of paths not crucial in this case,
1571 # but listed here from longest to shortest once placeholders are have been resolved
1572
1573 # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1574 my %placeholder_to_gspath_map;
1575 if ($ENV{'GSDLOS'} =~ /^windows$/i && $opt_long_or_short_winfilenames eq "short") {
1576 # always replace placeholders with short file names of the absolute paths on windows?
1577 %placeholder_to_gspath_map = ('@GSDLHOME@' => &util::downgrade_if_dos_filename($ENV{'GREENSTONEHOME'}),
1578 '@COLLECTHOME@' => &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTHOME'}),
1579 '@THISCOLLECTPATH@' => &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTDIR'})
1580 );
1581 $placeholder_to_gspath_map{'@SITEHOME@'} = &util::downgrade_if_dos_filename($ENV{'SITEHOME'}) if defined $ENV{'SITEHOME'};
1582 } else {
1583 %placeholder_to_gspath_map = ('@GSDLHOME@' => $ENV{'GREENSTONEHOME'},
1584 '@SITEHOME@' => $ENV{'SITEHOME'}, # can be undef
1585 '@COLLECTHOME@' => $ENV{'GSDLCOLLECTHOME'},
1586 '@THISCOLLECTPATH@' => $ENV{'GSDLCOLLECTDIR'}
1587 ); # $placeholder_to_gspath_map{'@SITEHOME@'} = $ENV{'SITEHOME'} if defined $ENV{'SITEHOME'};
1588 }
1589
1590 foreach my $placeholder (@placeholders) {
1591 my $gs_path = $placeholder_to_gspath_map{$placeholder};
1592
1593 next if(!defined $gs_path); # sitehome for GS2 is undefined
1594
1595 if($path =~ m/^$placeholder/) {
1596 $path =~ s/^$placeholder/$gs_path/;
1597 last; # done
1598 }
1599 }
1600
1601 return $path;
1602}
1603
1604# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1605# a directory containing sequentially numbered images.
1606sub create_itemfile
1607{
1608 my ($output_dir, $convert_basename, $convert_to) = @_;
1609 my $page_num = "";
1610
1611 opendir(DIR, $output_dir) || die "can't opendir $output_dir: $!";
1612 my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1613 closedir DIR;
1614
1615 # Sort files in the directory by page_num
1616 sub page_number {
1617 my ($dir) = @_;
1618 my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
1619# my ($pagenum) =($dir =~ m/(\d+)(\.(jpg|gif|png))?$/i); # this works but is not as safe/strict about input filepatterns as the above
1620
1621 $pagenum = 1 unless defined $pagenum;
1622 return $pagenum;
1623 }
1624
1625 # sort the files in the directory in the order of page_num rather than lexically.
1626 @dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1627
1628 # work out if the numbering of the now sorted image files starts at 0 or not
1629 # by checking the number of the first _image_ file (skipping item files)
1630 my $starts_at_0 = 0;
1631 my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1632 if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1633 $starts_at_0 = 1;
1634 }
1635
1636 my $item_file = &FileUtils::filenameConcatenate($output_dir, $convert_basename.".item");
1637 my $item_fh;
1638 &FileUtils::openFileHandle($item_file, 'w', \$item_fh);
1639 print $item_fh "<PagedDocument>\n";
1640
1641 foreach my $file (@dir_files){
1642 if ($file !~ /\.item/i){
1643 $page_num = page_number($file);
1644 $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1645 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1646 }
1647 }
1648
1649 print $item_fh "</PagedDocument>\n";
1650 &FileUtils::closeFileHandle($item_file, \$item_fh);
1651 return $item_file;
1652}
1653
1654# Sets the gnomelib_env. Based on the logic in wvware.pl which can perhaps be replaced with a call to this function in future
1655sub set_gnomelib_env
1656{
1657 ## SET THE ENVIRONMENT AS DONE IN SETUP.BASH/BAT OF GNOME-LIB
1658 # Though this is only needed for darwin Lion at this point (and android, though that is untested)
1659
1660 my $libext = "so";
1661 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1662 return;
1663 } elsif ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
1664 $libext = "dylib";
1665 }
1666
1667 if (!defined $ENV{'GEXTGNOME'}) {
1668 ##print STDERR "@@@ Setting GEXTGNOME env\n";
1669
1670 my $gnome_dir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"ext","gnome-lib-minimal");
1671
1672 if(! -d $gnome_dir) {
1673 $gnome_dir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"ext","gnome-lib");
1674
1675 if(! -d $gnome_dir) {
1676 $gnome_dir = "";
1677 }
1678 }
1679
1680 # now set other the related env vars,
1681 # IF we've found the gnome-lib dir installed in the ext folder
1682
1683 if ($gnome_dir ne "" && -f &FileUtils::filenameConcatenate($gnome_dir, $ENV{'GSDLOS'}, "lib", "libiconv.$libext")) {
1684 $ENV{'GEXTGNOME'} = $gnome_dir;
1685 $ENV{'GEXTGNOME_INSTALLED'}=&FileUtils::filenameConcatenate($ENV{'GEXTGNOME'}, $ENV{'GSDLOS'});
1686
1687 my $gnomelib_bin = &FileUtils::filenameConcatenate($ENV{'GEXTGNOME_INSTALLED'}, "bin");
1688 if(-d $gnomelib_bin) { # no bin subfolder in GS binary's cutdown gnome-lib-minimal folder
1689 &util::envvar_prepend("PATH", $gnomelib_bin);
1690 }
1691
1692 # util's prepend will create LD/DYLD_LIB_PATH if it doesn't exist yet
1693 my $gextlib = &FileUtils::filenameConcatenate($ENV{'GEXTGNOME_INSTALLED'}, "lib");
1694
1695 if($ENV{'GSDLOS'} eq "linux") {
1696 &util::envvar_prepend("LD_LIBRARY_PATH", $gextlib);
1697 }
1698 elsif ($ENV{'GSDLOS'} eq "darwin") {
1699 #&util::envvar_prepend("DYLD_LIBRARY_PATH", $gextlib);
1700 &util::envvar_prepend("DYLD_FALLBACK_LIBRARY_PATH", $gextlib);
1701 }
1702 }
1703
1704 # Above largely mimics the setup.bash of the gnome-lib-minimal.
1705 # Not doing the devel-srcpack that gnome-lib-minimal's setup.bash used to set
1706 # Not exporting GSDLEXTS variable either
1707 }
1708
1709# print STDERR "@@@@@ GEXTGNOME: ".$ENV{'GEXTGNOME'}."\n\tINSTALL".$ENV{'GEXTGNOME_INSTALLED'}."\n";
1710# print STDERR "\tPATH".$ENV{'PATH'}."\n";
1711# print STDERR "\tLD_LIB_PATH".$ENV{'LD_LIBRARY_PATH'}."\n" if $ENV{'LD_LIBRARY_PATH};
1712# print STDERR "\tDYLD_FALLBACK_LIB_PATH".$ENV{'DYLD_FALLBACK_LIBRARY_PATH'}."\n" if $ENV{'DYLD_FALLBACK_LIBRARY_PATH};
1713
1714 # if no GEXTGNOME, maybe users didn't need gnome-lib to run gnomelib/libiconv dependent binaries like hashfile, suffix, wget
1715 # (wvware is launched in a gnomelib env from its own script, but could possibly go through this script in future)
1716}
1717
1718
1719
1720## @function augmentINC()
1721#
1722# Prepend a path (if it exists) onto INC but only if it isn't already in INC
1723# @param $new_path The path to add
1724# @author jmt12
1725#
1726sub augmentINC
1727{
1728 my ($new_path) = @_;
1729 my $did_add_path = 0;
1730 # might need to be replaced with FileUtils::directoryExists() call eventually
1731 if (-d $new_path)
1732 {
1733 my $did_find_path = 0;
1734 foreach my $existing_path (@INC)
1735 {
1736 if ($existing_path eq $new_path)
1737 {
1738 $did_find_path = 1;
1739 last;
1740 }
1741 }
1742 if (!$did_find_path)
1743 {
1744 unshift(@INC, $new_path);
1745 $did_add_path = 1;
1746 }
1747 }
1748 return $did_add_path;
1749}
1750## augmentINC()
1751
1752
17531;
Note: See TracBrowser for help on using the repository browser.