source: main/trunk/greenstone2/perllib/util.pm@ 37736

Last change on this file since 37736 was 37736, checked in by davidb, 12 months ago

When testing incremental addition with a manifest file on Linux, it was found that the 'lc()' RE caused issues with working out which files have previously been seen or not. The test collection happened to use .JPG for its images. The resulted in all the images being reprocessed again, as they did not match what was stored in the archive's DB file. Based on the comment that was already there that detailed wy the lc() was being used (to do with testing on Windows with diffcol), it was decided to let the lc() remain if on Windows but to not do this when on non-Windows systems (Linux, MacOS) as these file systems definitely are case-sensitive (whereas Windows is typically case sensitive these days, but not always, especially if we are changing filenames back to DOS shortened versions, as is done in part of the Greenstone Perl code)

  • Property svn:keywords set to Author Date Id Revision
File size: 69.6 KB
Line 
1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package util;
27
28use strict;
29no strict 'refs'; # make an exception so we can use variables as filehandles
30use FileUtils;
31
32use Encode;
33use Unicode::Normalize 'normalize';
34
35use File::Copy;
36use File::Basename;
37# Config for getting the perlpath in the recommended way, though it uses paths that are
38# hard-coded into the Config file that's generated upon configuring and compiling perl.
39# $^X works better in some cases to return the path to perl used to launch the script,
40# but if launched with plain "perl" (no full-path), that will be just what it returns.
41use Config;
42# New module for file related utility functions - intended as a
43# placeholder for an extension that allows a variety of different
44# filesystems (FTP, HTTP, SAMBA, WEBDav, HDFS etc)
45use FileUtils;
46
47if ($ENV{'GSDLOS'} =~ /^windows$/i) {
48 require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
49}
50
51# removes files (but not directories)
52sub rm {
53 warnings::warnif("deprecated", "util::rm() is deprecated, using FileUtils::removeFiles() instead");
54 return &FileUtils::removeFiles(@_);
55}
56
57# recursive removal
58sub filtered_rm_r {
59 warnings::warnif("deprecated", "util::filtered_rm_r() is deprecated, using FileUtils::removeFilesFiltered() instead");
60 return &FileUtils::removeFilesFiltered(@_);
61}
62
63# recursive removal
64sub rm_r {
65 warnings::warnif("deprecated", "util::rm_r() is deprecated, using FileUtils::removeFilesRecursive() instead");
66 return &FileUtils::removeFilesRecursive(@_);
67}
68
69# moves a file or a group of files
70sub mv {
71 warnings::warnif("deprecated", "util::mv() is deprecated, using FileUtils::moveFiles() instead");
72 return &FileUtils::moveFiles(@_);
73}
74
75# Move the contents of source directory into target directory
76# (as opposed to merely replacing target dir with the src dir)
77# This can overwrite any files with duplicate names in the target
78# but other files and folders in the target will continue to exist
79sub mv_dir_contents {
80 warnings::warnif("deprecated", "util::mv_dir_contents() is deprecated, using FileUtils::moveDirectoryContents() instead");
81 return &FileUtils::moveDirectoryContents(@_);
82}
83
84# copies a file or a group of files
85sub cp {
86 warnings::warnif("deprecated", "util::cp() is deprecated, using FileUtils::copyFiles() instead");
87 return &FileUtils::copyFiles(@_);
88}
89
90# recursively copies a file or group of files
91# syntax: cp_r (sourcefiles, destination directory)
92# destination must be a directory - to copy one file to
93# another use cp instead
94sub cp_r {
95 warnings::warnif("deprecated", "util::cp_r() is deprecated, using FileUtils::copyFilesrecursive() instead");
96 return &FileUtils::copyFilesRecursive(@_);
97}
98
99# recursively copies a file or group of files
100# syntax: cp_r (sourcefiles, destination directory)
101# destination must be a directory - to copy one file to
102# another use cp instead
103sub cp_r_nosvn {
104 warnings::warnif("deprecated", "util::cp_r_nosvn() is deprecated, using FileUtils::copyFilesRecursiveNoSVN() instead");
105 return &FileUtils::copyFilesRecursiveNoSVN(@_);
106}
107
108# copies a directory and its contents, excluding subdirectories, into a new directory
109sub cp_r_toplevel {
110 warnings::warnif("deprecated", "util::cp_r_toplevel() is deprecated, using FileUtils::recursiveCopyTopLevel() instead");
111 return &FileUtils::recursiveCopyTopLevel(@_);
112}
113
114sub mk_dir {
115 warnings::warnif("deprecated", "util::mk_dir() is deprecated, using FileUtils::makeDirectory() instead");
116 return &FileUtils::makeDirectory(@_);
117}
118
119# in case anyone cares - I did some testing (using perls Benchmark module)
120# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
121# slightly faster (surprisingly) - Stefan.
122sub mk_all_dir {
123 warnings::warnif("deprecated", "util::mk_all_dir() is deprecated, using FileUtils::makeAllDirectories() instead");
124 return &FileUtils::makeAllDirectories(@_);
125}
126
127# make hard link to file if supported by OS, otherwise copy the file
128sub hard_link {
129 warnings::warnif("deprecated", "util::hard_link() is deprecated, using FileUtils::hardLink() instead");
130 return &FileUtils::hardLink(@_);
131}
132
133# make soft link to file if supported by OS, otherwise copy file
134sub soft_link {
135 warnings::warnif("deprecated", "util::soft_link() is deprecated, using FileUtils::softLink() instead");
136 return &FileUtils::softLink(@_);
137}
138
139# Primarily for filenames generated by processing
140# content of HTML files (which are mapped to UTF-8 internally)
141#
142# To turn this into an octet string that really exists on the file
143# system:
144# 1. don't need to do anything special for Unix-based systems
145# (as underlying file system is byte-code)
146# 2. need to map to short DOS filenames for Windows
147
148sub utf8_to_real_filename
149{
150 my ($utf8_filename) = @_;
151
152 my $real_filename;
153
154 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
155 require Win32;
156
157 my $unicode_filename = decode("utf8",$utf8_filename);
158 $real_filename = Win32::GetShortPathName($unicode_filename);
159 }
160 else {
161 $real_filename = $utf8_filename;
162 }
163
164 return $real_filename;
165}
166
167sub raw_filename_to_unicode
168{
169 my ($directory, $raw_file, $filename_encoding ) = @_;
170
171 my $unicode_filename = $raw_file;
172 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
173 # Try turning a short version to the long version
174 # If there are "funny" characters in the file name, that can't be represented in the ANSI code, then we will have a short weird version, eg E74~1.txt
175 $unicode_filename = &util::get_dirsep_tail(&util::upgrade_if_dos_filename(&FileUtils::filenameConcatenate($directory, $raw_file), 0));
176
177
178 if ($unicode_filename eq $raw_file) {
179 # This means the original filename *was* able to be encoded in the local ANSI file encoding (eg windows_1252), so now we turn it back to perl's unicode
180
181 $unicode_filename = &Encode::decode(locale_fs => $unicode_filename);
182 }
183 # else This means we did have one of the funny filenames. the getLongPathName (used in upgrade_if_dos_filename) will return unicode, so we don't need to do anything more.
184
185
186 } else {
187 # we had a utf-8 string, turn it into perl internal unicode
188 $unicode_filename = &Encode::decode("utf-8", $unicode_filename);
189
190
191 }
192 #Does the filename have url encoded chars in it?
193 if (&unicode::is_url_encoded($unicode_filename)) {
194 $unicode_filename = &unicode::url_decode($unicode_filename);
195 }
196
197 # Normalise the filename to canonical composition - on mac, filenames use decopmposed form for accented chars
198 if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
199 $unicode_filename = normalize('C', $unicode_filename); # Composed form 'C'
200 }
201 return $unicode_filename;
202
203}
204sub fd_exists {
205 warnings::warnif("deprecated", "util::fd_exists() is deprecated, using FileUtils::fileTest() instead");
206 return &FileUtils::fileTest(@_);
207}
208
209sub file_exists {
210 warnings::warnif("deprecated", "util::file_exists() is deprecated, using FileUtils::fileExists() instead");
211 return &FileUtils::fileExists(@_);
212}
213
214sub dir_exists {
215 warnings::warnif("deprecated", "util::dir_exists() is deprecated, using FileUtils::directoryExists() instead");
216 return &FileUtils::directoryExists(@_);
217}
218
219# updates a copy of a directory in some other part of the filesystem
220# verbosity settings are: 0=low, 1=normal, 2=high
221# both $fromdir and $todir should be absolute paths
222sub cachedir {
223 warnings::warnif("deprecated", "util::cachedir() is deprecated, using FileUtils::synchronizeDirectories() instead");
224 return &FileUtils::synchronizeDirectories(@_);
225}
226
227# this function returns -1 if either file is not found
228# assumes that $file1 and $file2 are absolute file names or
229# in the current directory
230# $file2 is allowed to be newer than $file1
231sub differentfiles {
232 warnings::warnif("deprecated", "util::differentfiles() is deprecated, using FileUtils::differentFiles() instead");
233 return &FileUtils::differentFiles(@_);
234}
235
236
237# works out the temporary directory, including in the case where Greenstone is not writable
238# In that case, gs3-setup.bat would already have set the GS_TMP_OUTPUT_DIR temp variable
239sub determine_tmp_dir
240{
241 my $try_collect_dir = shift(@_) || 0;
242
243 my $tmp_dirname;
244 if(defined $ENV{'GS_TMP_OUTPUT_DIR'}) {
245 $tmp_dirname = $ENV{'GS_TMP_OUTPUT_DIR'};
246 } elsif($try_collect_dir && defined $ENV{'GSDLCOLLECTDIR'}) {
247 $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
248 } elsif(defined $ENV{'GSDLHOME'}) {
249 $tmp_dirname = $ENV{'GSDLHOME'};
250 } else {
251 return undef;
252 }
253
254 if(!defined $ENV{'GS_TMP_OUTPUT_DIR'}) {
255 # test the tmp_dirname folder is writable, by trying to write out a file
256 # Unfortunately, cound not get if(-w $dirname) to work on directories on Windows
257 ## http://alvinalexander.com/blog/post/perl/perl-file-test-operators-reference-cheat-sheet (test file/dir writable)
258 ## http://www.makelinux.net/alp/083 (real and effective user IDs)
259
260 my $tmp_test_file = &FileUtils::filenameConcatenate($tmp_dirname, "writability_test.tmp");
261 if (open (FOUT, ">$tmp_test_file")) {
262 close(FOUT);
263 &FileUtils::removeFiles($tmp_test_file);
264 } else { # location not writable, use TMP location
265 if (defined $ENV{'TMP'}) {
266 $tmp_dirname = $ENV{'TMP'};
267 } else {
268 $tmp_dirname = "/tmp";
269 }
270 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "greenstone");
271 $ENV{'GS_TMP_OUTPUT_DIR'} = $tmp_dirname; # store for next time
272 }
273 }
274
275 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "tmp");
276 &FileUtils::makeAllDirectories ($tmp_dirname) unless -e $tmp_dirname;
277
278 return $tmp_dirname;
279}
280
281sub get_tmp_filename
282{
283 my $file_ext = shift(@_) || undef;
284
285 my $opt_dot_file_ext = "";
286 if (defined $file_ext) {
287 if ($file_ext !~ m/\./) {
288 # no dot, so needs one added in at start
289 $opt_dot_file_ext = ".$file_ext"
290 }
291 else {
292 # allow for "extensions" such as _metadata.txt to be handled
293 # gracefully
294 $opt_dot_file_ext = $file_ext;
295 }
296 }
297
298 my $tmpdir = &util::determine_tmp_dir(0);
299
300 my $count = 1000;
301 my $rand = int(rand $count);
302 my $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
303
304 while (-e $full_tmp_filename) {
305 $rand = int(rand $count);
306 $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
307 $count++;
308 }
309
310 return $full_tmp_filename;
311}
312
313# These 2 are "static" variables used by the get_timestamped_tmp_folder() subroutine below and
314# belong with that function. They help ensure the timestamped tmp folders generated are unique.
315my $previous_timestamp = undef;
316my $previous_timestamp_f = 0; # frequency
317
318sub get_timestamped_tmp_folder
319{
320 my $tmp_dirname = &util::determine_tmp_dir(1);
321
322 # add the timestamp into the path otherwise we can run into problems
323 # if documents have the same name
324 my $timestamp = time;
325
326 if (!defined $previous_timestamp || ($timestamp > $previous_timestamp)) {
327 $previous_timestamp_f = 0;
328 $previous_timestamp = $timestamp;
329 } else {
330 $previous_timestamp_f++;
331 }
332
333 my $time_tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, $timestamp);
334 $tmp_dirname = $time_tmp_dirname;
335 my $i = $previous_timestamp_f;
336
337 if($previous_timestamp_f > 0) {
338 $tmp_dirname = $time_tmp_dirname."_".$i;
339 $i++;
340 }
341 while (-e $tmp_dirname) {
342 $tmp_dirname = $time_tmp_dirname."_".$i;
343 $i++;
344 }
345 &FileUtils::makeDirectory($tmp_dirname);
346
347 return $tmp_dirname;
348}
349
350sub get_timestamped_tmp_filename_in_collection
351{
352
353 my ($input_filename, $output_ext) = @_;
354 # derive tmp filename from input filename
355 my ($tailname, $dirname, $suffix)
356 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
357
358 # softlink to collection tmp dir
359 my $tmp_dirname = &util::get_timestamped_tmp_folder();
360 $tmp_dirname = $dirname unless defined $tmp_dirname;
361
362 # following two steps copied from ConvertBinaryFile
363 # do we need them?? can't use them as is, as they use plugin methods.
364
365 #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
366
367 # URLEncode this since htmls with images where the html filename is utf8 don't seem
368 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
369 # files on the filesystem.
370 #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
371 if (defined $output_ext) {
372 $output_ext = ".$output_ext"; # add the dot
373 } else {
374 $output_ext = $suffix;
375 }
376 $output_ext= lc($output_ext);
377 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$output_ext");
378
379 return $tmp_filename;
380}
381
382sub get_toplevel_tmp_dir
383{
384 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
385}
386
387
388sub get_collectlevel_tmp_dir
389{
390 my $tmp_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
391 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
392
393 return $tmp_dirname;
394}
395
396sub get_parent_folder
397{
398 my ($path) = @_;
399 my ($tailname, $dirname, $suffix)
400 = &File::Basename::fileparse($path, "\\.[^\\.]+\$");
401
402 return &FileUtils::sanitizePath($dirname);
403}
404
405sub filename_to_regex {
406 my $filename = shift (@_);
407
408 # need to make single backslashes double so that regex works
409 $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);
410
411 # note that the first part of a substitution is a regex, so RE chars need to be escaped,
412 # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
413 $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
414 $filename =~ s@\(@\\(@g; # escape brackets
415 $filename =~ s@\)@\\)@g; # escape brackets
416 $filename =~ s@\[@\\[@g; # escape brackets
417 $filename =~ s@\]@\\]@g; # escape brackets
418
419 return $filename;
420}
421
422sub unregex_filename {
423 my $filename = shift (@_);
424
425 # need to put doubled backslashes for regex back to single
426 $filename =~ s/\\\./\./g; # remove RE syntax for .
427 $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
428 $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
429 $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
430 $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
431
432 # \\ goes to \
433 # This is the last step in reverse mirroring the order of steps in filename_to_regex()
434 $filename =~ s/\\\\/\\/g; # remove RE syntax for \
435 return $filename;
436}
437
438sub filename_cat {
439 # I've disabled this warning for now, as every Greenstone perl
440 # script seems to make use of this function and so you drown in a
441 # sea of deprecated warnings [jmt12]
442 # warnings::warnif("deprecated", "util::filename_cat() is deprecated, using FileUtils::filenameConcatenate() instead");
443 return &FileUtils::filenameConcatenate(@_);
444}
445
446
447sub _pathname_cat {
448 my $join_char = shift(@_);
449 my $first_path = shift(@_);
450 my (@pathnames) = @_;
451
452 # If first_path is not null or empty, then add it back into the list
453 if (defined $first_path && $first_path =~ /\S/) {
454 unshift(@pathnames, $first_path);
455 }
456
457 my $pathname = join($join_char, @pathnames);
458
459 # remove duplicate slashes
460 if ($join_char eq ";") {
461 $pathname =~ s/[\\\/]+/\\/g;
462 if ($^O eq "cygwin") {
463 # Once we've collapsed muliple (potentialy > 2) slashes
464 # For cygwin, actually want things double-backslahed
465 $pathname =~ s/\\/\\\\/g;
466 }
467
468 } else {
469 $pathname =~ s/[\/]+/\//g;
470 # DB: want a pathname abc\de.html to remain like this
471 }
472
473 return $pathname;
474}
475
476
477sub pathname_cat {
478 my (@pathnames) = @_;
479
480 my $join_char;
481 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
482 $join_char = ";";
483 } else {
484 $join_char = ":";
485 }
486 return _pathname_cat($join_char,@pathnames);
487}
488
489
490sub javapathname_cat {
491 my (@pathnames) = @_;
492
493 my $join_char;
494
495 # Unlike pathname_cat() above, not interested if running in a Cygwin environment
496 # This is because the java we run is actually a native Windows executable
497
498 if (($ENV{'GSDLOS'} =~ /^windows$/i)) {
499 $join_char = ";";
500 } else {
501 $join_char = ":";
502 }
503 return _pathname_cat($join_char,@pathnames);
504}
505
506
507sub makeFilenameJavaCygwinCompatible
508{
509 my ($java_filename) = @_;
510
511 if ($^O eq "cygwin") {
512 # To be used with a Java program, but under Cygwin
513 # Because the java binary that is native to Windows, need to
514 # convert the Cygwin paths (i.e. Unix style) to be Windows
515 # compatible
516
517 $java_filename = `cygpath -wp "$java_filename"`;
518 chomp($java_filename);
519 $java_filename =~ s%\\%\\\\%g;
520 }
521
522 return $java_filename;
523}
524
525sub tidy_up_oid {
526 my ($OID) = @_;
527 if ($OID =~ /[\.\/\\]/) {
528 print STDERR "Warning, identifier $OID contains periods or slashes(.\\\/), replacing them with _\n";
529 $OID =~ s/[\.\\\/]/_/g; #replace periods and slasshes with _
530 }
531 if ($OID =~ /^\s.*\s$/) {
532 print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
533 # remove starting and trailing whitespace
534 $OID =~ s/^\s+//;
535 $OID =~ s/\s+$//;
536 }
537 if ($OID =~ /\s/) {
538 print STDERR "Warning, identifier $OID contains whitespace. Replacing it with _\n";
539 $OID =~ s/\s+/_/g; # replace whitespace with _
540 }
541 if ($OID =~ /^[\d]*$/) {
542 print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
543 $OID = "D" . $OID;
544 }
545
546 return $OID;
547}
548
549sub envvar_prepend {
550 my ($var,$val) = @_;
551
552 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
553 ## my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
554
555 # Rewritten above to make ":" the default (Windows is the special
556 # case, anything else 'unusual' such as Solaris etc is Unix)
557 my $pathsep = (defined $ENV{'GSDLOS'} && (($ENV{'GSDLOS'} =~ m/windows/) && ($^O ne "cygwin"))) ? ";" : ":";
558
559 # do not prepend any value/path that's already in the environment variable
560
561 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
562 if (!defined($ENV{$var})) {
563 $ENV{$var} = "$val";
564 }
565 elsif($ENV{$var} !~ m/$escaped_val/) {
566 $ENV{$var} = "$val".$pathsep.$ENV{$var};
567 }
568}
569
570sub envvar_append {
571 my ($var,$val) = @_;
572
573 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
574 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
575
576 # do not append any value/path that's already in the environment variable
577
578 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
579 if (!defined($ENV{$var})) {
580 $ENV{$var} = "$val";
581 }
582 elsif($ENV{$var} !~ m/$escaped_val/) {
583 $ENV{$var} = $ENV{$var}.$pathsep."$val";
584 }
585}
586
587# debug aid
588sub print_env {
589 my ($handle, @envvars) = @_; # print to $handle, which can be STDERR/STDOUT/file, etc.
590
591 if (scalar(@envvars) == 0) {
592 #print $handle "@@@ All env vars requested\n";
593
594 my $output = "";
595
596 print $handle "@@@ Environment was:\n********\n";
597 foreach my $envvar (sort keys(%ENV)) {
598 if(defined $ENV{$envvar}) {
599 print $handle "\t$envvar = $ENV{$envvar}\n";
600 } else {
601 print $handle "\t$envvar = \n";
602 }
603 }
604 print $handle "********\n";
605 } else {
606 print $handle "@@@ Environment was:\n********\n";
607 foreach my $envvar (@envvars) {
608 if(defined $ENV{$envvar}) {
609 print $handle "\t$envvar = ".$ENV{$envvar}."\n";
610 } else {
611 print $handle "Env var '$envvar' was not set\n";
612 }
613 }
614 print $handle "********\n";
615 }
616}
617
618
619# splits a filename into a prefix and a tail extension using the tail_re, or
620# if that fails, splits on the file_extension . (dot)
621sub get_prefix_and_tail_by_regex {
622
623 my ($filename,$tail_re) = @_;
624
625 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
626 if ((!defined $file_prefix) || (!defined $file_ext)) {
627 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
628 }
629
630 return ($file_prefix,$file_ext);
631}
632
633# get full path and file only path from a base_dir (which may be empty) and
634# file (which may contain directories)
635sub get_full_filenames {
636 my ($base_dir, $file) = @_;
637
638 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
639 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
640 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
641
642
643 my $filename_full_path = $file;
644 # add on directory if present
645 $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\S/;
646
647 my $filename_no_path = $file;
648
649 # remove directory if present
650 $filename_no_path =~ s/^.*[\/\\]//;
651 return ($filename_full_path, $filename_no_path);
652}
653
654# returns the path of a file without the filename -- ie. the directory the file is in
655sub filename_head {
656 my $filename = shift(@_);
657
658 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
659 $filename =~ s/[^\\\\]*$//;
660 }
661 else {
662 $filename =~ s/[^\\\/]*$//;
663 }
664
665 return $filename;
666}
667
668# Debug function to print the caller at the provided depth or else depth=1 (to skip the function
669# that called this one, which is at depth 0).
670sub debug_print_caller {
671 my $depth = shift(@_);
672 $depth = 1 unless $depth; # start at 1 to skip printing the function that called this one
673 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller($depth);
674 my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
675 print STDERR "** Calling method at depth $depth: $lcfilename:$cline $cpackage->$csubr\n";
676}
677
678# Debug function to print the call stack.
679# Optional param maxdepth: how many callers up the stack to print, *besides* this function's own
680# caller. If maxdepth parameter unspecified, prints the entire call stack.
681sub debug_print_call_stack {
682 my $maxdepth = shift(@_);
683 if($maxdepth) {
684 print STDERR "** CALL STACK UP TO AND INCL. MAX DEPTH OF $maxdepth:\n";
685 } else {
686 print STDERR "** FULL CALL STACK:\n";
687 }
688
689 my $depth = 0; # start by just printing this sub's calling function too
690 while(1) {
691 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller($depth);
692 last unless (defined $cfilename && defined $cline && defined $cpackage); # when call stack printed in full
693 my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
694 print STDERR "\t$lcfilename:$cline $cpackage->$csubr\n";
695 $depth++;
696 # print out caller at $maxdepth too, even though $depth starts at 0
697 # So this method prints out maxdepth+1 callers
698 last if($maxdepth && $depth > $maxdepth);
699 }
700 return "";
701}
702
703
704# returns 1 if filename1 and filename2 point to the same
705# file or directory
706sub filenames_equal {
707 my ($filename1, $filename2) = @_;
708
709 # use filename_cat to clean up trailing slashes and
710 # multiple slashes
711 $filename1 = &FileUtils::filenameConcatenate($filename1);
712 $filename2 = &FileUtils::filenameConcatenate($filename2);
713
714 # filenames not case sensitive on windows
715 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
716 $filename1 =~ tr/[A-Z]/[a-z]/;
717 $filename2 =~ tr/[A-Z]/[a-z]/;
718 }
719 return 1 if $filename1 eq $filename2;
720 return 0;
721}
722
723# If filename is relative to within_dir, returns the relative path of filename to that directory
724# with slashes in the filename returned as they were in the original (absolute) filename.
725sub filename_within_directory
726{
727 my ($filename,$within_dir) = @_;
728
729 if ($within_dir !~ m/[\/\\]$/) {
730 my $dirsep = &util::get_dirsep();
731 $within_dir .= $dirsep;
732 }
733
734 $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets
735 if ($filename =~ m/^$within_dir(.*)$/) {
736 $filename = $1;
737 }
738
739 return $filename;
740}
741
742# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
743# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
744# The subpath returned will also be a URL type filename.
745sub filename_within_directory_url_format
746{
747 my ($filename,$within_dir) = @_;
748
749 # convert parameters only to / slashes if Windows
750
751 my $filename_urlformat = &filepath_to_url_format($filename);
752 my $within_dir_urlformat = &filepath_to_url_format($within_dir);
753
754 #if ($within_dir_urlformat !~ m/\/$/) {
755 # make sure directory ends with a slash
756 #$within_dir_urlformat .= "/";
757 #}
758
759 my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
760
761 #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
762
763 # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
764 if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
765 $filename_urlformat = $1;
766 }
767
768 return $filename_urlformat;
769}
770
771# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
772# since on Linux it doesn't represent a file separator but an escape char).
773sub filepath_to_url_format
774{
775 my ($filepath) = @_;
776 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
777 # Only need to worry about Windows, as Unix style directories already in url-format
778 # Convert Windows style \ => /
779 $filepath =~ s@\\@/@g;
780 }
781 return $filepath;
782}
783
784# regex filepaths on windows may include \\ as path separator. Convert \\ to /
785sub filepath_regex_to_url_format
786{
787 my ($filepath) = @_;
788 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
789 # Only need to worry about Windows, as Unix style directories already in url-format
790 # Convert Windows style \\ => /
791 $filepath =~ s@\\\\@/@g;
792 }
793 return $filepath;
794
795}
796
797# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
798# and ignores trailing /
799# returns (file, dirs) dirs will be empty if no subdirs
800sub url_fileparse
801{
802 my ($filepath) = @_;
803 # remove trailing /
804 $filepath =~ s@/$@@;
805 if ($filepath !~ m@/@) {
806 return ($filepath, "");
807 }
808 my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
809 return ($file, $dirs);
810
811}
812
813
814sub filename_within_collection
815{
816 my ($filename) = @_;
817
818 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
819
820 if (defined $collect_dir) {
821
822 # if from within GSDLCOLLECTDIR, then remove directory prefix
823 # so source_filename is realative to it. This is done to aid
824 # portability, i.e. the collection can be moved to somewhere
825 # else on the file system and the archives directory will still
826 # work. This is needed, for example in the applet version of
827 # GLI where GSDLHOME/collect on the server will be different to
828 # the collect directory of the remove user. Of course,
829 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
830 # it back into a full pathname.
831
832 $filename = filename_within_directory($filename,$collect_dir);
833 }
834
835 return $filename;
836}
837
838sub prettyprint_file
839{
840 my ($base_dir,$file,$gli) = @_;
841
842 my $filename_full_path = &FileUtils::filenameConcatenate($base_dir,$file);
843
844 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
845 require Win32;
846
847 # For some reason base_dir in the form c:/a/b/c
848 # This leads to confusion later on, so turn it back into
849 # the more usual Windows form
850 $base_dir =~ s/\//\\/g;
851 my $long_base_dir = Win32::GetLongPathName($base_dir);
852 my $long_full_path = Win32::GetLongPathName($filename_full_path);
853
854 $file = filename_within_directory($long_full_path,$long_base_dir);
855 $file = encode("utf8",$file) if ($gli);
856 }
857
858 return $file;
859}
860
861
862sub upgrade_if_dos_filename
863{
864 my ($filename_full_path,$and_encode) = @_;
865
866 # if filename_full_path is empty, return it as such even on windows, as that's what happens on linux and anything else breaks PDF bg image assoc files on Windows.
867 return $filename_full_path if($filename_full_path eq "");
868
869 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
870 # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
871 # to its long (Windows) version
872 # GetLongPathName doesn't work if the file doesn't exist - in this case, use the directory instead
873
874 if (-e $filename_full_path) {
875 my $long_filename = Win32::GetLongPathName($filename_full_path);
876 if (defined $long_filename) {
877
878 $filename_full_path = $long_filename;
879 }
880 } else {
881
882 my ($tailname, $dirname, $suffix)
883 = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
884 my $long_dirname = Win32::GetLongPathName($dirname);
885 if (defined $long_dirname) {
886 $filename_full_path = &FileUtils::filenameConcatenate($long_dirname, "$tailname$suffix");
887 }
888 }
889
890 # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
891 $filename_full_path =~ s/^(.):/\u$1:/;
892
893 if ((defined $and_encode) && ($and_encode)) {
894 $filename_full_path = encode("utf8",$filename_full_path);
895 }
896 }
897
898 return $filename_full_path;
899}
900
901sub downgrade_if_dos_filename
902{
903 my ($filename_full_path) = @_;
904
905 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
906 require Win32;
907
908 # Ensure the given long Windows filename is in a form that can
909 # be opened by Perl => convert it to a short DOS-like filename
910 # GetShortPathName doesn't work if the file doesn't exist - in this case, use the directory instead
911 if (-e $filename_full_path) {
912 my $short_filename = Win32::GetShortPathName($filename_full_path);
913 if (defined $short_filename) {
914 $filename_full_path = $short_filename;
915 }
916 } else {
917 my ($tailname, $dirname, $suffix)
918 = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
919 my $short_dirname = Win32::GetShortPathName($dirname);
920 if (defined $short_dirname) {
921 $filename_full_path = &FileUtils::filenameConcatenate($short_dirname, "$tailname$suffix");
922 }
923
924 }
925 # Make sure initial drive letter is lower-case (to fit in
926 # with rest of Greenstone)
927 $filename_full_path =~ s/^(.):/\u$1:/;
928 }
929
930 return $filename_full_path;
931}
932
933
934sub filename_is_absolute
935{
936 warnings::warnif("deprecated", "util::filename_is_absolute() is deprecated, using FileUtils::isFilenameAbsolute() instead");
937 return &FileUtils::isFilenameAbsolute(@_);
938}
939
940
941## @method make_absolute()
942#
943# Ensure the given file path is absolute in respect to the given base path.
944#
945# @param $base_dir A string denoting the base path the given dir must be
946# absolute to.
947# @param $dir The directory to be made absolute as a string. Note that the
948# dir may already be absolute, in which case it will remain
949# unchanged.
950# @return The now absolute form of the directory as a string.
951#
952# @author John Thompson, DL Consulting Ltd.
953# @copy 2006 DL Consulting Ltd.
954#
955#used in buildcol.pl, doesn't work for all cases --kjdon
956sub make_absolute {
957
958 my ($base_dir, $dir) = @_;
959 ### print STDERR "dir = $dir\n";
960 $dir =~ s/[\\\/]+/\//g;
961 $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
962 $dir =~ s|^/tmp_mnt||;
963 1 while($dir =~ s|/[^/]*/\.\./|/|g);
964 $dir =~ s|/[.][.]?/|/|g;
965 $dir =~ tr|/|/|s;
966 ### print STDERR "dir = $dir\n";
967
968 return $dir;
969}
970## make_absolute() ##
971
972sub get_dirsep {
973
974 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
975 return "\\";
976 } else {
977 return "\/";
978 }
979}
980
981sub get_os_dirsep {
982
983 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
984 return "\\\\";
985 } else {
986 return "\\\/";
987 }
988}
989
990sub get_re_dirsep {
991
992 return "\\\\|\\\/";
993}
994
995
996sub get_dirsep_tail {
997 my ($filename) = @_;
998
999 # returns last part of directory or filename
1000 # On unix e.g. a/b.d => b.d
1001 # a/b/c => c
1002
1003 my $dirsep = get_re_dirsep();
1004 my @dirs = split (/$dirsep/, $filename);
1005 my $tail = pop @dirs;
1006
1007 # - caused problems under windows
1008 #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
1009
1010 return $tail;
1011}
1012
1013
1014# if this is running on windows we want binaries to end in
1015# .exe, otherwise they don't have to end in any extension
1016sub get_os_exe {
1017 return ".exe" if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin"));
1018 return "";
1019}
1020
1021
1022# test to see whether this is a big or little endian machine
1023sub is_little_endian
1024{
1025 # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
1026 # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
1027 # Otherwise, it's little endian
1028
1029 #return 0 if $^O =~ /^darwin$/i;
1030 #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
1031
1032 # Going back to stating exactly whether the machine is little endian
1033 # or big endian, without any special case for Macs. Since for rata it comes
1034 # back with little endian and for shuttle with bigendian.
1035 return (ord(substr(pack("s",1), 0, 1)) == 1);
1036}
1037
1038
1039# will return the collection name if successful, "" otherwise
1040sub use_collection {
1041 my ($collection, $collectdir) = @_;
1042
1043 if (!defined $collectdir || $collectdir eq "") {
1044 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect");
1045 }
1046
1047 if (!defined $ENV{'GREENSTONEHOME'}) { # for GS3, would have been defined in use_site_collection, to GSDL3HOME
1048 $ENV{'GREENSTONEHOME'} = $ENV{'GSDLHOME'};
1049 }
1050
1051 # get and check the collection
1052 if (!defined($collection) || $collection eq "") {
1053 if (defined $ENV{'GSDLCOLLECTION'}) {
1054 $collection = $ENV{'GSDLCOLLECTION'};
1055 } else {
1056 print STDOUT "No collection specified\n";
1057 return "";
1058 }
1059 }
1060
1061 if ($collection eq "modelcol") {
1062 print STDOUT "You can't use modelcol.\n";
1063 return "";
1064 }
1065
1066 # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
1067 # are defined
1068 $ENV{'GSDLCOLLECTION'} = $collection;
1069 $ENV{'GSDLCOLLECTHOME'} = $collectdir;
1070 $ENV{'GSDLCOLLECTDIR'} = &FileUtils::filenameConcatenate($collectdir, $collection);
1071
1072 # make sure this collection exists
1073 if (!-e $ENV{'GSDLCOLLECTDIR'}) {
1074 print STDOUT "Invalid collection ($collection).\n";
1075 return "";
1076 }
1077
1078 # everything is ready to go
1079 return $collection;
1080}
1081
1082sub get_current_collection_name {
1083 return $ENV{'GSDLCOLLECTION'};
1084}
1085
1086
1087# will return the collection name if successful, "" otherwise.
1088# Like use_collection (above) but for greenstone 3 (taking account of site level)
1089
1090sub use_site_collection {
1091 my ($site, $collection, $collectdir) = @_;
1092
1093 if (!defined $collectdir || $collectdir eq "") {
1094 die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
1095 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, "sites", $site, "collect");
1096 }
1097
1098 if (defined $ENV{'GSDL3HOME'}) {
1099 $ENV{'GREENSTONEHOME'} = $ENV{'GSDL3HOME'};
1100 $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
1101 } elsif (defined $ENV{'GSDL3SRCHOME'}) {
1102 $ENV{'GREENSTONEHOME'} = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "web");
1103 $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
1104 } else {
1105 print STDERR "*** util::use_site_collection(). Warning: Neither GSDL3HOME nor GSDL3SRCHOME set.\n";
1106 }
1107
1108 # collectdir explicitly set by this point (using $site variable if required).
1109 # Can call "old" gsdl2 use_collection now.
1110
1111 return use_collection($collection,$collectdir);
1112}
1113
1114
1115
1116sub locate_config_file
1117{
1118 my ($file) = @_;
1119
1120 my $locations = locate_config_files($file);
1121
1122 return shift @$locations; # returns undef if 'locations' is empty
1123}
1124
1125
1126sub locate_config_files
1127{
1128 my ($file) = @_;
1129
1130 my @locations = ();
1131
1132 if (-e $file) {
1133 # Clearly specified (most likely full filename)
1134 # No need to hunt in 'etc' directories, return value unchanged
1135 push(@locations,$file);
1136 }
1137 else {
1138 # Check for collection specific one before looking in global GSDL 'etc'
1139 if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
1140 my $test_collect_etc_filename
1141 = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"etc", $file);
1142
1143 if (-e $test_collect_etc_filename) {
1144 push(@locations,$test_collect_etc_filename);
1145 }
1146 }
1147 my $test_main_etc_filename
1148 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"etc", $file);
1149 if (-e $test_main_etc_filename) {
1150 push(@locations,$test_main_etc_filename);
1151 }
1152 }
1153
1154 return \@locations;
1155}
1156
1157
1158sub hyperlink_text
1159{
1160 my ($text) = @_;
1161
1162 $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
1163 $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
1164
1165 return $text;
1166}
1167
1168
1169# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
1170# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
1171sub is_dir_empty {
1172 warnings::warnif("deprecated", "util::is_dir_empty() is deprecated, using FileUtils::isDirectoryEmpty() instead");
1173 return &FileUtils::isDirectoryEmpty(@_);
1174}
1175
1176# Returns the given filename converted using either URL encoding or base64
1177# encoding, as specified by $rename_method. If the given filename has no suffix
1178# (if it is just the tailname), then $no_suffix should be some defined value.
1179# rename_method can be url, none, base64
1180sub rename_file {
1181 my ($filename, $rename_method, $no_suffix) = @_;
1182
1183 if(!$filename) { # undefined or empty string
1184 return $filename;
1185 }
1186
1187 if (!$rename_method) {
1188 print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
1189 # Debugging information
1190 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1191 # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
1192 $rename_method = "url";
1193 } elsif($rename_method eq "none") {
1194 return $filename; # would have already been renamed
1195 }
1196
1197 # No longer replace spaces with underscores, since underscores mess with incremental rebuild
1198 ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
1199 ###$filename =~ s/ /_/g;
1200
1201 my ($tailname,$dirname,$suffix);
1202 if($no_suffix) { # given a tailname, no suffix
1203 ($tailname,$dirname) = File::Basename::fileparse($filename);
1204 }
1205 else {
1206 ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
1207 }
1208 if (!$suffix) {
1209 $suffix = "";
1210 }
1211 # This breaks GLI matching extracted metadata to files in Enrich panel, as
1212 # original is eg .JPG while gsdlsourcefilename ends up .jpg
1213 # Not sure why it was done in first place...
1214 #else {
1215 #$suffix = lc($suffix);
1216 #}
1217
1218 if ($rename_method eq "url") {
1219 $tailname = &unicode::url_encode($tailname);
1220 }
1221 elsif ($rename_method eq "base64") {
1222 $tailname = &unicode::force_base64_encode($tailname);
1223 $tailname =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
1224 }
1225
1226 $filename = "$tailname$suffix";
1227 $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
1228
1229 return $filename;
1230}
1231
1232
1233# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
1234sub rename_ldb_or_bdb_file {
1235 my ($filename_no_ext) = @_;
1236
1237 my $new_filename = "$filename_no_ext.gdb";
1238 return if (-f $new_filename); # if the file has the right extension, don't need to do anything
1239 # try ldb
1240 my $old_filename = "$filename_no_ext.ldb";
1241
1242 if (-f $old_filename) {
1243 print STDERR "Renaming $old_filename to $new_filename\n";
1244 rename ($old_filename, $new_filename)
1245 || print STDERR "Rename failed: $!\n";
1246 return;
1247 }
1248 # try bdb
1249 $old_filename = "$filename_no_ext.bdb";
1250 if (-f $old_filename) {
1251 print STDERR "Renaming $old_filename to $new_filename\n";
1252 rename ($old_filename, $new_filename)
1253 || print STDERR "Rename failed: $!\n";
1254 return;
1255 }
1256}
1257
1258sub os_dir() {
1259
1260 my $gsdlarch = "";
1261 if(defined $ENV{'GSDLARCH'}) {
1262 $gsdlarch = $ENV{'GSDLARCH'};
1263 }
1264 return $ENV{'GSDLOS'}.$gsdlarch;
1265}
1266
1267# returns 1 if this (GS server) is a GS3 installation, returns 0 if it's GS2.
1268sub is_gs3() {
1269 if($ENV{'GSDL3SRCHOME'}) {
1270 return 1;
1271 } else {
1272 return 0;
1273 }
1274}
1275
1276# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1277# By default, /greenstone3 for GS3 or /greenstone for GS2.
1278sub get_greenstone_url_prefix() {
1279 # if already set on a previous occasion, just return that
1280 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1281 return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1282
1283 my ($configfile, $urlprefix, $defaultUrlprefix);
1284 my @propertynames = ();
1285
1286 if($ENV{'GSDL3SRCHOME'}) {
1287 $defaultUrlprefix = "/greenstone3";
1288 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1289 push(@propertynames, qw/path\s*\=/);
1290 } else {
1291 $defaultUrlprefix = "/greenstone";
1292 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1293 push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1294 }
1295
1296 $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1297
1298 if(!$urlprefix) { # no values found for URL prefix, use default values
1299 $urlprefix = $defaultUrlprefix;
1300 } else {
1301 #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1302 $urlprefix =~ s/^\///; # remove the starting slash
1303 my @dirs = split(/(\\|\/)/, $urlprefix);
1304 $urlprefix = shift(@dirs);
1305
1306 if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1307 $urlprefix = "/$urlprefix";
1308 }
1309 }
1310
1311 # set for the future
1312 $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1313 # print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1314 return $urlprefix;
1315}
1316
1317
1318
1319#
1320# The following comes from activate.pl
1321#
1322# Designed to work with a server included with GS.
1323# - For GS2, we derive the URL from the llssite.cfg file.
1324# - For GS3, we ask ant for the library URL. For GS3, we get the local *http* URL
1325# by default, something like http://127.0.0.1:<httpPort>/greenstone3/library).
1326# Pass in $get_public_url=1 to get something like
1327# <default.protocol>://<tomcat.server>:<default.port>/greenstone/library
1328
1329sub get_full_greenstone_url_prefix
1330{
1331 my ($gs_mode, $lib_name, $get_public_url) = @_;
1332
1333 # if already set on a previous occasion, just return that
1334 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1335 return $ENV{'FULL_GREENSTONE_URL_PREFIX'} if($ENV{'FULL_GREENSTONE_URL_PREFIX'});
1336
1337 # set gs_mode if it was not passed in (servercontrol.pm would pass it in, any other callers won't)
1338 $gs_mode = ($ENV{'GSDL3SRCHOME'}) ? "gs3" : "gs2" unless defined $gs_mode;
1339
1340 my $url = undef;
1341
1342 if($gs_mode eq "gs2") {
1343 my $llssite_cfg = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "llssite.cfg");
1344
1345 if(-f $llssite_cfg) {
1346 # check llssite.cfg for line with url property
1347 # for server.exe also need to use portnumber and enterlib properties
1348 # The following file reading section is a candidate to use FileUtils::readUTF8File()
1349 # in place of calling sysread() directly. But only if we can reason we'd be working with UTF8
1350 # Read in the entire contents of the file in one hit
1351 if (!open (FIN, $llssite_cfg)) {
1352 print STDERR "util::get_full_greenstone_url_prefix() failed to open $llssite_cfg ($!)\n";
1353 return undef;
1354 }
1355
1356 my $contents;
1357 sysread(FIN, $contents, -s FIN);
1358 close(FIN);
1359
1360 my @lines = split(/[\n\r]+/, $contents); # split on carriage-returns and/or linefeeds
1361 my $enterlib = "";
1362 my $portnumber = "8282"; # will remain empty (implicit port 80) unless it's specifically been assigned
1363
1364 foreach my $line (@lines) {
1365 if($line =~ m/^url=(.*)$/) {
1366 $url = $1;
1367 } elsif($line =~ m/^enterlib=(.*)$/) {
1368 $enterlib = $1;
1369 } elsif($line =~ m/^portnumber=(.*)$/) {
1370 $portnumber = $1;
1371 }
1372 }
1373
1374 if(!$url) {
1375 return undef;
1376 }
1377 elsif($url eq "URL_pending") { # library is not running
1378 # do not process url=URL_pending in the file, since for server.exe
1379 # this just means the Enter Library button hasn't been pressed yet
1380 $url = undef;
1381 }
1382 else {
1383 # In the case of server.exe, need to do extra work to get the proper URL
1384 # But first, need to know whether we're indeed dealing with server.exe:
1385
1386 # compare the URL's domain to the full URL
1387 # E.g. for http://localhost:8383/greenstone3/cgi-bin, the domain is localhost:8383
1388 my $uri = URI->new( $url );
1389 my $host = $uri->host;
1390 #print STDERR "@@@@@ host: $host\n";
1391 if($url =~ m/https?:\/\/$host(\/)?$/) {
1392 #if($url !~ m/https?:\/\/$host:$portnumber(\/)?/ || $url =~ m/https?:\/\/$host(\/)?$/) {
1393 # (if the URL does not contain the portnumber, OR if the port is implicitly 80 and)
1394 # If the domain with http:// prefix is completely the same as the URL, assume server.exe
1395 # then the actual URL is the result of suffixing the port and enterlib properties in llssite.cfg
1396 $url = $url.":".$portnumber.$enterlib;
1397 } # else, apache web server
1398
1399 }
1400 }
1401 } elsif($gs_mode eq "gs3") {
1402 # Either check build.properties for tomcat.server, tomcat.port and app.name (and default servlet name).
1403 # app.name is stored in app.path by build.xml. Need to move app.name in build.properties from build.xml
1404
1405 # Or, run the new target get-local-http-servlet-url / get-default-servlet-url
1406 # the output can look like:
1407 #
1408 # Buildfile: build.xml
1409 # [echo] os.name: Windows Vista
1410 #
1411 # get-default-servlet-url:
1412 # [echo] http://localhost:8383/greenstone3/library
1413 # BUILD SUCCESSFUL
1414 # Total time: 0 seconds
1415
1416 #my $output = qx/ant get-default-servlet-url/; # backtick operator, to get STDOUT (else 2>&1)
1417 # - see http://stackoverflow.com/questions/799968/whats-the-difference-between-perls-backticks-system-and-exec
1418
1419 # The get-local-http-servlet-url (or get-default-servlet-url) ant target can be run from anywhere by specifying the
1420 # location of GS3's ant build.xml buildfile. Activate.pl can be run from anywhere for GS3
1421 # GSDL3SRCHOME will be set for GS3 by gs3-setup.sh, a step that would have been necessary
1422 # to run the activate.pl script in the first place
1423
1424 # The default is to get-local-http-servlet-url (of the form http://127.0.0.1:<httpPort>/greentone3/library)
1425 my $full_build_xml = &FileUtils::javaFilenameConcatenate($ENV{'GSDL3SRCHOME'},"build.xml");
1426
1427 # my $perl_command = $get_public_url ? "get-default-servlet-url" : "get-local-http-servlet-url";
1428 my $perl_command = $get_public_url ? "get-external-servlet-url" : "get-internal-servlet-url";
1429 $perl_command = "ant -buildfile \"$full_build_xml\" $perl_command";
1430
1431 if (open(PIN, "$perl_command |")) {
1432 while (defined (my $perl_output_line = <PIN>)) {
1433
1434 if($perl_output_line =~ m@(https?):\/\/(\S*)@) { # grab all the non-whitespace chars
1435 $url="$1://".$2; # preserve the http protocol #$url="http://".$1;
1436 }
1437 }
1438 close(PIN);
1439
1440 # url can be undef if tomcat.port could not be determined due to
1441 # user having wrong or conflicting server related vals in build.props
1442 if (defined $url && defined $lib_name && $lib_name ne "") {
1443 # replace the servlet_name portion of the url found, with the given library_name
1444 $url =~ s@/[^/]*$@/$lib_name@;
1445 }
1446 } else {
1447 print STDERR "util::get_full_greenstone_url_prefix() failed to run $perl_command to work out library URL for $gs_mode\n";
1448 }
1449 }
1450
1451 # either the url is still undef or it is now set
1452 #print STDERR "\n@@@@@ final URL:|$url|\n" if $url;
1453 #print STDERR "\n@@@@@ URL still undef\n" if !$url;
1454
1455 $ENV{'FULL_GREENSTONE_URL_PREFIX'} = $url;
1456
1457 return $url;
1458}
1459
1460
1461# Given a config file (xml or java properties file) and a list/array of regular expressions
1462# that represent property names to match on, this function will return the value for the 1st
1463# matching property name. If the return value is undefined, no matching property was found.
1464sub extract_propvalue_from_file() {
1465 my ($configfile, $propertynames) = @_;
1466
1467 my $value;
1468 unless(open(FIN, "<$configfile")) {
1469 print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1470 return $value; # not initialised
1471 }
1472
1473 # Read the entire file at once, as one single line, then close it
1474 my $filecontents;
1475 {
1476 local $/ = undef;
1477 $filecontents = <FIN>;
1478 }
1479 close(FIN);
1480
1481 foreach my $regex (@$propertynames) {
1482 ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1483 if($value) {
1484 $value =~ s/^\"//; # remove any startquotes
1485 $value =~ s/\".*$//; # remove the 1st endquotes (if any) followed by any xml
1486 last; # found value for a matching property, break from loop
1487 }
1488 }
1489
1490 return $value;
1491}
1492
1493# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1494# given that perllib is in @INC in order to invoke this subroutine.
1495# Call as follows -- after setting up INC to include perllib and
1496# after setting up GSDLHOME and GSDLOS:
1497#
1498# require util;
1499# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1500#
1501sub setup_greenstone_env() {
1502 my ($GSDLHOME, $GSDLOS) = @_;
1503
1504 #my %env_map = ();
1505 # Get the localised ENV settings of running a localised source setup.bash
1506 # and put it into the ENV here. Need to clear GSDLHOME before running setup
1507 #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1508 my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";
1509 if (($GSDLOS =~ m/windows/i) && ($^O ne "cygwin")) {
1510 #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1511 $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1512 }
1513 if (!open(PIN, "$perl_command |")) {
1514 print STDERR ("Unable to execute command: $perl_command. $!\n");
1515 }
1516
1517 my $lastkey;
1518 while (defined (my $perl_output_line = <PIN>)) {
1519 my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1520 if(defined $key) {
1521 #$env_map{$key}=$value;
1522 $ENV{$key}=$value;
1523 $lastkey = $key;
1524 } elsif($lastkey && $perl_output_line !~ m/^\s*$/) {
1525 # there was no equals sign in $perl_output_line, so this
1526 # $perl_output_line may be a spillover from the previous
1527 $ENV{$lastkey} = $ENV{$lastkey}."\n".$perl_output_line;
1528 }
1529 }
1530 close (PIN);
1531
1532 # If any keys in $ENV don't occur in Greenstone's localised env
1533 # (stored in $env_map), delete those entries from $ENV
1534 #foreach $key (keys %ENV) {
1535 # if(!defined $env_map{$key}) {
1536 # print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{$key}\n";
1537 # delete $ENV{$key}; # del $ENV(key, value) pair
1538 # }
1539 #}
1540 #undef %env_map;
1541}
1542
1543sub get_perl_exec() {
1544 my $perl_exec = $^X; # may return just "perl"
1545
1546 if($ENV{'PERLPATH'}) {
1547 # OR: # $perl_exec = &FileUtils::filenameConcatenate($ENV{'PERLPATH'},"perl");
1548 if (($ENV{'GSDLOS'} =~ m/windows/) && ($^O ne "cygwin")) {
1549 $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1550 } else {
1551 $perl_exec = "$ENV{'PERLPATH'}/perl";
1552 }
1553 } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1554 # containing the full path to the current perl executable we're using
1555 $perl_exec = $Config{perlpath}; # configured path for perl
1556 if (!-e $perl_exec) { # may not point to location on this machine
1557 $perl_exec = $^X; # may return just "perl"
1558 if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1559 print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";
1560 }
1561 }
1562 }
1563
1564 return $perl_exec;
1565}
1566
1567# returns the path to the java command in the JRE included with GS (if any),
1568# quoted to safeguard any spaces in this path, otherwise a simple java
1569# command is returned which assumes and will try for a system java.
1570sub get_java_command {
1571 my $java = "java";
1572 if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1573 # after running setup.bat or from GLI which also runs setup.bat
1574 my $java_bin = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"packages","jre","bin");
1575 if(-d $java_bin) {
1576 $java = &FileUtils::filenameConcatenate($java_bin,"java");
1577 $java = "\"".$java."\""; # quoted to preserve spaces in path
1578 }
1579 }
1580 return $java;
1581}
1582
1583
1584# Given the qualified collection name (colgroup/collection),
1585# returns the collection and colgroup parts
1586sub get_collection_parts {
1587 # http://perldoc.perl.org/File/Basename.html
1588 # my($filename, $directories, $suffix) = fileparse($path);
1589 # "$directories contains everything up to and including the last directory separator in the $path
1590 # including the volume (if applicable). The remainder of the $path is the $filename."
1591 #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);
1592
1593 my $qualified_collection = shift(@_);
1594
1595 # Since activate.pl can be launched from the command-line, including by a user,
1596 # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1597 # Also allow for the accidental inclusion of multiple slashes
1598 my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1599
1600 if(!defined $collection) {
1601 $collection = $colgroup;
1602 $colgroup = "";
1603 }
1604 return ($collection, $colgroup);
1605}
1606
1607# work out the "collectdir/collection" location
1608sub resolve_collection_dir {
1609 my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1610
1611 if (defined $ENV{'GSDLCOLLECTDIR'}) { # a predefined collection dir exists
1612 return $ENV{'GSDLCOLLECTDIR'};
1613 }
1614
1615 my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);
1616
1617 if (!defined $collect_dir || !$collect_dir) { # if undefined or empty string
1618 $collect_dir = &util::get_working_collect_dir($site);
1619 }
1620
1621 return &FileUtils::filenameConcatenate($collect_dir,$colgroup,$collection);
1622}
1623
1624# work out the full path to "collect" of this greenstone 2/3 installation
1625sub get_working_collect_dir {
1626 my ($site) = @_;
1627
1628 if (defined $ENV{'GSDLCOLLECTHOME'}) { # a predefined collect dir exists
1629 return $ENV{'GSDLCOLLECTHOME'};
1630 }
1631
1632 if (defined $site && $site) { # site non-empty, so get default collect dir for GS3
1633
1634 if (defined $ENV{'GSDL3HOME'}) {
1635 return &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'},"sites",$site,"collect"); # web folder
1636 }
1637 elsif (defined $ENV{'GSDL3SRCHOME'}) {
1638 return &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'},"web","sites",$site,"collect");
1639 }
1640 }
1641
1642 elsif (defined $ENV{'SITEHOME'}) {
1643 return &FileUtils::filenameConcatenate($ENV{'SITEHOME'},"collect");
1644 }
1645
1646 else { # get default collect dir for GS2
1647 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
1648 }
1649}
1650
1651sub is_abs_path_any_os {
1652 my ($path) = @_;
1653
1654 # We can have filenames in our DBs that were produced on other OS, so this method exists
1655 # to help identify absolute paths in such cases.
1656
1657 return 1 if($path =~ m@^/@); # full paths begin with forward slash on linux/mac
1658 return 1 if($path =~ m@^([a-zA-Z]\:|\\)@); # full paths begin with drive letter colon for Win or \ for volume, http://stackoverflow.com/questions/13011013/get-only-volume-name-from-filepath
1659
1660 return 0;
1661}
1662
1663
1664# This subroutine is for improving portability of Greenstone collections from one OS to another,
1665# to be used to convert absolute paths going into db files into paths with placeholders instead.
1666# This sub works with util::get_common_gs_paths and takes a path to a greenstone file and, if it's
1667# an absolute path, then it will replace the longest matching greenstone-path prefix of the given
1668# path with a placeholder to match.
1669# The Greenstone-path prefixes that can be matched are the following common Greenstone paths:
1670# the path to the current (specific) collection, the path to the general GS collect directory,
1671# the path to the site directory if GS3, else the path to the GSDLHOME/GSDL3HOME folder.
1672# The longest matching prefix will be replaced with the equivalent placeholder:
1673# @THISCOLLECTPATH@, else @COLLECTHOME@, else @SITEHOME@, else @GSDLHOME@.
1674sub abspath_to_placeholders {
1675 my $path = shift(@_); # path to convert from absolute to one with placeholders
1676 my $opt_long_or_short_winfilenames = shift(@_) || "short"; # whether we want to force use of long file names even on windows, default uses short
1677
1678 return $path unless is_abs_path_any_os($path); # path is relative
1679
1680 if ($opt_long_or_short_winfilenames eq "long") {
1681 $path = &util::upgrade_if_dos_filename($path); # will only do something on windows
1682 }
1683
1684 # now we know we're dealing with absolute paths and have to replace gs prefixes with placeholders
1685 my @gs_paths = ($ENV{'GSDLCOLLECTDIR'}, $ENV{'GSDLCOLLECTHOME'}, $ENV{'SITEHOME'}, $ENV{'GREENSTONEHOME'}); # list in this order: from longest to shortest path
1686
1687 my %placeholder_map = ($ENV{'GREENSTONEHOME'} => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1688 $ENV{'GSDLCOLLECTHOME'} => '@COLLECTHOME@',
1689 $ENV{'GSDLCOLLECTDIR'} => '@THISCOLLECTPATH@'
1690 );
1691 $placeholder_map{$ENV{'SITEHOME'}} = '@SITEHOME@' if defined $ENV{'SITEHOME'};
1692
1693 $path = &util::_abspath_to_placeholders($path, \@gs_paths, \%placeholder_map);
1694
1695 if ($ENV{'GSDLOS'} =~ /^windows$/i && $opt_long_or_short_winfilenames eq "short") {
1696 # for windows need to look for matches on short file names too
1697 # matched paths are again to be replaced with the usual placeholders
1698
1699 my $gsdlcollectdir = &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTDIR'});
1700 my $gsdlcollecthome = &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTHOME'});
1701 my $sitehome = (defined $ENV{'SITEHOME'}) ? &util::downgrade_if_dos_filename($ENV{'SITEHOME'}) : undef;
1702 my $greenstonehome = &util::downgrade_if_dos_filename($ENV{'GREENSTONEHOME'});
1703
1704 @gs_paths = ($gsdlcollectdir, $gsdlcollecthome, $sitehome, $greenstonehome); # order matters
1705
1706 %placeholder_map = ($greenstonehome => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1707 $gsdlcollecthome => '@COLLECTHOME@',
1708 $gsdlcollectdir => '@THISCOLLECTPATH@'
1709 );
1710 $placeholder_map{$sitehome} = '@SITEHOME@' if defined $sitehome;
1711
1712 $path = &util::_abspath_to_placeholders($path, \@gs_paths, \%placeholder_map);
1713 }
1714
1715 return $path;
1716}
1717
1718sub _abspath_to_placeholders {
1719 my ($path, $gs_paths_ref, $placeholder_map_ref) = @_;
1720
1721 # The sequence of elements in @gs_paths matters
1722 # Need to loop starting from the *longest* matching path (the path to the specific collection)
1723 # to the shortest matching path (the path to gsdlhome/gsdl3home folder):
1724
1725 foreach my $gs_path (@$gs_paths_ref) {
1726 next if(!defined $gs_path); # site undefined for GS2
1727
1728 my $re_path = &util::filename_to_regex($gs_path); # escape for regex
1729
1730 if($path =~ m/^$re_path/i) { # case sensitive or not for OS?
1731
1732 my $placeholder = $placeholder_map_ref->{$gs_path}; # get the placeholder to replace the matched path with
1733
1734 $path =~ s/^$re_path/$placeholder/; #case sensitive or not?
1735 #$path =~ s/^[\\\/]//; # remove gs_path's trailing separator left behind at the start of the path
1736
1737 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
1738 # lowercase file extension, This is needed when shortfilenames are used, as case affects alphetical ordering, which affects diffcol
1739
1740 # Note: the following used be done on all Operating Systems, even though the reason for it seems to be related to Windows
1741 # (which, when in its most basic form, is case-insenstive for its filesystem)
1742 # However, making the file-extension be lower-case cases incremental importing to break on Unix systems
1743
1744 $path =~ s/\.([A-Z]+)$/".".lc($1)/e;
1745 }
1746
1747 last; # done
1748 }
1749 }
1750
1751 return $path;
1752}
1753
1754# Function that does the reverse of the util::abspath_to_placeholders subroutine
1755# Once again, call this with the values returned from util::get_common_gs_paths
1756sub placeholders_to_abspath {
1757 my $path = shift(@_); # path that can contain placeholders to convert to resolved absolute path
1758 my $opt_long_or_short_winfilenames = shift(@_) || "short"; # whether we want to force use of long file names even on windows, default uses short
1759
1760 return $path if($path !~ m/@/); # path contains no placeholders
1761
1762 # replace placeholders with gs prefixes
1763 my @placeholders = ('@THISCOLLECTPATH@', '@COLLECTHOME@', '@SITEHOME@', '@GSDLHOME@'); # order of paths not crucial in this case,
1764 # but listed here from longest to shortest once placeholders are have been resolved
1765
1766 # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1767 my %placeholder_to_gspath_map;
1768 if ($ENV{'GSDLOS'} =~ /^windows$/i && $opt_long_or_short_winfilenames eq "short") {
1769 # always replace placeholders with short file names of the absolute paths on windows?
1770 %placeholder_to_gspath_map = ('@GSDLHOME@' => &util::downgrade_if_dos_filename($ENV{'GREENSTONEHOME'}),
1771 '@COLLECTHOME@' => &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTHOME'}),
1772 '@THISCOLLECTPATH@' => &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTDIR'})
1773 );
1774 $placeholder_to_gspath_map{'@SITEHOME@'} = &util::downgrade_if_dos_filename($ENV{'SITEHOME'}) if defined $ENV{'SITEHOME'};
1775 } else {
1776 %placeholder_to_gspath_map = ('@GSDLHOME@' => $ENV{'GREENSTONEHOME'},
1777 '@SITEHOME@' => $ENV{'SITEHOME'}, # can be undef
1778 '@COLLECTHOME@' => $ENV{'GSDLCOLLECTHOME'},
1779 '@THISCOLLECTPATH@' => $ENV{'GSDLCOLLECTDIR'}
1780 ); # $placeholder_to_gspath_map{'@SITEHOME@'} = $ENV{'SITEHOME'} if defined $ENV{'SITEHOME'};
1781 }
1782
1783 foreach my $placeholder (@placeholders) {
1784 my $gs_path = $placeholder_to_gspath_map{$placeholder};
1785
1786 next if(!defined $gs_path); # sitehome for GS2 is undefined
1787
1788 if($path =~ m/^$placeholder/) {
1789 $path =~ s/^$placeholder/$gs_path/;
1790 last; # done
1791 }
1792 }
1793
1794 return $path;
1795}
1796
1797# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1798# a directory containing sequentially numbered images (and optional matching sequentially numbered .txt files).
1799sub create_itemfile
1800{
1801 my ($output_dir, $convert_basename, $convert_to) = @_;
1802 my $page_num = "";
1803
1804 opendir(DIR, $output_dir) || die "can't opendir $output_dir: $!";
1805 my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1806 closedir DIR;
1807
1808 # Sort files in the directory by page_num
1809 sub page_number {
1810 my ($dir) = @_;
1811 my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png|txt))?$/i);
1812 # my ($pagenum) =($dir =~ m/(\d+)(\.(jpg|gif|png))?$/i); # this works but is not as safe/strict about input filepatterns as the above
1813
1814 $pagenum = 1 unless defined $pagenum;
1815 return $pagenum;
1816 }
1817
1818 # sort the files in the directory in the order of page_num rather than lexically.
1819 @dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1820
1821 # work out if the numbering of the now sorted image files starts at 0 or not
1822 # by checking the number of the first _image_ file (skipping item files)
1823 my $starts_at_0 = 0;
1824 my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1825 if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1826 $starts_at_0 = 1;
1827 }
1828
1829 my $item_file = &FileUtils::filenameConcatenate($output_dir, $convert_basename.".item");
1830 my $item_fh;
1831 &FileUtils::openFileHandle($item_file, 'w', \$item_fh);
1832 print $item_fh "<PagedDocument>\n";
1833
1834 # In the past, sub create_itemfile() never output txtfile names into the item file (they were left as empty strings),
1835 # only image file names. Now that PDFBox is being customised for GS with the new GS_PDFToImagesAndText.java class to
1836 # create images of each PDF page and extract text for that page if extractable, we can have matching txt files for
1837 # each img file. So now we can output txt file names if we're working with txt files.
1838 # We just test if a text file exists in the same dir that matches the name of the first image file
1839 # if a matching txt file does not exist, don't output txtfile names into the item file
1840
1841 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($firstfile, "\\.[^\\.]+\$"); # relative filenames so no dirname
1842 my $txtfilename = &FileUtils::filenameConcatenate($output_dir, $tailname . ".txt");
1843 my $hasTxtFile = &FileUtils::fileExists($txtfilename);
1844
1845 # Write out the elements of the item file.
1846 # We could be dealing with 3 types of conversion output formats: txt only (paged_text),
1847 # images only (pagedimg_) and images AND text (pagedimgtxt_).
1848 foreach my $file (@dir_files) {
1849 if ($file !~ /\.item/i) {
1850 $page_num = page_number($file);
1851 $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1852
1853 if ($convert_to eq "txt") { # output format is paged_text, which has no images
1854 if ($file =~ m/\.txt/i) { # check only txt files (should be all there is, besides the skipped .item file)
1855 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"\" txtfile=\"$page_num.txt\"/>\n";
1856 } # else, some non-txt file ext, skip
1857 }
1858 else { # either pagedimg or pagedimgtxt output mode
1859 if($file !~ /\.txt/i) { # check only img files, skip any matching txt files
1860 if($hasTxtFile) { # if every image has a matching txt file, output txtfile too
1861 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"$page_num.txt\"/>\n";
1862 } else { # when it's pagedimg only, txtfile is empty
1863 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1864 }
1865 }
1866 }
1867 }
1868 }
1869
1870
1871 print $item_fh "</PagedDocument>\n";
1872 &FileUtils::closeFileHandle($item_file, \$item_fh);
1873 return $item_file;
1874}
1875
1876# Returns the first directory in parameter list of directories that exists, undef if none existed
1877sub get_first_existing_dir {
1878 my @directories = @_;
1879
1880 foreach my $directory (@directories) {
1881 if(&FileUtils::directoryExists($directory)) {
1882 return $directory;
1883 }
1884 }
1885 return undef;
1886}
1887
1888
1889# Sets the gnomelib_env. Based on the logic in wvware.pl which can perhaps be replaced with a call to this function in future
1890sub set_gnomelib_env
1891{
1892 ## SET THE ENVIRONMENT AS DONE IN SETUP.BASH/BAT OF GNOME-LIB
1893 # Though this is only needed for darwin Lion at this point (and android, though that is untested)
1894
1895 my $libext = "so";
1896 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1897 return;
1898 } elsif ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
1899 $libext = "dylib";
1900 }
1901
1902 if (!defined $ENV{'GEXTGNOME'}) {
1903 ##print STDERR "@@@ Setting GEXTGNOME env\n";
1904
1905 my $gnome_dir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"ext","gnome-lib-minimal");
1906
1907 if(! -d $gnome_dir) {
1908 $gnome_dir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"ext","gnome-lib");
1909
1910 if(! -d $gnome_dir) {
1911 $gnome_dir = "";
1912 }
1913 }
1914
1915 # now set other the related env vars,
1916 # IF we've found the gnome-lib dir installed in the ext folder
1917
1918 if ($gnome_dir ne "" && -f &FileUtils::filenameConcatenate($gnome_dir, $ENV{'GSDLOS'}, "lib", "libiconv.$libext")) {
1919 $ENV{'GEXTGNOME'} = $gnome_dir;
1920 $ENV{'GEXTGNOME_INSTALLED'}=&FileUtils::filenameConcatenate($ENV{'GEXTGNOME'}, $ENV{'GSDLOS'});
1921
1922 my $gnomelib_bin = &FileUtils::filenameConcatenate($ENV{'GEXTGNOME_INSTALLED'}, "bin");
1923 if(-d $gnomelib_bin) { # no bin subfolder in GS binary's cutdown gnome-lib-minimal folder
1924 &util::envvar_prepend("PATH", $gnomelib_bin);
1925 }
1926
1927 # util's prepend will create LD/DYLD_LIB_PATH if it doesn't exist yet
1928 my $gextlib = &FileUtils::filenameConcatenate($ENV{'GEXTGNOME_INSTALLED'}, "lib");
1929
1930 if($ENV{'GSDLOS'} eq "linux") {
1931 &util::envvar_prepend("LD_LIBRARY_PATH", $gextlib);
1932 }
1933 elsif ($ENV{'GSDLOS'} eq "darwin") {
1934 #&util::envvar_prepend("DYLD_LIBRARY_PATH", $gextlib);
1935 &util::envvar_prepend("DYLD_FALLBACK_LIBRARY_PATH", $gextlib);
1936 }
1937 }
1938
1939 # Above largely mimics the setup.bash of the gnome-lib-minimal.
1940 # Not doing the devel-srcpack that gnome-lib-minimal's setup.bash used to set
1941 # Not exporting GSDLEXTS variable either
1942 }
1943
1944 # print STDERR "@@@@@ GEXTGNOME: ".$ENV{'GEXTGNOME'}."\n\tINSTALL".$ENV{'GEXTGNOME_INSTALLED'}."\n";
1945 # print STDERR "\tPATH".$ENV{'PATH'}."\n";
1946 # print STDERR "\tLD_LIB_PATH".$ENV{'LD_LIBRARY_PATH'}."\n" if $ENV{'LD_LIBRARY_PATH};
1947 # print STDERR "\tDYLD_FALLBACK_LIB_PATH".$ENV{'DYLD_FALLBACK_LIBRARY_PATH'}."\n" if $ENV{'DYLD_FALLBACK_LIBRARY_PATH};
1948
1949 # if no GEXTGNOME, maybe users didn't need gnome-lib to run gnomelib/libiconv dependent binaries like hashfile, suffix, wget
1950 # (wvware is launched in a gnomelib env from its own script, but could possibly go through this script in future)
1951}
1952
1953
1954
1955## @function augmentINC()
1956#
1957# Prepend a path (if it exists) onto INC but only if it isn't already in INC
1958# @param $new_path The path to add
1959# @author jmt12
1960#
1961sub augmentINC
1962{
1963 my ($new_path) = @_;
1964 my $did_add_path = 0;
1965 # might need to be replaced with FileUtils::directoryExists() call eventually
1966 if (-d $new_path)
1967 {
1968 my $did_find_path = 0;
1969 foreach my $existing_path (@INC)
1970 {
1971 if ($existing_path eq $new_path)
1972 {
1973 $did_find_path = 1;
1974 last;
1975 }
1976 }
1977 if (!$did_find_path)
1978 {
1979 unshift(@INC, $new_path);
1980 $did_add_path = 1;
1981 }
1982 }
1983 return $did_add_path;
1984}
1985## augmentINC()
1986
1987
1988# Useful String utility functions
1989# trim(str) removes whitespace at start and end of string parameter
1990sub trim {
1991 my ($str) = @_;
1992
1993 # trim whitespace at start and end of str
1994 # https://perlmaven.com/trim
1995 $str =~ s/^\s+|\s+$//g;
1996
1997 return $str;
1998}
1999
20001;
Note: See TracBrowser for help on using the repository browser.