source: main/trunk/greenstone2/perllib/util.pm@ 33757

Last change on this file since 33757 was 33757, checked in by ak19, 4 years ago
  1. Windows bugfix for getting exMeta to be loaded into GLI where there are subdirs involved in the Gather pane, or there are non-ASCII filenames, or the file rename method is set to base64. 2. Bugfix for Linux and Windows: Using Base64 to rename files was still a problem despite the previous commit (which was supposed to have fixed all GLI exMeta loading issues on Linux) in the special case where a subfolder was pure ASCII. The perl code wouldn't base64 encode such subdirs. However, GLI won't know which part of a relative file path to decode based on the file rename method used and which parts are not to be decoded. So GLI uniformly decoded them, and ASCII named subfolders that were not base64 encoded (but contained files that were to be renamed with base64) got base64 decoded into garbage, so that exMeta still did not get attached. 3. This commit contains debug stmts.
  • Property svn:keywords set to Author Date Id Revision
File size: 68.4 KB
Line 
1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package util;
27
28use strict;
29no strict 'refs'; # make an exception so we can use variables as filehandles
30use FileUtils;
31
32use Encode;
33use Unicode::Normalize 'normalize';
34
35use File::Copy;
36use File::Basename;
37# Config for getting the perlpath in the recommended way, though it uses paths that are
38# hard-coded into the Config file that's generated upon configuring and compiling perl.
39# $^X works better in some cases to return the path to perl used to launch the script,
40# but if launched with plain "perl" (no full-path), that will be just what it returns.
41use Config;
42# New module for file related utility functions - intended as a
43# placeholder for an extension that allows a variety of different
44# filesystems (FTP, HTTP, SAMBA, WEBDav, HDFS etc)
45use FileUtils;
46
47if ($ENV{'GSDLOS'} =~ /^windows$/i) {
48 require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
49}
50
51# removes files (but not directories)
52sub rm {
53 warnings::warnif("deprecated", "util::rm() is deprecated, using FileUtils::removeFiles() instead");
54 return &FileUtils::removeFiles(@_);
55}
56
57# recursive removal
58sub filtered_rm_r {
59 warnings::warnif("deprecated", "util::filtered_rm_r() is deprecated, using FileUtils::removeFilesFiltered() instead");
60 return &FileUtils::removeFilesFiltered(@_);
61}
62
63# recursive removal
64sub rm_r {
65 warnings::warnif("deprecated", "util::rm_r() is deprecated, using FileUtils::removeFilesRecursive() instead");
66 return &FileUtils::removeFilesRecursive(@_);
67}
68
69# moves a file or a group of files
70sub mv {
71 warnings::warnif("deprecated", "util::mv() is deprecated, using FileUtils::moveFiles() instead");
72 return &FileUtils::moveFiles(@_);
73}
74
75# Move the contents of source directory into target directory
76# (as opposed to merely replacing target dir with the src dir)
77# This can overwrite any files with duplicate names in the target
78# but other files and folders in the target will continue to exist
79sub mv_dir_contents {
80 warnings::warnif("deprecated", "util::mv_dir_contents() is deprecated, using FileUtils::moveDirectoryContents() instead");
81 return &FileUtils::moveDirectoryContents(@_);
82}
83
84# copies a file or a group of files
85sub cp {
86 warnings::warnif("deprecated", "util::cp() is deprecated, using FileUtils::copyFiles() instead");
87 return &FileUtils::copyFiles(@_);
88}
89
90# recursively copies a file or group of files
91# syntax: cp_r (sourcefiles, destination directory)
92# destination must be a directory - to copy one file to
93# another use cp instead
94sub cp_r {
95 warnings::warnif("deprecated", "util::cp_r() is deprecated, using FileUtils::copyFilesrecursive() instead");
96 return &FileUtils::copyFilesRecursive(@_);
97}
98
99# recursively copies a file or group of files
100# syntax: cp_r (sourcefiles, destination directory)
101# destination must be a directory - to copy one file to
102# another use cp instead
103sub cp_r_nosvn {
104 warnings::warnif("deprecated", "util::cp_r_nosvn() is deprecated, using FileUtils::copyFilesRecursiveNoSVN() instead");
105 return &FileUtils::copyFilesRecursiveNoSVN(@_);
106}
107
108# copies a directory and its contents, excluding subdirectories, into a new directory
109sub cp_r_toplevel {
110 warnings::warnif("deprecated", "util::cp_r_toplevel() is deprecated, using FileUtils::recursiveCopyTopLevel() instead");
111 return &FileUtils::recursiveCopyTopLevel(@_);
112}
113
114sub mk_dir {
115 warnings::warnif("deprecated", "util::mk_dir() is deprecated, using FileUtils::makeDirectory() instead");
116 return &FileUtils::makeDirectory(@_);
117}
118
119# in case anyone cares - I did some testing (using perls Benchmark module)
120# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
121# slightly faster (surprisingly) - Stefan.
122sub mk_all_dir {
123 warnings::warnif("deprecated", "util::mk_all_dir() is deprecated, using FileUtils::makeAllDirectories() instead");
124 return &FileUtils::makeAllDirectories(@_);
125}
126
127# make hard link to file if supported by OS, otherwise copy the file
128sub hard_link {
129 warnings::warnif("deprecated", "util::hard_link() is deprecated, using FileUtils::hardLink() instead");
130 return &FileUtils::hardLink(@_);
131}
132
133# make soft link to file if supported by OS, otherwise copy file
134sub soft_link {
135 warnings::warnif("deprecated", "util::soft_link() is deprecated, using FileUtils::softLink() instead");
136 return &FileUtils::softLink(@_);
137}
138
139# Primarily for filenames generated by processing
140# content of HTML files (which are mapped to UTF-8 internally)
141#
142# To turn this into an octet string that really exists on the file
143# system:
144# 1. don't need to do anything special for Unix-based systems
145# (as underlying file system is byte-code)
146# 2. need to map to short DOS filenames for Windows
147
148sub utf8_to_real_filename
149{
150 my ($utf8_filename) = @_;
151
152 my $real_filename;
153
154 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
155 require Win32;
156
157 my $unicode_filename = decode("utf8",$utf8_filename);
158 $real_filename = Win32::GetShortPathName($unicode_filename);
159 }
160 else {
161 $real_filename = $utf8_filename;
162 }
163
164 return $real_filename;
165}
166
167sub raw_filename_to_unicode
168{
169 my ($directory, $raw_file, $filename_encoding ) = @_;
170
171 my $unicode_filename = $raw_file;
172 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
173 # Try turning a short version to the long version
174 # If there are "funny" characters in the file name, that can't be represented in the ANSI code, then we will have a short weird version, eg E74~1.txt
175 $unicode_filename = &util::get_dirsep_tail(&util::upgrade_if_dos_filename(&FileUtils::filenameConcatenate($directory, $raw_file), 0));
176
177
178 if ($unicode_filename eq $raw_file) {
179 # This means the original filename *was* able to be encoded in the local ANSI file encoding (eg windows_1252), so now we turn it back to perl's unicode
180
181 $unicode_filename = &Encode::decode(locale_fs => $unicode_filename);
182 }
183 # else This means we did have one of the funny filenames. the getLongPathName (used in upgrade_if_dos_filename) will return unicode, so we don't need to do anything more.
184
185
186 } else {
187 # we had a utf-8 string, turn it into perl internal unicode
188 $unicode_filename = &Encode::decode("utf-8", $unicode_filename);
189
190
191 }
192 #Does the filename have url encoded chars in it?
193 if (&unicode::is_url_encoded($unicode_filename)) {
194 $unicode_filename = &unicode::url_decode($unicode_filename);
195 }
196
197 # Normalise the filename to canonical composition - on mac, filenames use decopmposed form for accented chars
198 if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
199 $unicode_filename = normalize('C', $unicode_filename); # Composed form 'C'
200 }
201 return $unicode_filename;
202
203}
204sub fd_exists {
205 warnings::warnif("deprecated", "util::fd_exists() is deprecated, using FileUtils::fileTest() instead");
206 return &FileUtils::fileTest(@_);
207}
208
209sub file_exists {
210 warnings::warnif("deprecated", "util::file_exists() is deprecated, using FileUtils::fileExists() instead");
211 return &FileUtils::fileExists(@_);
212}
213
214sub dir_exists {
215 warnings::warnif("deprecated", "util::dir_exists() is deprecated, using FileUtils::directoryExists() instead");
216 return &FileUtils::directoryExists(@_);
217}
218
219# updates a copy of a directory in some other part of the filesystem
220# verbosity settings are: 0=low, 1=normal, 2=high
221# both $fromdir and $todir should be absolute paths
222sub cachedir {
223 warnings::warnif("deprecated", "util::cachedir() is deprecated, using FileUtils::synchronizeDirectories() instead");
224 return &FileUtils::synchronizeDirectories(@_);
225}
226
227# this function returns -1 if either file is not found
228# assumes that $file1 and $file2 are absolute file names or
229# in the current directory
230# $file2 is allowed to be newer than $file1
231sub differentfiles {
232 warnings::warnif("deprecated", "util::differentfiles() is deprecated, using FileUtils::differentFiles() instead");
233 return &FileUtils::differentFiles(@_);
234}
235
236
237# works out the temporary directory, including in the case where Greenstone is not writable
238# In that case, gs3-setup.bat would already have set the GS_TMP_OUTPUT_DIR temp variable
239sub determine_tmp_dir
240{
241 my $try_collect_dir = shift(@_) || 0;
242
243 my $tmp_dirname;
244 if(defined $ENV{'GS_TMP_OUTPUT_DIR'}) {
245 $tmp_dirname = $ENV{'GS_TMP_OUTPUT_DIR'};
246 } elsif($try_collect_dir && defined $ENV{'GSDLCOLLECTDIR'}) {
247 $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
248 } elsif(defined $ENV{'GSDLHOME'}) {
249 $tmp_dirname = $ENV{'GSDLHOME'};
250 } else {
251 return undef;
252 }
253
254 if(!defined $ENV{'GS_TMP_OUTPUT_DIR'}) {
255 # test the tmp_dirname folder is writable, by trying to write out a file
256 # Unfortunately, cound not get if(-w $dirname) to work on directories on Windows
257 ## http://alvinalexander.com/blog/post/perl/perl-file-test-operators-reference-cheat-sheet (test file/dir writable)
258 ## http://www.makelinux.net/alp/083 (real and effective user IDs)
259
260 my $tmp_test_file = &FileUtils::filenameConcatenate($tmp_dirname, "writability_test.tmp");
261 if (open (FOUT, ">$tmp_test_file")) {
262 close(FOUT);
263 &FileUtils::removeFiles($tmp_test_file);
264 } else { # location not writable, use TMP location
265 if (defined $ENV{'TMP'}) {
266 $tmp_dirname = $ENV{'TMP'};
267 } else {
268 $tmp_dirname = "/tmp";
269 }
270 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "greenstone");
271 $ENV{'GS_TMP_OUTPUT_DIR'} = $tmp_dirname; # store for next time
272 }
273 }
274
275 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "tmp");
276 &FileUtils::makeAllDirectories ($tmp_dirname) unless -e $tmp_dirname;
277
278 return $tmp_dirname;
279}
280
281sub get_tmp_filename
282{
283 my $file_ext = shift(@_) || undef;
284
285 my $opt_dot_file_ext = "";
286 if (defined $file_ext) {
287 if ($file_ext !~ m/\./) {
288 # no dot, so needs one added in at start
289 $opt_dot_file_ext = ".$file_ext"
290 }
291 else {
292 # allow for "extensions" such as _metadata.txt to be handled
293 # gracefully
294 $opt_dot_file_ext = $file_ext;
295 }
296 }
297
298 my $tmpdir = &util::determine_tmp_dir(0);
299
300 my $count = 1000;
301 my $rand = int(rand $count);
302 my $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
303
304 while (-e $full_tmp_filename) {
305 $rand = int(rand $count);
306 $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
307 $count++;
308 }
309
310 return $full_tmp_filename;
311}
312
313# These 2 are "static" variables used by the get_timestamped_tmp_folder() subroutine below and
314# belong with that function. They help ensure the timestamped tmp folders generated are unique.
315my $previous_timestamp = undef;
316my $previous_timestamp_f = 0; # frequency
317
318sub get_timestamped_tmp_folder
319{
320 my $tmp_dirname = &util::determine_tmp_dir(1);
321
322 # add the timestamp into the path otherwise we can run into problems
323 # if documents have the same name
324 my $timestamp = time;
325
326 if (!defined $previous_timestamp || ($timestamp > $previous_timestamp)) {
327 $previous_timestamp_f = 0;
328 $previous_timestamp = $timestamp;
329 } else {
330 $previous_timestamp_f++;
331 }
332
333 my $time_tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, $timestamp);
334 $tmp_dirname = $time_tmp_dirname;
335 my $i = $previous_timestamp_f;
336
337 if($previous_timestamp_f > 0) {
338 $tmp_dirname = $time_tmp_dirname."_".$i;
339 $i++;
340 }
341 while (-e $tmp_dirname) {
342 $tmp_dirname = $time_tmp_dirname."_".$i;
343 $i++;
344 }
345 &FileUtils::makeDirectory($tmp_dirname);
346
347 return $tmp_dirname;
348}
349
350sub get_timestamped_tmp_filename_in_collection
351{
352
353 my ($input_filename, $output_ext) = @_;
354 # derive tmp filename from input filename
355 my ($tailname, $dirname, $suffix)
356 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
357
358 # softlink to collection tmp dir
359 my $tmp_dirname = &util::get_timestamped_tmp_folder();
360 $tmp_dirname = $dirname unless defined $tmp_dirname;
361
362 # following two steps copied from ConvertBinaryFile
363 # do we need them?? can't use them as is, as they use plugin methods.
364
365 #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
366
367 # URLEncode this since htmls with images where the html filename is utf8 don't seem
368 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
369 # files on the filesystem.
370 #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
371 if (defined $output_ext) {
372 $output_ext = ".$output_ext"; # add the dot
373 } else {
374 $output_ext = $suffix;
375 }
376 $output_ext= lc($output_ext);
377 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$output_ext");
378
379 return $tmp_filename;
380}
381
382sub get_toplevel_tmp_dir
383{
384 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
385}
386
387
388sub get_collectlevel_tmp_dir
389{
390 my $tmp_dirname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp");
391 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
392
393 return $tmp_dirname;
394}
395
396sub get_parent_folder
397{
398 my ($path) = @_;
399 my ($tailname, $dirname, $suffix)
400 = &File::Basename::fileparse($path, "\\.[^\\.]+\$");
401
402 return &FileUtils::sanitizePath($dirname);
403}
404
405sub filename_to_regex {
406 my $filename = shift (@_);
407
408 # need to make single backslashes double so that regex works
409 $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);
410
411 # note that the first part of a substitution is a regex, so RE chars need to be escaped,
412 # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
413 $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
414 $filename =~ s@\(@\\(@g; # escape brackets
415 $filename =~ s@\)@\\)@g; # escape brackets
416 $filename =~ s@\[@\\[@g; # escape brackets
417 $filename =~ s@\]@\\]@g; # escape brackets
418
419 return $filename;
420}
421
422sub unregex_filename {
423 my $filename = shift (@_);
424
425 # need to put doubled backslashes for regex back to single
426 $filename =~ s/\\\./\./g; # remove RE syntax for .
427 $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
428 $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
429 $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
430 $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
431
432 # \\ goes to \
433 # This is the last step in reverse mirroring the order of steps in filename_to_regex()
434 $filename =~ s/\\\\/\\/g; # remove RE syntax for \
435 return $filename;
436}
437
438sub filename_cat {
439 # I've disabled this warning for now, as every Greenstone perl
440 # script seems to make use of this function and so you drown in a
441 # sea of deprecated warnings [jmt12]
442 # warnings::warnif("deprecated", "util::filename_cat() is deprecated, using FileUtils::filenameConcatenate() instead");
443 return &FileUtils::filenameConcatenate(@_);
444}
445
446
447sub _pathname_cat {
448 my $join_char = shift(@_);
449 my $first_path = shift(@_);
450 my (@pathnames) = @_;
451
452 # If first_path is not null or empty, then add it back into the list
453 if (defined $first_path && $first_path =~ /\S/) {
454 unshift(@pathnames, $first_path);
455 }
456
457 my $pathname = join($join_char, @pathnames);
458
459 # remove duplicate slashes
460 if ($join_char eq ";") {
461 $pathname =~ s/[\\\/]+/\\/g;
462 if ($^O eq "cygwin") {
463 # Once we've collapsed muliple (potentialy > 2) slashes
464 # For cygwin, actually want things double-backslahed
465 $pathname =~ s/\\/\\\\/g;
466 }
467
468 } else {
469 $pathname =~ s/[\/]+/\//g;
470 # DB: want a pathname abc\de.html to remain like this
471 }
472
473 return $pathname;
474}
475
476
477sub pathname_cat {
478 my (@pathnames) = @_;
479
480 my $join_char;
481 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
482 $join_char = ";";
483 } else {
484 $join_char = ":";
485 }
486 return _pathname_cat($join_char,@pathnames);
487}
488
489
490sub javapathname_cat {
491 my (@pathnames) = @_;
492
493 my $join_char;
494
495 # Unlike pathname_cat() above, not interested if running in a Cygwin environment
496 # This is because the java we run is actually a native Windows executable
497
498 if (($ENV{'GSDLOS'} =~ /^windows$/i)) {
499 $join_char = ";";
500 } else {
501 $join_char = ":";
502 }
503 return _pathname_cat($join_char,@pathnames);
504}
505
506
507sub makeFilenameJavaCygwinCompatible
508{
509 my ($java_filename) = @_;
510
511 if ($^O eq "cygwin") {
512 # To be used with a Java program, but under Cygwin
513 # Because the java binary that is native to Windows, need to
514 # convert the Cygwin paths (i.e. Unix style) to be Windows
515 # compatible
516
517 $java_filename = `cygpath -wp "$java_filename"`;
518 chomp($java_filename);
519 $java_filename =~ s%\\%\\\\%g;
520 }
521
522 return $java_filename;
523}
524
525sub tidy_up_oid {
526 my ($OID) = @_;
527 if ($OID =~ /[\.\/\\]/) {
528 print STDERR "Warning, identifier $OID contains periods or slashes(.\\\/), replacing them with _\n";
529 $OID =~ s/[\.\\\/]/_/g; #remove any periods
530 }
531 if ($OID =~ /^\s.*\s$/) {
532 print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
533 # remove starting and trailing whitespace
534 $OID =~ s/^\s+//;
535 $OID =~ s/\s+$//;
536 }
537 if ($OID =~ /^[\d]*$/) {
538 print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
539 $OID = "D" . $OID;
540 }
541
542 return $OID;
543}
544
545sub envvar_prepend {
546 my ($var,$val) = @_;
547
548 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
549 ## my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
550
551 # Rewritten above to make ":" the default (Windows is the special
552 # case, anything else 'unusual' such as Solaris etc is Unix)
553 my $pathsep = (defined $ENV{'GSDLOS'} && (($ENV{'GSDLOS'} =~ m/windows/) && ($^O ne "cygwin"))) ? ";" : ":";
554
555 # do not prepend any value/path that's already in the environment variable
556
557 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
558 if (!defined($ENV{$var})) {
559 $ENV{$var} = "$val";
560 }
561 elsif($ENV{$var} !~ m/$escaped_val/) {
562 $ENV{$var} = "$val".$pathsep.$ENV{$var};
563 }
564}
565
566sub envvar_append {
567 my ($var,$val) = @_;
568
569 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
570 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
571
572 # do not append any value/path that's already in the environment variable
573
574 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
575 if (!defined($ENV{$var})) {
576 $ENV{$var} = "$val";
577 }
578 elsif($ENV{$var} !~ m/$escaped_val/) {
579 $ENV{$var} = $ENV{$var}.$pathsep."$val";
580 }
581}
582
583# debug aid
584sub print_env {
585 my ($handle, @envvars) = @_; # print to $handle, which can be STDERR/STDOUT/file, etc.
586
587 if (scalar(@envvars) == 0) {
588 #print $handle "@@@ All env vars requested\n";
589
590 my $output = "";
591
592 print $handle "@@@ Environment was:\n********\n";
593 foreach my $envvar (sort keys(%ENV)) {
594 if(defined $ENV{$envvar}) {
595 print $handle "\t$envvar = $ENV{$envvar}\n";
596 } else {
597 print $handle "\t$envvar = \n";
598 }
599 }
600 print $handle "********\n";
601 } else {
602 print $handle "@@@ Environment was:\n********\n";
603 foreach my $envvar (@envvars) {
604 if(defined $ENV{$envvar}) {
605 print $handle "\t$envvar = ".$ENV{$envvar}."\n";
606 } else {
607 print $handle "Env var '$envvar' was not set\n";
608 }
609 }
610 print $handle "********\n";
611 }
612}
613
614
615# splits a filename into a prefix and a tail extension using the tail_re, or
616# if that fails, splits on the file_extension . (dot)
617sub get_prefix_and_tail_by_regex {
618
619 my ($filename,$tail_re) = @_;
620
621 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
622 if ((!defined $file_prefix) || (!defined $file_ext)) {
623 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
624 }
625
626 return ($file_prefix,$file_ext);
627}
628
629# get full path and file only path from a base_dir (which may be empty) and
630# file (which may contain directories)
631sub get_full_filenames {
632 my ($base_dir, $file) = @_;
633
634 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0);
635 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
636 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
637
638
639 my $filename_full_path = $file;
640 # add on directory if present
641 $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\S/;
642
643 my $filename_no_path = $file;
644
645 # remove directory if present
646 $filename_no_path =~ s/^.*[\/\\]//;
647 return ($filename_full_path, $filename_no_path);
648}
649
650# returns the path of a file without the filename -- ie. the directory the file is in
651sub filename_head {
652 my $filename = shift(@_);
653
654 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
655 $filename =~ s/[^\\\\]*$//;
656 }
657 else {
658 $filename =~ s/[^\\\/]*$//;
659 }
660
661 return $filename;
662}
663
664# Debug function to print the caller at the provided depth or else depth=1 (to skip the function
665# that called this one, which is at depth 0).
666sub debug_print_caller {
667 my $depth = shift(@_);
668 $depth = 1 unless $depth; # start at 1 to skip printing the function that called this one
669 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller($depth);
670 my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
671 print STDERR "** Calling method at depth $depth: $lcfilename:$cline $cpackage->$csubr\n";
672}
673
674# Debug function to print the call stack.
675# Optional param maxdepth: how many callers up the stack to print, *besides* this function's own
676# caller. If maxdepth parameter unspecified, prints the entire call stack.
677sub debug_print_call_stack {
678 my $maxdepth = shift(@_);
679 if($maxdepth) {
680 print STDERR "** CALL STACK UP TO AND INCL. MAX DEPTH OF $maxdepth:\n";
681 } else {
682 print STDERR "** FULL CALL STACK:\n";
683 }
684
685 my $depth = 0; # start by just printing this sub's calling function too
686 while(1) {
687 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller($depth);
688 last unless (defined $cfilename && defined $cline && defined $cpackage); # when call stack printed in full
689 my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
690 print STDERR "\t$lcfilename:$cline $cpackage->$csubr\n";
691 $depth++;
692 # print out caller at $maxdepth too, even though $depth starts at 0
693 # So this method prints out maxdepth+1 callers
694 last if($maxdepth && $depth > $maxdepth);
695 }
696 return "";
697}
698
699
700# returns 1 if filename1 and filename2 point to the same
701# file or directory
702sub filenames_equal {
703 my ($filename1, $filename2) = @_;
704
705 # use filename_cat to clean up trailing slashes and
706 # multiple slashes
707 $filename1 = &FileUtils::filenameConcatenate($filename1);
708 $filename2 = &FileUtils::filenameConcatenate($filename2);
709
710 # filenames not case sensitive on windows
711 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
712 $filename1 =~ tr/[A-Z]/[a-z]/;
713 $filename2 =~ tr/[A-Z]/[a-z]/;
714 }
715 return 1 if $filename1 eq $filename2;
716 return 0;
717}
718
719# If filename is relative to within_dir, returns the relative path of filename to that directory
720# with slashes in the filename returned as they were in the original (absolute) filename.
721sub filename_within_directory
722{
723 my ($filename,$within_dir) = @_;
724
725 if ($within_dir !~ m/[\/\\]$/) {
726 my $dirsep = &util::get_dirsep();
727 $within_dir .= $dirsep;
728 }
729
730 $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets
731 if ($filename =~ m/^$within_dir(.*)$/) {
732 $filename = $1;
733 }
734
735 return $filename;
736}
737
738# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
739# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
740# The subpath returned will also be a URL type filename.
741sub filename_within_directory_url_format
742{
743 my ($filename,$within_dir) = @_;
744
745 # convert parameters only to / slashes if Windows
746
747 my $filename_urlformat = &filepath_to_url_format($filename);
748 my $within_dir_urlformat = &filepath_to_url_format($within_dir);
749
750 #if ($within_dir_urlformat !~ m/\/$/) {
751 # make sure directory ends with a slash
752 #$within_dir_urlformat .= "/";
753 #}
754
755 my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
756
757 #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
758
759 # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
760 if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
761 $filename_urlformat = $1;
762 }
763
764 return $filename_urlformat;
765}
766
767# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
768# since on Linux it doesn't represent a file separator but an escape char).
769sub filepath_to_url_format
770{
771 my ($filepath) = @_;
772 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
773 # Only need to worry about Windows, as Unix style directories already in url-format
774 # Convert Windows style \ => /
775 $filepath =~ s@\\@/@g;
776 }
777 return $filepath;
778}
779
780# regex filepaths on windows may include \\ as path separator. Convert \\ to /
781sub filepath_regex_to_url_format
782{
783 my ($filepath) = @_;
784 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
785 # Only need to worry about Windows, as Unix style directories already in url-format
786 # Convert Windows style \\ => /
787 $filepath =~ s@\\\\@/@g;
788 }
789 return $filepath;
790
791}
792
793# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
794# and ignores trailing /
795# returns (file, dirs) dirs will be empty if no subdirs
796sub url_fileparse
797{
798 my ($filepath) = @_;
799 # remove trailing /
800 $filepath =~ s@/$@@;
801 if ($filepath !~ m@/@) {
802 return ($filepath, "");
803 }
804 my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
805 return ($file, $dirs);
806
807}
808
809
810sub filename_within_collection
811{
812 my ($filename) = @_;
813
814 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
815
816 if (defined $collect_dir) {
817
818 # if from within GSDLCOLLECTDIR, then remove directory prefix
819 # so source_filename is realative to it. This is done to aid
820 # portability, i.e. the collection can be moved to somewhere
821 # else on the file system and the archives directory will still
822 # work. This is needed, for example in the applet version of
823 # GLI where GSDLHOME/collect on the server will be different to
824 # the collect directory of the remove user. Of course,
825 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
826 # it back into a full pathname.
827
828 $filename = filename_within_directory($filename,$collect_dir);
829 }
830
831 return $filename;
832}
833
834sub prettyprint_file
835{
836 my ($base_dir,$file,$gli) = @_;
837
838 my $filename_full_path = &FileUtils::filenameConcatenate($base_dir,$file);
839
840 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
841 require Win32;
842
843 # For some reason base_dir in the form c:/a/b/c
844 # This leads to confusion later on, so turn it back into
845 # the more usual Windows form
846 $base_dir =~ s/\//\\/g;
847 my $long_base_dir = Win32::GetLongPathName($base_dir);
848 my $long_full_path = Win32::GetLongPathName($filename_full_path);
849
850 $file = filename_within_directory($long_full_path,$long_base_dir);
851 $file = encode("utf8",$file) if ($gli);
852 }
853
854 return $file;
855}
856
857
858sub upgrade_if_dos_filename
859{
860 my ($filename_full_path,$and_encode) = @_;
861
862 # if filename_full_path is empty, return it as such even on windows, as that's what happens on linux and anything else breaks PDF bg image assoc files on Windows.
863 return $filename_full_path if($filename_full_path eq "");
864
865 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
866 # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
867 # to its long (Windows) version
868 # GetLongPathName doesn't work if the file doesn't exist - in this case, use the directory instead
869
870 if (-e $filename_full_path) {
871 my $long_filename = Win32::GetLongPathName($filename_full_path);
872 if (defined $long_filename) {
873
874 $filename_full_path = $long_filename;
875 }
876 } else {
877
878 my ($tailname, $dirname, $suffix)
879 = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
880 my $long_dirname = Win32::GetLongPathName($dirname);
881 if (defined $long_dirname) {
882 $filename_full_path = &FileUtils::filenameConcatenate($long_dirname, "$tailname$suffix");
883 }
884 }
885
886 # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
887 $filename_full_path =~ s/^(.):/\u$1:/;
888
889 if ((defined $and_encode) && ($and_encode)) {
890 $filename_full_path = encode("utf8",$filename_full_path);
891 }
892 }
893
894 return $filename_full_path;
895}
896
897sub downgrade_if_dos_filename
898{
899 my ($filename_full_path) = @_;
900
901 if (($ENV{'GSDLOS'} =~ m/^windows$/i) && ($^O ne "cygwin")) {
902 require Win32;
903
904 # Ensure the given long Windows filename is in a form that can
905 # be opened by Perl => convert it to a short DOS-like filename
906 # GetShortPathName doesn't work if the file doesn't exist - in this case, use the directory instead
907 if (-e $filename_full_path) {
908 my $short_filename = Win32::GetShortPathName($filename_full_path);
909 if (defined $short_filename) {
910 $filename_full_path = $short_filename;
911 }
912 } else {
913 my ($tailname, $dirname, $suffix)
914 = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
915 my $short_dirname = Win32::GetShortPathName($dirname);
916 if (defined $short_dirname) {
917 $filename_full_path = &FileUtils::filenameConcatenate($short_dirname, "$tailname$suffix");
918 }
919
920 }
921 # Make sure initial drive letter is lower-case (to fit in
922 # with rest of Greenstone)
923 $filename_full_path =~ s/^(.):/\u$1:/;
924 }
925
926 return $filename_full_path;
927}
928
929
930sub filename_is_absolute
931{
932 warnings::warnif("deprecated", "util::filename_is_absolute() is deprecated, using FileUtils::isFilenameAbsolute() instead");
933 return &FileUtils::isFilenameAbsolute(@_);
934}
935
936
937## @method make_absolute()
938#
939# Ensure the given file path is absolute in respect to the given base path.
940#
941# @param $base_dir A string denoting the base path the given dir must be
942# absolute to.
943# @param $dir The directory to be made absolute as a string. Note that the
944# dir may already be absolute, in which case it will remain
945# unchanged.
946# @return The now absolute form of the directory as a string.
947#
948# @author John Thompson, DL Consulting Ltd.
949# @copy 2006 DL Consulting Ltd.
950#
951#used in buildcol.pl, doesn't work for all cases --kjdon
952sub make_absolute {
953
954 my ($base_dir, $dir) = @_;
955 ### print STDERR "dir = $dir\n";
956 $dir =~ s/[\\\/]+/\//g;
957 $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
958 $dir =~ s|^/tmp_mnt||;
959 1 while($dir =~ s|/[^/]*/\.\./|/|g);
960 $dir =~ s|/[.][.]?/|/|g;
961 $dir =~ tr|/|/|s;
962 ### print STDERR "dir = $dir\n";
963
964 return $dir;
965}
966## make_absolute() ##
967
968sub get_dirsep {
969
970 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
971 return "\\";
972 } else {
973 return "\/";
974 }
975}
976
977sub get_os_dirsep {
978
979 if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
980 return "\\\\";
981 } else {
982 return "\\\/";
983 }
984}
985
986sub get_re_dirsep {
987
988 return "\\\\|\\\/";
989}
990
991
992sub get_dirsep_tail {
993 my ($filename) = @_;
994
995 # returns last part of directory or filename
996 # On unix e.g. a/b.d => b.d
997 # a/b/c => c
998
999 my $dirsep = get_re_dirsep();
1000 my @dirs = split (/$dirsep/, $filename);
1001 my $tail = pop @dirs;
1002
1003 # - caused problems under windows
1004 #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
1005
1006 return $tail;
1007}
1008
1009
1010# if this is running on windows we want binaries to end in
1011# .exe, otherwise they don't have to end in any extension
1012sub get_os_exe {
1013 return ".exe" if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin"));
1014 return "";
1015}
1016
1017
1018# test to see whether this is a big or little endian machine
1019sub is_little_endian
1020{
1021 # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
1022 # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
1023 # Otherwise, it's little endian
1024
1025 #return 0 if $^O =~ /^darwin$/i;
1026 #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
1027
1028 # Going back to stating exactly whether the machine is little endian
1029 # or big endian, without any special case for Macs. Since for rata it comes
1030 # back with little endian and for shuttle with bigendian.
1031 return (ord(substr(pack("s",1), 0, 1)) == 1);
1032}
1033
1034
1035# will return the collection name if successful, "" otherwise
1036sub use_collection {
1037 my ($collection, $collectdir) = @_;
1038
1039 if (!defined $collectdir || $collectdir eq "") {
1040 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect");
1041 }
1042
1043 if (!defined $ENV{'GREENSTONEHOME'}) { # for GS3, would have been defined in use_site_collection, to GSDL3HOME
1044 $ENV{'GREENSTONEHOME'} = $ENV{'GSDLHOME'};
1045 }
1046
1047 # get and check the collection
1048 if (!defined($collection) || $collection eq "") {
1049 if (defined $ENV{'GSDLCOLLECTION'}) {
1050 $collection = $ENV{'GSDLCOLLECTION'};
1051 } else {
1052 print STDOUT "No collection specified\n";
1053 return "";
1054 }
1055 }
1056
1057 if ($collection eq "modelcol") {
1058 print STDOUT "You can't use modelcol.\n";
1059 return "";
1060 }
1061
1062 # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
1063 # are defined
1064 $ENV{'GSDLCOLLECTION'} = $collection;
1065 $ENV{'GSDLCOLLECTHOME'} = $collectdir;
1066 $ENV{'GSDLCOLLECTDIR'} = &FileUtils::filenameConcatenate($collectdir, $collection);
1067
1068 # make sure this collection exists
1069 if (!-e $ENV{'GSDLCOLLECTDIR'}) {
1070 print STDOUT "Invalid collection ($collection).\n";
1071 return "";
1072 }
1073
1074 # everything is ready to go
1075 return $collection;
1076}
1077
1078sub get_current_collection_name {
1079 return $ENV{'GSDLCOLLECTION'};
1080}
1081
1082
1083# will return the collection name if successful, "" otherwise.
1084# Like use_collection (above) but for greenstone 3 (taking account of site level)
1085
1086sub use_site_collection {
1087 my ($site, $collection, $collectdir) = @_;
1088
1089 if (!defined $collectdir || $collectdir eq "") {
1090 die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
1091 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, "sites", $site, "collect");
1092 }
1093
1094 if (defined $ENV{'GSDL3HOME'}) {
1095 $ENV{'GREENSTONEHOME'} = $ENV{'GSDL3HOME'};
1096 $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
1097 } elsif (defined $ENV{'GSDL3SRCHOME'}) {
1098 $ENV{'GREENSTONEHOME'} = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "web");
1099 $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
1100 } else {
1101 print STDERR "*** util::use_site_collection(). Warning: Neither GSDL3HOME nor GSDL3SRCHOME set.\n";
1102 }
1103
1104 # collectdir explicitly set by this point (using $site variable if required).
1105 # Can call "old" gsdl2 use_collection now.
1106
1107 return use_collection($collection,$collectdir);
1108}
1109
1110
1111
1112sub locate_config_file
1113{
1114 my ($file) = @_;
1115
1116 my $locations = locate_config_files($file);
1117
1118 return shift @$locations; # returns undef if 'locations' is empty
1119}
1120
1121
1122sub locate_config_files
1123{
1124 my ($file) = @_;
1125
1126 my @locations = ();
1127
1128 if (-e $file) {
1129 # Clearly specified (most likely full filename)
1130 # No need to hunt in 'etc' directories, return value unchanged
1131 push(@locations,$file);
1132 }
1133 else {
1134 # Check for collection specific one before looking in global GSDL 'etc'
1135 if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
1136 my $test_collect_etc_filename
1137 = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"etc", $file);
1138
1139 if (-e $test_collect_etc_filename) {
1140 push(@locations,$test_collect_etc_filename);
1141 }
1142 }
1143 my $test_main_etc_filename
1144 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"etc", $file);
1145 if (-e $test_main_etc_filename) {
1146 push(@locations,$test_main_etc_filename);
1147 }
1148 }
1149
1150 return \@locations;
1151}
1152
1153
1154sub hyperlink_text
1155{
1156 my ($text) = @_;
1157
1158 $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
1159 $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
1160
1161 return $text;
1162}
1163
1164
1165# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
1166# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
1167sub is_dir_empty {
1168 warnings::warnif("deprecated", "util::is_dir_empty() is deprecated, using FileUtils::isDirectoryEmpty() instead");
1169 return &FileUtils::isDirectoryEmpty(@_);
1170}
1171
1172# Returns the given filename converted using either URL encoding or base64
1173# encoding, as specified by $rename_method. If the given filename has no suffix
1174# (if it is just the tailname), then $no_suffix should be some defined value.
1175# rename_method can be url, none, base64
1176sub rename_file {
1177 my ($filename, $rename_method, $no_suffix) = @_;
1178
1179 if(!$filename) { # undefined or empty string
1180 return $filename;
1181 }
1182
1183 if (!$rename_method) {
1184 print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
1185 # Debugging information
1186 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1187 # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
1188 $rename_method = "url";
1189 } elsif($rename_method eq "none") {
1190 return $filename; # would have already been renamed
1191 }
1192
1193 # No longer replace spaces with underscores, since underscores mess with incremental rebuild
1194 ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
1195 ###$filename =~ s/ /_/g;
1196
1197 my ($tailname,$dirname,$suffix);
1198 if($no_suffix) { # given a tailname, no suffix
1199 ($tailname,$dirname) = File::Basename::fileparse($filename);
1200 }
1201 else {
1202 ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
1203 }
1204 if (!$suffix) {
1205 $suffix = "";
1206 }
1207 # This breaks GLI matching extracted metadata to files in Enrich panel, as
1208 # original is eg .JPG while gsdlsourcefilename ends up .jpg
1209 # Not sure why it was done in first place...
1210 #else {
1211 #$suffix = lc($suffix);
1212 #}
1213
1214 if ($rename_method eq "url") {
1215 $tailname = &unicode::url_encode($tailname);
1216 }
1217 elsif ($rename_method eq "base64") {
1218 $tailname = &unicode::force_base64_encode($tailname);
1219 $tailname =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
1220 }
1221
1222 $filename = "$tailname$suffix";
1223 $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
1224
1225 return $filename;
1226}
1227
1228
1229# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
1230sub rename_ldb_or_bdb_file {
1231 my ($filename_no_ext) = @_;
1232
1233 my $new_filename = "$filename_no_ext.gdb";
1234 return if (-f $new_filename); # if the file has the right extension, don't need to do anything
1235 # try ldb
1236 my $old_filename = "$filename_no_ext.ldb";
1237
1238 if (-f $old_filename) {
1239 print STDERR "Renaming $old_filename to $new_filename\n";
1240 rename ($old_filename, $new_filename)
1241 || print STDERR "Rename failed: $!\n";
1242 return;
1243 }
1244 # try bdb
1245 $old_filename = "$filename_no_ext.bdb";
1246 if (-f $old_filename) {
1247 print STDERR "Renaming $old_filename to $new_filename\n";
1248 rename ($old_filename, $new_filename)
1249 || print STDERR "Rename failed: $!\n";
1250 return;
1251 }
1252}
1253
1254sub os_dir() {
1255
1256 my $gsdlarch = "";
1257 if(defined $ENV{'GSDLARCH'}) {
1258 $gsdlarch = $ENV{'GSDLARCH'};
1259 }
1260 return $ENV{'GSDLOS'}.$gsdlarch;
1261}
1262
1263# returns 1 if this (GS server) is a GS3 installation, returns 0 if it's GS2.
1264sub is_gs3() {
1265 if($ENV{'GSDL3SRCHOME'}) {
1266 return 1;
1267 } else {
1268 return 0;
1269 }
1270}
1271
1272# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1273# By default, /greenstone3 for GS3 or /greenstone for GS2.
1274sub get_greenstone_url_prefix() {
1275 # if already set on a previous occasion, just return that
1276 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1277 return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1278
1279 my ($configfile, $urlprefix, $defaultUrlprefix);
1280 my @propertynames = ();
1281
1282 if($ENV{'GSDL3SRCHOME'}) {
1283 $defaultUrlprefix = "/greenstone3";
1284 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1285 push(@propertynames, qw/path\s*\=/);
1286 } else {
1287 $defaultUrlprefix = "/greenstone";
1288 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1289 push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1290 }
1291
1292 $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1293
1294 if(!$urlprefix) { # no values found for URL prefix, use default values
1295 $urlprefix = $defaultUrlprefix;
1296 } else {
1297 #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1298 $urlprefix =~ s/^\///; # remove the starting slash
1299 my @dirs = split(/(\\|\/)/, $urlprefix);
1300 $urlprefix = shift(@dirs);
1301
1302 if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1303 $urlprefix = "/$urlprefix";
1304 }
1305 }
1306
1307 # set for the future
1308 $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1309 # print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1310 return $urlprefix;
1311}
1312
1313
1314
1315#
1316# The following comes from activate.pl
1317#
1318# Designed to work with a server included with GS.
1319# - For GS2, we derive the URL from the llssite.cfg file.
1320# - For GS3, we ask ant for the library URL. For GS3, we get the local *http* URL
1321# by default, something like http://127.0.0.1:<httpPort>/greenstone3/library).
1322# Pass in $get_public_url=1 to get something like
1323# <default.protocol>://<tomcat.server>:<default.port>/greenstone/library
1324
1325sub get_full_greenstone_url_prefix
1326{
1327 my ($gs_mode, $lib_name, $get_public_url) = @_;
1328
1329 # if already set on a previous occasion, just return that
1330 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1331 return $ENV{'FULL_GREENSTONE_URL_PREFIX'} if($ENV{'FULL_GREENSTONE_URL_PREFIX'});
1332
1333 # set gs_mode if it was not passed in (servercontrol.pm would pass it in, any other callers won't)
1334 $gs_mode = ($ENV{'GSDL3SRCHOME'}) ? "gs3" : "gs2" unless defined $gs_mode;
1335
1336 my $url = undef;
1337
1338 if($gs_mode eq "gs2") {
1339 my $llssite_cfg = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "llssite.cfg");
1340
1341 if(-f $llssite_cfg) {
1342 # check llssite.cfg for line with url property
1343 # for server.exe also need to use portnumber and enterlib properties
1344 # The following file reading section is a candidate to use FileUtils::readUTF8File()
1345 # in place of calling sysread() directly. But only if we can reason we'd be working with UTF8
1346 # Read in the entire contents of the file in one hit
1347 if (!open (FIN, $llssite_cfg)) {
1348 print STDERR "util::get_full_greenstone_url_prefix() failed to open $llssite_cfg ($!)\n";
1349 return undef;
1350 }
1351
1352 my $contents;
1353 sysread(FIN, $contents, -s FIN);
1354 close(FIN);
1355
1356 my @lines = split(/[\n\r]+/, $contents); # split on carriage-returns and/or linefeeds
1357 my $enterlib = "";
1358 my $portnumber = "8282"; # will remain empty (implicit port 80) unless it's specifically been assigned
1359
1360 foreach my $line (@lines) {
1361 if($line =~ m/^url=(.*)$/) {
1362 $url = $1;
1363 } elsif($line =~ m/^enterlib=(.*)$/) {
1364 $enterlib = $1;
1365 } elsif($line =~ m/^portnumber=(.*)$/) {
1366 $portnumber = $1;
1367 }
1368 }
1369
1370 if(!$url) {
1371 return undef;
1372 }
1373 elsif($url eq "URL_pending") { # library is not running
1374 # do not process url=URL_pending in the file, since for server.exe
1375 # this just means the Enter Library button hasn't been pressed yet
1376 $url = undef;
1377 }
1378 else {
1379 # In the case of server.exe, need to do extra work to get the proper URL
1380 # But first, need to know whether we're indeed dealing with server.exe:
1381
1382 # compare the URL's domain to the full URL
1383 # E.g. for http://localhost:8383/greenstone3/cgi-bin, the domain is localhost:8383
1384 my $uri = URI->new( $url );
1385 my $host = $uri->host;
1386 #print STDERR "@@@@@ host: $host\n";
1387 if($url =~ m/https?:\/\/$host(\/)?$/) {
1388 #if($url !~ m/https?:\/\/$host:$portnumber(\/)?/ || $url =~ m/https?:\/\/$host(\/)?$/) {
1389 # (if the URL does not contain the portnumber, OR if the port is implicitly 80 and)
1390 # If the domain with http:// prefix is completely the same as the URL, assume server.exe
1391 # then the actual URL is the result of suffixing the port and enterlib properties in llssite.cfg
1392 $url = $url.":".$portnumber.$enterlib;
1393 } # else, apache web server
1394
1395 }
1396 }
1397 } elsif($gs_mode eq "gs3") {
1398 # Either check build.properties for tomcat.server, tomcat.port and app.name (and default servlet name).
1399 # app.name is stored in app.path by build.xml. Need to move app.name in build.properties from build.xml
1400
1401 # Or, run the new target get-local-http-servlet-url / get-default-servlet-url
1402 # the output can look like:
1403 #
1404 # Buildfile: build.xml
1405 # [echo] os.name: Windows Vista
1406 #
1407 # get-default-servlet-url:
1408 # [echo] http://localhost:8383/greenstone3/library
1409 # BUILD SUCCESSFUL
1410 # Total time: 0 seconds
1411
1412 #my $output = qx/ant get-default-servlet-url/; # backtick operator, to get STDOUT (else 2>&1)
1413 # - see http://stackoverflow.com/questions/799968/whats-the-difference-between-perls-backticks-system-and-exec
1414
1415 # The get-local-http-servlet-url (or get-default-servlet-url) ant target can be run from anywhere by specifying the
1416 # location of GS3's ant build.xml buildfile. Activate.pl can be run from anywhere for GS3
1417 # GSDL3SRCHOME will be set for GS3 by gs3-setup.sh, a step that would have been necessary
1418 # to run the activate.pl script in the first place
1419
1420 # The default is to get-local-http-servlet-url (of the form http://127.0.0.1:<httpPort>/greentone3/library)
1421 my $full_build_xml = &FileUtils::javaFilenameConcatenate($ENV{'GSDL3SRCHOME'},"build.xml");
1422
1423 my $perl_command = $get_public_url ? "get-default-servlet-url" : "get-local-http-servlet-url";
1424 $perl_command = "ant -buildfile \"$full_build_xml\" $perl_command";
1425
1426 if (open(PIN, "$perl_command |")) {
1427 while (defined (my $perl_output_line = <PIN>)) {
1428
1429 if($perl_output_line =~ m@(https?):\/\/(\S*)@) { # grab all the non-whitespace chars
1430 $url="$1://".$2; # preserve the http protocol #$url="http://".$1;
1431 }
1432 }
1433 close(PIN);
1434
1435 # url can be undef if tomcat.port could not be determined due to
1436 # user having wrong or conflicting server related vals in build.props
1437 if (defined $url && defined $lib_name && $lib_name ne "") {
1438 # replace the servlet_name portion of the url found, with the given library_name
1439 $url =~ s@/[^/]*$@/$lib_name@;
1440 }
1441 } else {
1442 print STDERR "util::get_full_greenstone_url_prefix() failed to run $perl_command to work out library URL for $gs_mode\n";
1443 }
1444 }
1445
1446 # either the url is still undef or it is now set
1447 #print STDERR "\n@@@@@ final URL:|$url|\n" if $url;
1448 #print STDERR "\n@@@@@ URL still undef\n" if !$url;
1449
1450 $ENV{'FULL_GREENSTONE_URL_PREFIX'} = $url;
1451
1452 return $url;
1453}
1454
1455
1456# Given a config file (xml or java properties file) and a list/array of regular expressions
1457# that represent property names to match on, this function will return the value for the 1st
1458# matching property name. If the return value is undefined, no matching property was found.
1459sub extract_propvalue_from_file() {
1460 my ($configfile, $propertynames) = @_;
1461
1462 my $value;
1463 unless(open(FIN, "<$configfile")) {
1464 print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1465 return $value; # not initialised
1466 }
1467
1468 # Read the entire file at once, as one single line, then close it
1469 my $filecontents;
1470 {
1471 local $/ = undef;
1472 $filecontents = <FIN>;
1473 }
1474 close(FIN);
1475
1476 foreach my $regex (@$propertynames) {
1477 ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1478 if($value) {
1479 $value =~ s/^\"//; # remove any startquotes
1480 $value =~ s/\".*$//; # remove the 1st endquotes (if any) followed by any xml
1481 last; # found value for a matching property, break from loop
1482 }
1483 }
1484
1485 return $value;
1486}
1487
1488# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1489# given that perllib is in @INC in order to invoke this subroutine.
1490# Call as follows -- after setting up INC to include perllib and
1491# after setting up GSDLHOME and GSDLOS:
1492#
1493# require util;
1494# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1495#
1496sub setup_greenstone_env() {
1497 my ($GSDLHOME, $GSDLOS) = @_;
1498
1499 #my %env_map = ();
1500 # Get the localised ENV settings of running a localised source setup.bash
1501 # and put it into the ENV here. Need to clear GSDLHOME before running setup
1502 #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1503 my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";
1504 if (($GSDLOS =~ m/windows/i) && ($^O ne "cygwin")) {
1505 #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1506 $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1507 }
1508 if (!open(PIN, "$perl_command |")) {
1509 print STDERR ("Unable to execute command: $perl_command. $!\n");
1510 }
1511
1512 my $lastkey;
1513 while (defined (my $perl_output_line = <PIN>)) {
1514 my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1515 if(defined $key) {
1516 #$env_map{$key}=$value;
1517 $ENV{$key}=$value;
1518 $lastkey = $key;
1519 } elsif($lastkey && $perl_output_line !~ m/^\s*$/) {
1520 # there was no equals sign in $perl_output_line, so this
1521 # $perl_output_line may be a spillover from the previous
1522 $ENV{$lastkey} = $ENV{$lastkey}."\n".$perl_output_line;
1523 }
1524 }
1525 close (PIN);
1526
1527 # If any keys in $ENV don't occur in Greenstone's localised env
1528 # (stored in $env_map), delete those entries from $ENV
1529 #foreach $key (keys %ENV) {
1530 # if(!defined $env_map{$key}) {
1531 # print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{$key}\n";
1532 # delete $ENV{$key}; # del $ENV(key, value) pair
1533 # }
1534 #}
1535 #undef %env_map;
1536}
1537
1538sub get_perl_exec() {
1539 my $perl_exec = $^X; # may return just "perl"
1540
1541 if($ENV{'PERLPATH'}) {
1542 # OR: # $perl_exec = &FileUtils::filenameConcatenate($ENV{'PERLPATH'},"perl");
1543 if (($ENV{'GSDLOS'} =~ m/windows/) && ($^O ne "cygwin")) {
1544 $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1545 } else {
1546 $perl_exec = "$ENV{'PERLPATH'}/perl";
1547 }
1548 } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1549 # containing the full path to the current perl executable we're using
1550 $perl_exec = $Config{perlpath}; # configured path for perl
1551 if (!-e $perl_exec) { # may not point to location on this machine
1552 $perl_exec = $^X; # may return just "perl"
1553 if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1554 print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";
1555 }
1556 }
1557 }
1558
1559 return $perl_exec;
1560}
1561
1562# returns the path to the java command in the JRE included with GS (if any),
1563# quoted to safeguard any spaces in this path, otherwise a simple java
1564# command is returned which assumes and will try for a system java.
1565sub get_java_command {
1566 my $java = "java";
1567 if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1568 # after running setup.bat or from GLI which also runs setup.bat
1569 my $java_bin = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"packages","jre","bin");
1570 if(-d $java_bin) {
1571 $java = &FileUtils::filenameConcatenate($java_bin,"java");
1572 $java = "\"".$java."\""; # quoted to preserve spaces in path
1573 }
1574 }
1575 return $java;
1576}
1577
1578
1579# Given the qualified collection name (colgroup/collection),
1580# returns the collection and colgroup parts
1581sub get_collection_parts {
1582 # http://perldoc.perl.org/File/Basename.html
1583 # my($filename, $directories, $suffix) = fileparse($path);
1584 # "$directories contains everything up to and including the last directory separator in the $path
1585 # including the volume (if applicable). The remainder of the $path is the $filename."
1586 #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);
1587
1588 my $qualified_collection = shift(@_);
1589
1590 # Since activate.pl can be launched from the command-line, including by a user,
1591 # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1592 # Also allow for the accidental inclusion of multiple slashes
1593 my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1594
1595 if(!defined $collection) {
1596 $collection = $colgroup;
1597 $colgroup = "";
1598 }
1599 return ($collection, $colgroup);
1600}
1601
1602# work out the "collectdir/collection" location
1603sub resolve_collection_dir {
1604 my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1605
1606 if (defined $ENV{'GSDLCOLLECTDIR'}) { # a predefined collection dir exists
1607 return $ENV{'GSDLCOLLECTDIR'};
1608 }
1609
1610 my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);
1611
1612 if (!defined $collect_dir || !$collect_dir) { # if undefined or empty string
1613 $collect_dir = &util::get_working_collect_dir($site);
1614 }
1615
1616 return &FileUtils::filenameConcatenate($collect_dir,$colgroup,$collection);
1617}
1618
1619# work out the full path to "collect" of this greenstone 2/3 installation
1620sub get_working_collect_dir {
1621 my ($site) = @_;
1622
1623 if (defined $ENV{'GSDLCOLLECTHOME'}) { # a predefined collect dir exists
1624 return $ENV{'GSDLCOLLECTHOME'};
1625 }
1626
1627 if (defined $site && $site) { # site non-empty, so get default collect dir for GS3
1628
1629 if (defined $ENV{'GSDL3HOME'}) {
1630 return &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'},"sites",$site,"collect"); # web folder
1631 }
1632 elsif (defined $ENV{'GSDL3SRCHOME'}) {
1633 return &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'},"web","sites",$site,"collect");
1634 }
1635 }
1636
1637 elsif (defined $ENV{'SITEHOME'}) {
1638 return &FileUtils::filenameConcatenate($ENV{'SITEHOME'},"collect");
1639 }
1640
1641 else { # get default collect dir for GS2
1642 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
1643 }
1644}
1645
1646sub is_abs_path_any_os {
1647 my ($path) = @_;
1648
1649 # We can have filenames in our DBs that were produced on other OS, so this method exists
1650 # to help identify absolute paths in such cases.
1651
1652 return 1 if($path =~ m@^/@); # full paths begin with forward slash on linux/mac
1653 return 1 if($path =~ m@^([a-zA-Z]\:|\\)@); # full paths begin with drive letter colon for Win or \ for volume, http://stackoverflow.com/questions/13011013/get-only-volume-name-from-filepath
1654
1655 return 0;
1656}
1657
1658
1659# This subroutine is for improving portability of Greenstone collections from one OS to another,
1660# to be used to convert absolute paths going into db files into paths with placeholders instead.
1661# This sub works with util::get_common_gs_paths and takes a path to a greenstone file and, if it's
1662# an absolute path, then it will replace the longest matching greenstone-path prefix of the given
1663# path with a placeholder to match.
1664# The Greenstone-path prefixes that can be matched are the following common Greenstone paths:
1665# the path to the current (specific) collection, the path to the general GS collect directory,
1666# the path to the site directory if GS3, else the path to the GSDLHOME/GSDL3HOME folder.
1667# The longest matching prefix will be replaced with the equivalent placeholder:
1668# @THISCOLLECTPATH@, else @COLLECTHOME@, else @SITEHOME@, else @GSDLHOME@.
1669sub abspath_to_placeholders {
1670 my $path = shift(@_); # path to convert from absolute to one with placeholders
1671 my $opt_long_or_short_winfilenames = shift(@_) || "short"; # whether we want to force use of long file names even on windows, default uses short
1672
1673 return $path unless is_abs_path_any_os($path); # path is relative
1674
1675 if ($opt_long_or_short_winfilenames eq "long") {
1676 $path = &util::upgrade_if_dos_filename($path); # will only do something on windows
1677 }
1678
1679 # now we know we're dealing with absolute paths and have to replace gs prefixes with placeholders
1680 my @gs_paths = ($ENV{'GSDLCOLLECTDIR'}, $ENV{'GSDLCOLLECTHOME'}, $ENV{'SITEHOME'}, $ENV{'GREENSTONEHOME'}); # list in this order: from longest to shortest path
1681
1682 my %placeholder_map = ($ENV{'GREENSTONEHOME'} => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1683 $ENV{'GSDLCOLLECTHOME'} => '@COLLECTHOME@',
1684 $ENV{'GSDLCOLLECTDIR'} => '@THISCOLLECTPATH@'
1685 );
1686 $placeholder_map{$ENV{'SITEHOME'}} = '@SITEHOME@' if defined $ENV{'SITEHOME'};
1687
1688 $path = &util::_abspath_to_placeholders($path, \@gs_paths, \%placeholder_map);
1689
1690 if ($ENV{'GSDLOS'} =~ /^windows$/i && $opt_long_or_short_winfilenames eq "short") {
1691 # for windows need to look for matches on short file names too
1692 # matched paths are again to be replaced with the usual placeholders
1693
1694 my $gsdlcollectdir = &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTDIR'});
1695 my $gsdlcollecthome = &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTHOME'});
1696 my $sitehome = (defined $ENV{'SITEHOME'}) ? &util::downgrade_if_dos_filename($ENV{'SITEHOME'}) : undef;
1697 my $greenstonehome = &util::downgrade_if_dos_filename($ENV{'GREENSTONEHOME'});
1698
1699 @gs_paths = ($gsdlcollectdir, $gsdlcollecthome, $sitehome, $greenstonehome); # order matters
1700
1701 %placeholder_map = ($greenstonehome => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1702 $gsdlcollecthome => '@COLLECTHOME@',
1703 $gsdlcollectdir => '@THISCOLLECTPATH@'
1704 );
1705 $placeholder_map{$sitehome} = '@SITEHOME@' if defined $sitehome;
1706
1707 $path = &util::_abspath_to_placeholders($path, \@gs_paths, \%placeholder_map);
1708 }
1709
1710 return $path;
1711}
1712
1713sub _abspath_to_placeholders {
1714 my ($path, $gs_paths_ref, $placeholder_map_ref) = @_;
1715
1716 # The sequence of elements in @gs_paths matters
1717 # Need to loop starting from the *longest* matching path (the path to the specific collection)
1718 # to the shortest matching path (the path to gsdlhome/gsdl3home folder):
1719
1720 foreach my $gs_path (@$gs_paths_ref) {
1721 next if(!defined $gs_path); # site undefined for GS2
1722
1723 my $re_path = &util::filename_to_regex($gs_path); # escape for regex
1724
1725 if($path =~ m/^$re_path/i) { # case sensitive or not for OS?
1726
1727 my $placeholder = $placeholder_map_ref->{$gs_path}; # get the placeholder to replace the matched path with
1728
1729 $path =~ s/^$re_path/$placeholder/; #case sensitive or not?
1730 #$path =~ s/^[\\\/]//; # remove gs_path's trailing separator left behind at the start of the path
1731 # lowercase file extension, This is needed when shortfilenames are used, as case affects alphetical ordering, which affects diffcol
1732 $path =~ s/\.([A-Z]+)$/".".lc($1)/e;
1733 last; # done
1734 }
1735 }
1736
1737 return $path;
1738}
1739
1740# Function that does the reverse of the util::abspath_to_placeholders subroutine
1741# Once again, call this with the values returned from util::get_common_gs_paths
1742sub placeholders_to_abspath {
1743 my $path = shift(@_); # path that can contain placeholders to convert to resolved absolute path
1744 my $opt_long_or_short_winfilenames = shift(@_) || "short"; # whether we want to force use of long file names even on windows, default uses short
1745
1746 return $path if($path !~ m/@/); # path contains no placeholders
1747
1748 # replace placeholders with gs prefixes
1749 my @placeholders = ('@THISCOLLECTPATH@', '@COLLECTHOME@', '@SITEHOME@', '@GSDLHOME@'); # order of paths not crucial in this case,
1750 # but listed here from longest to shortest once placeholders are have been resolved
1751
1752 # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1753 my %placeholder_to_gspath_map;
1754 if ($ENV{'GSDLOS'} =~ /^windows$/i && $opt_long_or_short_winfilenames eq "short") {
1755 # always replace placeholders with short file names of the absolute paths on windows?
1756 %placeholder_to_gspath_map = ('@GSDLHOME@' => &util::downgrade_if_dos_filename($ENV{'GREENSTONEHOME'}),
1757 '@COLLECTHOME@' => &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTHOME'}),
1758 '@THISCOLLECTPATH@' => &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTDIR'})
1759 );
1760 $placeholder_to_gspath_map{'@SITEHOME@'} = &util::downgrade_if_dos_filename($ENV{'SITEHOME'}) if defined $ENV{'SITEHOME'};
1761 } else {
1762 %placeholder_to_gspath_map = ('@GSDLHOME@' => $ENV{'GREENSTONEHOME'},
1763 '@SITEHOME@' => $ENV{'SITEHOME'}, # can be undef
1764 '@COLLECTHOME@' => $ENV{'GSDLCOLLECTHOME'},
1765 '@THISCOLLECTPATH@' => $ENV{'GSDLCOLLECTDIR'}
1766 ); # $placeholder_to_gspath_map{'@SITEHOME@'} = $ENV{'SITEHOME'} if defined $ENV{'SITEHOME'};
1767 }
1768
1769 foreach my $placeholder (@placeholders) {
1770 my $gs_path = $placeholder_to_gspath_map{$placeholder};
1771
1772 next if(!defined $gs_path); # sitehome for GS2 is undefined
1773
1774 if($path =~ m/^$placeholder/) {
1775 $path =~ s/^$placeholder/$gs_path/;
1776 last; # done
1777 }
1778 }
1779
1780 return $path;
1781}
1782
1783# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1784# a directory containing sequentially numbered images (and optional matching sequentially numbered .txt files).
1785sub create_itemfile
1786{
1787 my ($output_dir, $convert_basename, $convert_to) = @_;
1788 my $page_num = "";
1789
1790 opendir(DIR, $output_dir) || die "can't opendir $output_dir: $!";
1791 my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1792 closedir DIR;
1793
1794 # Sort files in the directory by page_num
1795 sub page_number {
1796 my ($dir) = @_;
1797 my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png|txt))?$/i);
1798 # my ($pagenum) =($dir =~ m/(\d+)(\.(jpg|gif|png))?$/i); # this works but is not as safe/strict about input filepatterns as the above
1799
1800 $pagenum = 1 unless defined $pagenum;
1801 return $pagenum;
1802 }
1803
1804 # sort the files in the directory in the order of page_num rather than lexically.
1805 @dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1806
1807 # work out if the numbering of the now sorted image files starts at 0 or not
1808 # by checking the number of the first _image_ file (skipping item files)
1809 my $starts_at_0 = 0;
1810 my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1811 if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1812 $starts_at_0 = 1;
1813 }
1814
1815 my $item_file = &FileUtils::filenameConcatenate($output_dir, $convert_basename.".item");
1816 my $item_fh;
1817 &FileUtils::openFileHandle($item_file, 'w', \$item_fh);
1818 print $item_fh "<PagedDocument>\n";
1819
1820 # In the past, sub create_itemfile() never output txtfile names into the item file (they were left as empty strings),
1821 # only image file names. Now that PDFBox is being customised for GS with the new GS_PDFToImagesAndText.java class to
1822 # create images of each PDF page and extract text for that page if extractable, we can have matching txt files for
1823 # each img file. So now we can output txt file names if we're working with txt files.
1824 # We just test if a text file exists in the same dir that matches the name of the first image file
1825 # if a matching txt file does not exist, don't output txtfile names into the item file
1826
1827 my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($firstfile, "\\.[^\\.]+\$"); # relative filenames so no dirname
1828 my $txtfilename = &FileUtils::filenameConcatenate($output_dir, $tailname . ".txt");
1829 my $hasTxtFile = &FileUtils::fileExists($txtfilename);
1830
1831 # Write out the elements of the item file.
1832 # We could be dealing with 3 types of conversion output formats: txt only (paged_text),
1833 # images only (pagedimg_) and images AND text (pagedimgtxt_).
1834 foreach my $file (@dir_files) {
1835 if ($file !~ /\.item/i) {
1836 $page_num = page_number($file);
1837 $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1838
1839 if ($convert_to eq "txt") { # output format is paged_text, which has no images
1840 if ($file =~ m/\.txt/i) { # check only txt files (should be all there is, besides the skipped .item file)
1841 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"\" txtfile=\"$page_num.txt\"/>\n";
1842 } # else, some non-txt file ext, skip
1843 }
1844 else { # either pagedimg or pagedimgtxt output mode
1845 if($file !~ /\.txt/i) { # check only img files, skip any matching txt files
1846 if($hasTxtFile) { # if every image has a matching txt file, output txtfile too
1847 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"$page_num.txt\"/>\n";
1848 } else { # when its pagedimg only, txtfile is empty
1849 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1850 }
1851 }
1852 }
1853 }
1854 }
1855
1856
1857 print $item_fh "</PagedDocument>\n";
1858 &FileUtils::closeFileHandle($item_file, \$item_fh);
1859 return $item_file;
1860}
1861
1862# Sets the gnomelib_env. Based on the logic in wvware.pl which can perhaps be replaced with a call to this function in future
1863sub set_gnomelib_env
1864{
1865 ## SET THE ENVIRONMENT AS DONE IN SETUP.BASH/BAT OF GNOME-LIB
1866 # Though this is only needed for darwin Lion at this point (and android, though that is untested)
1867
1868 my $libext = "so";
1869 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1870 return;
1871 } elsif ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
1872 $libext = "dylib";
1873 }
1874
1875 if (!defined $ENV{'GEXTGNOME'}) {
1876 ##print STDERR "@@@ Setting GEXTGNOME env\n";
1877
1878 my $gnome_dir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"ext","gnome-lib-minimal");
1879
1880 if(! -d $gnome_dir) {
1881 $gnome_dir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"ext","gnome-lib");
1882
1883 if(! -d $gnome_dir) {
1884 $gnome_dir = "";
1885 }
1886 }
1887
1888 # now set other the related env vars,
1889 # IF we've found the gnome-lib dir installed in the ext folder
1890
1891 if ($gnome_dir ne "" && -f &FileUtils::filenameConcatenate($gnome_dir, $ENV{'GSDLOS'}, "lib", "libiconv.$libext")) {
1892 $ENV{'GEXTGNOME'} = $gnome_dir;
1893 $ENV{'GEXTGNOME_INSTALLED'}=&FileUtils::filenameConcatenate($ENV{'GEXTGNOME'}, $ENV{'GSDLOS'});
1894
1895 my $gnomelib_bin = &FileUtils::filenameConcatenate($ENV{'GEXTGNOME_INSTALLED'}, "bin");
1896 if(-d $gnomelib_bin) { # no bin subfolder in GS binary's cutdown gnome-lib-minimal folder
1897 &util::envvar_prepend("PATH", $gnomelib_bin);
1898 }
1899
1900 # util's prepend will create LD/DYLD_LIB_PATH if it doesn't exist yet
1901 my $gextlib = &FileUtils::filenameConcatenate($ENV{'GEXTGNOME_INSTALLED'}, "lib");
1902
1903 if($ENV{'GSDLOS'} eq "linux") {
1904 &util::envvar_prepend("LD_LIBRARY_PATH", $gextlib);
1905 }
1906 elsif ($ENV{'GSDLOS'} eq "darwin") {
1907 #&util::envvar_prepend("DYLD_LIBRARY_PATH", $gextlib);
1908 &util::envvar_prepend("DYLD_FALLBACK_LIBRARY_PATH", $gextlib);
1909 }
1910 }
1911
1912 # Above largely mimics the setup.bash of the gnome-lib-minimal.
1913 # Not doing the devel-srcpack that gnome-lib-minimal's setup.bash used to set
1914 # Not exporting GSDLEXTS variable either
1915 }
1916
1917 # print STDERR "@@@@@ GEXTGNOME: ".$ENV{'GEXTGNOME'}."\n\tINSTALL".$ENV{'GEXTGNOME_INSTALLED'}."\n";
1918 # print STDERR "\tPATH".$ENV{'PATH'}."\n";
1919 # print STDERR "\tLD_LIB_PATH".$ENV{'LD_LIBRARY_PATH'}."\n" if $ENV{'LD_LIBRARY_PATH};
1920 # print STDERR "\tDYLD_FALLBACK_LIB_PATH".$ENV{'DYLD_FALLBACK_LIBRARY_PATH'}."\n" if $ENV{'DYLD_FALLBACK_LIBRARY_PATH};
1921
1922 # if no GEXTGNOME, maybe users didn't need gnome-lib to run gnomelib/libiconv dependent binaries like hashfile, suffix, wget
1923 # (wvware is launched in a gnomelib env from its own script, but could possibly go through this script in future)
1924}
1925
1926
1927
1928## @function augmentINC()
1929#
1930# Prepend a path (if it exists) onto INC but only if it isn't already in INC
1931# @param $new_path The path to add
1932# @author jmt12
1933#
1934sub augmentINC
1935{
1936 my ($new_path) = @_;
1937 my $did_add_path = 0;
1938 # might need to be replaced with FileUtils::directoryExists() call eventually
1939 if (-d $new_path)
1940 {
1941 my $did_find_path = 0;
1942 foreach my $existing_path (@INC)
1943 {
1944 if ($existing_path eq $new_path)
1945 {
1946 $did_find_path = 1;
1947 last;
1948 }
1949 }
1950 if (!$did_find_path)
1951 {
1952 unshift(@INC, $new_path);
1953 $did_add_path = 1;
1954 }
1955 }
1956 return $did_add_path;
1957}
1958## augmentINC()
1959
1960
19611;
Note: See TracBrowser for help on using the repository browser.