source: main/trunk/greenstone2/perllib/util.pm@ 28177

Last change on this file since 28177 was 28177, checked in by ak19, 11 years ago

Added new subroutines to convert from absolute paths to Greenstone locations into paths containing a placeholder for the longest Greenstone prefix matched (path to collection folder, collect folder, site folder or else to GSDLHOME/GSDL3HOME folder). The reverse function has also been added. These haven't been used yet where they need to be, but have been tested to work when called from a testing script.

  • Property svn:keywords set to Author Date Id Revision
File size: 47.7 KB
Line 
1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package util;
27
28use strict;
29use FileUtils;
30
31use Encode;
32use File::Copy;
33use File::Basename;
34# Config for getting the perlpath in the recommended way, though it uses paths that are
35# hard-coded into the Config file that's generated upon configuring and compiling perl.
36# $^X works better in some cases to return the path to perl used to launch the script,
37# but if launched with plain "perl" (no full-path), that will be just what it returns.
38use Config;
39# New module for file related utility functions - intended as a
40# placeholder for an extension that allows a variety of different
41# filesystems (FTP, HTTP, SAMBA, WEBDav, HDFS etc)
42use FileUtils;
43
44# removes files (but not directories)
45sub rm {
46 warnings::warnif("deprecated", "util::rm() is deprecated, using FileUtils::removeFiles() instead");
47 return &FileUtils::removeFiles(@_);
48}
49
50# recursive removal
51sub filtered_rm_r {
52 warnings::warnif("deprecated", "util::filtered_rm_r() is deprecated, using FileUtils::removeFilesFiltered() instead");
53 return &FileUtils::removeFilesFiltered(@_);
54}
55
56# recursive removal
57sub rm_r {
58 warnings::warnif("deprecated", "util::rm_r() is deprecated, using FileUtils::removeFilesRecursive() instead");
59 return &FileUtils::removeFilesRecursive(@_);
60}
61
62# moves a file or a group of files
63sub mv {
64 warnings::warnif("deprecated", "util::mv() is deprecated, using FileUtils::moveFiles() instead");
65 return &FileUtils::moveFiles(@_);
66}
67
68# Move the contents of source directory into target directory
69# (as opposed to merely replacing target dir with the src dir)
70# This can overwrite any files with duplicate names in the target
71# but other files and folders in the target will continue to exist
72sub mv_dir_contents {
73 warnings::warnif("deprecated", "util::mv_dir_contents() is deprecated, using FileUtils::moveDirectoryContents() instead");
74 return &FileUtils::moveDirectoryContents(@_);
75}
76
77# copies a file or a group of files
78sub cp {
79 warnings::warnif("deprecated", "util::cp() is deprecated, using FileUtils::copyFiles() instead");
80 return &FileUtils::copyFiles(@_);
81}
82
83# recursively copies a file or group of files
84# syntax: cp_r (sourcefiles, destination directory)
85# destination must be a directory - to copy one file to
86# another use cp instead
87sub cp_r {
88 warnings::warnif("deprecated", "util::cp_r() is deprecated, using FileUtils::copyFilesrecursive() instead");
89 return &FileUtils::copyFilesRecursive(@_);
90}
91
92# recursively copies a file or group of files
93# syntax: cp_r (sourcefiles, destination directory)
94# destination must be a directory - to copy one file to
95# another use cp instead
96sub cp_r_nosvn {
97 warnings::warnif("deprecated", "util::cp_r_nosvn() is deprecated, using FileUtils::copyFilesRecursiveNoSVN() instead");
98 return &FileUtils::copyFilesRecursiveNoSVN(@_);
99}
100
101# copies a directory and its contents, excluding subdirectories, into a new directory
102sub cp_r_toplevel {
103 warnings::warnif("deprecated", "util::cp_r_toplevel() is deprecated, using FileUtils::recursiveCopyTopLevel() instead");
104 return &FileUtils::recursiveCopyTopLevel(@_);
105}
106
107sub mk_dir {
108 warnings::warnif("deprecated", "util::mk_dir() is deprecated, using FileUtils::makeDirectory() instead");
109 return &FileUtils::makeDirectory(@_);
110}
111
112# in case anyone cares - I did some testing (using perls Benchmark module)
113# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
114# slightly faster (surprisingly) - Stefan.
115sub mk_all_dir {
116 warnings::warnif("deprecated", "util::mk_all_dir() is deprecated, using FileUtils::makeAllDirectories() instead");
117 return &FileUtils::makeAllDirectories(@_);
118}
119
120# make hard link to file if supported by OS, otherwise copy the file
121sub hard_link {
122 warnings::warnif("deprecated", "util::hard_link() is deprecated, using FileUtils::hardLink() instead");
123 return &FileUtils::hardLink(@_);
124}
125
126# make soft link to file if supported by OS, otherwise copy file
127sub soft_link {
128 warnings::warnif("deprecated", "util::soft_link() is deprecated, using FileUtils::softLink() instead");
129 return &FileUtils::softLink(@_);
130}
131
132# Primarily for filenames generated by processing
133# content of HTML files (which are mapped to UTF-8 internally)
134#
135# To turn this into an octet string that really exists on the file
136# system:
137# 1. don't need to do anything special for Unix-based systems
138# (as underlying file system is byte-code)
139# 2. need to map to short DOS filenames for Windows
140
141sub utf8_to_real_filename
142{
143 my ($utf8_filename) = @_;
144
145 my $real_filename;
146
147 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
148 require Win32;
149
150 print STDERR "***** utf8 filename = $utf8_filename\n\n\n";
151
152 my $unicode_filename = decode("utf8",$utf8_filename);
153 $real_filename = Win32::GetShortPathName($unicode_filename);
154 }
155 else {
156 $real_filename = $utf8_filename;
157 }
158
159 return $real_filename;
160}
161
162sub fd_exists {
163 warnings::warnif("deprecated", "util::fd_exists() is deprecated, using FileUtils::fileTest() instead");
164 return &FileUtils::fileTest(@_);
165}
166
167sub file_exists {
168 warnings::warnif("deprecated", "util::file_exists() is deprecated, using FileUtils::fileExists() instead");
169 return &FileUtils::fileExists(@_);
170}
171
172sub dir_exists {
173 warnings::warnif("deprecated", "util::dir_exists() is deprecated, using FileUtils::directoryExists() instead");
174 return &FileUtils::directoryExists(@_);
175}
176
177# updates a copy of a directory in some other part of the filesystem
178# verbosity settings are: 0=low, 1=normal, 2=high
179# both $fromdir and $todir should be absolute paths
180sub cachedir {
181 warnings::warnif("deprecated", "util::cachedir() is deprecated, using FileUtils::synchronizeDirectories() instead");
182 return &FileUtils::synchronizeDirectories(@_);
183}
184
185# this function returns -1 if either file is not found
186# assumes that $file1 and $file2 are absolute file names or
187# in the current directory
188# $file2 is allowed to be newer than $file1
189sub differentfiles {
190 warnings::warnif("deprecated", "util::differentfiles() is deprecated, using FileUtils::differentFiles() instead");
191 return &FileUtils::differentFiles(@_);
192}
193
194
195sub get_tmp_filename
196{
197 my $file_ext = shift(@_) || undef;
198
199 my $opt_dot_file_ext = "";
200 if (defined $file_ext) {
201 if ($file_ext !~ m/\./) {
202 # no dot, so needs one added in at start
203 $opt_dot_file_ext = ".$file_ext"
204 }
205 else {
206 # allow for "extensions" such as _metadata.txt to be handled
207 # gracefully
208 $opt_dot_file_ext = $file_ext;
209 }
210 }
211
212 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
213 &FileUtils::makeAllDirectories ($tmpdir) unless -e $tmpdir;
214
215 my $count = 1000;
216 my $rand = int(rand $count);
217 my $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
218
219 while (-e $full_tmp_filename) {
220 $rand = int(rand $count);
221 $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
222 $count++;
223 }
224
225 return $full_tmp_filename;
226}
227
228# These 2 are "static" variables used by the get_timestamped_tmp_folder() subroutine below and
229# belong with that function. They help ensure the timestamped tmp folders generated are unique.
230my $previous_timestamp = undef;
231my $previous_timestamp_f = 0; # frequency
232
233sub get_timestamped_tmp_folder
234{
235
236 my $tmp_dirname;
237 if(defined $ENV{'GSDLCOLLECTDIR'}) {
238 $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
239 } elsif(defined $ENV{'GSDLHOME'}) {
240 $tmp_dirname = $ENV{'GSDLHOME'};
241 } else {
242 return undef;
243 }
244
245 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "tmp");
246 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
247
248 # add the timestamp into the path otherwise we can run into problems
249 # if documents have the same name
250 my $timestamp = time;
251
252 if (!defined $previous_timestamp || ($timestamp > $previous_timestamp)) {
253 $previous_timestamp_f = 0;
254 $previous_timestamp = $timestamp;
255 } else {
256 $previous_timestamp_f++;
257 }
258
259 my $time_tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, $timestamp);
260 $tmp_dirname = $time_tmp_dirname;
261 my $i = $previous_timestamp_f;
262
263 if($previous_timestamp_f > 0) {
264 $tmp_dirname = $time_tmp_dirname."_".$i;
265 $i++;
266 }
267 while (-e $tmp_dirname) {
268 $tmp_dirname = $time_tmp_dirname."_".$i;
269 $i++;
270 }
271 &FileUtils::makeDirectory($tmp_dirname);
272
273 return $tmp_dirname;
274}
275
276sub get_timestamped_tmp_filename_in_collection
277{
278
279 my ($input_filename, $output_ext) = @_;
280 # derive tmp filename from input filename
281 my ($tailname, $dirname, $suffix)
282 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
283
284 # softlink to collection tmp dir
285 my $tmp_dirname = &util::get_timestamped_tmp_folder();
286 $tmp_dirname = $dirname unless defined $tmp_dirname;
287
288 # following two steps copied from ConvertBinaryFile
289 # do we need them?? can't use them as is, as they use plugin methods.
290
291 #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
292
293 # URLEncode this since htmls with images where the html filename is utf8 don't seem
294 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
295 # files on the filesystem.
296 #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
297 if (defined $output_ext) {
298 $output_ext = ".$output_ext"; # add the dot
299 } else {
300 $output_ext = $suffix;
301 }
302 $output_ext= lc($output_ext);
303 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$output_ext");
304
305 return $tmp_filename;
306}
307
308sub get_toplevel_tmp_dir
309{
310 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
311}
312
313
314sub filename_to_regex {
315 my $filename = shift (@_);
316
317 # need to make single backslashes double so that regex works
318 $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);
319
320 # note that the first part of a substitution is a regex, so RE chars need to be escaped,
321 # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
322 $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
323 $filename =~ s@\(@\\(@g; # escape brackets
324 $filename =~ s@\)@\\)@g; # escape brackets
325 $filename =~ s@\[@\\[@g; # escape brackets
326 $filename =~ s@\]@\\]@g; # escape brackets
327
328 return $filename;
329}
330
331sub unregex_filename {
332 my $filename = shift (@_);
333
334 # need to put doubled backslashes for regex back to single
335 $filename =~ s/\\\./\./g; # remove RE syntax for .
336 $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
337 $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
338 $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
339 $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
340
341 # \\ goes to \
342 # This is the last step in reverse mirroring the order of steps in filename_to_regex()
343 $filename =~ s/\\\\/\\/g; # remove RE syntax for \
344 return $filename;
345}
346
347sub filename_cat {
348 # I've disabled this warning for now, as every Greenstone perl
349 # script seems to make use of this function and so you drown in a
350 # sea of deprecated warnings [jmt12]
351# warnings::warnif("deprecated", "util::filename_cat() is deprecated, using FileUtils::filenameConcatenate() instead");
352 return &FileUtils::filenameConcatenate(@_);
353}
354
355
356sub pathname_cat {
357 my $first_path = shift(@_);
358 my (@pathnames) = @_;
359
360 # If first_path is not null or empty, then add it back into the list
361 if (defined $first_path && $first_path =~ /\S/) {
362 unshift(@pathnames, $first_path);
363 }
364
365 my $join_char;
366 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
367 $join_char = ";";
368 } else {
369 $join_char = ":";
370 }
371
372 my $pathname = join($join_char, @pathnames);
373
374 # remove duplicate slashes
375 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
376 $pathname =~ s/[\\\/]+/\\/g;
377 } else {
378 $pathname =~ s/[\/]+/\//g;
379 # DB: want a pathname abc\de.html to remain like this
380 }
381
382 return $pathname;
383}
384
385
386sub tidy_up_oid {
387 my ($OID) = @_;
388 if ($OID =~ /\./) {
389 print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
390 $OID =~ s/\.//g; #remove any periods
391 }
392 if ($OID =~ /^\s.*\s$/) {
393 print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
394 # remove starting and trailing whitespace
395 $OID =~ s/^\s+//;
396 $OID =~ s/\s+$//;
397 }
398 if ($OID =~ /^[\d]*$/) {
399 print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
400 $OID = "D" . $OID;
401 }
402
403 return $OID;
404}
405
406sub envvar_prepend {
407 my ($var,$val) = @_;
408
409 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
410 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
411
412 # do not prepend any value/path that's already in the environment variable
413
414 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
415 if (!defined($ENV{$var})) {
416 $ENV{$var} = "$val";
417 }
418 elsif($ENV{$var} !~ m/$escaped_val/) {
419 $ENV{$var} = "$val".$pathsep.$ENV{$var};
420 }
421}
422
423sub envvar_append {
424 my ($var,$val) = @_;
425
426 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
427 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
428
429 # do not append any value/path that's already in the environment variable
430
431 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
432 if (!defined($ENV{$var})) {
433 $ENV{$var} = "$val";
434 }
435 elsif($ENV{$var} !~ m/$escaped_val/) {
436 $ENV{$var} = $ENV{$var}.$pathsep."$val";
437 }
438}
439
440
441# splits a filename into a prefix and a tail extension using the tail_re, or
442# if that fails, splits on the file_extension . (dot)
443sub get_prefix_and_tail_by_regex {
444
445 my ($filename,$tail_re) = @_;
446
447 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
448 if ((!defined $file_prefix) || (!defined $file_ext)) {
449 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
450 }
451
452 return ($file_prefix,$file_ext);
453}
454
455# get full path and file only path from a base_dir (which may be empty) and
456# file (which may contain directories)
457sub get_full_filenames {
458 my ($base_dir, $file) = @_;
459
460 my $filename_full_path = $file;
461 # add on directory if present
462 $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\S/;
463
464 my $filename_no_path = $file;
465
466 # remove directory if present
467 $filename_no_path =~ s/^.*[\/\\]//;
468 return ($filename_full_path, $filename_no_path);
469}
470
471# returns the path of a file without the filename -- ie. the directory the file is in
472sub filename_head {
473 my $filename = shift(@_);
474
475 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
476 $filename =~ s/[^\\\\]*$//;
477 }
478 else {
479 $filename =~ s/[^\\\/]*$//;
480 }
481
482 return $filename;
483}
484
485
486
487# returns 1 if filename1 and filename2 point to the same
488# file or directory
489sub filenames_equal {
490 my ($filename1, $filename2) = @_;
491
492 # use filename_cat to clean up trailing slashes and
493 # multiple slashes
494 $filename1 = &FileUtils::filenameConcatenate($filename1);
495 $filename2 = &FileUtils::filenameConcatenate($filename2);
496
497 # filenames not case sensitive on windows
498 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
499 $filename1 =~ tr/[A-Z]/[a-z]/;
500 $filename2 =~ tr/[A-Z]/[a-z]/;
501 }
502 return 1 if $filename1 eq $filename2;
503 return 0;
504}
505
506# If filename is relative to within_dir, returns the relative path of filename to that directory
507# with slashes in the filename returned as they were in the original (absolute) filename.
508sub filename_within_directory
509{
510 my ($filename,$within_dir) = @_;
511
512 if ($within_dir !~ m/[\/\\]$/) {
513 my $dirsep = &util::get_dirsep();
514 $within_dir .= $dirsep;
515 }
516
517 $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets
518 if ($filename =~ m/^$within_dir(.*)$/) {
519 $filename = $1;
520 }
521
522 return $filename;
523}
524
525# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
526# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
527# The subpath returned will also be a URL type filename.
528sub filename_within_directory_url_format
529{
530 my ($filename,$within_dir) = @_;
531
532 # convert parameters only to / slashes if Windows
533
534 my $filename_urlformat = &filepath_to_url_format($filename);
535 my $within_dir_urlformat = &filepath_to_url_format($within_dir);
536
537 #if ($within_dir_urlformat !~ m/\/$/) {
538 # make sure directory ends with a slash
539 #$within_dir_urlformat .= "/";
540 #}
541
542 my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
543
544 #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
545
546 # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
547 if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
548 $filename_urlformat = $1;
549 }
550
551 return $filename_urlformat;
552}
553
554# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
555# since on Linux it doesn't represent a file separator but an escape char).
556sub filepath_to_url_format
557{
558 my ($filepath) = @_;
559 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
560 # Only need to worry about Windows, as Unix style directories already in url-format
561 # Convert Windows style \ => /
562 $filepath =~ s@\\@/@g;
563 }
564 return $filepath;
565}
566
567# regex filepaths on windows may include \\ as path separator. Convert \\ to /
568sub filepath_regex_to_url_format
569{
570 my ($filepath) = @_;
571 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
572 # Only need to worry about Windows, as Unix style directories already in url-format
573 # Convert Windows style \\ => /
574 $filepath =~ s@\\\\@/@g;
575 }
576 return $filepath;
577
578}
579
580# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
581# and ignores trailing /
582# returns (file, dirs) dirs will be empty if no subdirs
583sub url_fileparse
584{
585 my ($filepath) = @_;
586 # remove trailing /
587 $filepath =~ s@/$@@;
588 if ($filepath !~ m@/@) {
589 return ($filepath, "");
590 }
591 my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
592 return ($file, $dirs);
593
594}
595
596
597sub filename_within_collection
598{
599 my ($filename) = @_;
600
601 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
602
603 if (defined $collect_dir) {
604
605 # if from within GSDLCOLLECTDIR, then remove directory prefix
606 # so source_filename is realative to it. This is done to aid
607 # portability, i.e. the collection can be moved to somewhere
608 # else on the file system and the archives directory will still
609 # work. This is needed, for example in the applet version of
610 # GLI where GSDLHOME/collect on the server will be different to
611 # the collect directory of the remove user. Of course,
612 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
613 # it back into a full pathname.
614
615 $filename = filename_within_directory($filename,$collect_dir);
616 }
617
618 return $filename;
619}
620
621sub prettyprint_file
622{
623 my ($base_dir,$file,$gli) = @_;
624
625 my $filename_full_path = &FileUtils::filenameConcatenate($base_dir,$file);
626
627 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
628 require Win32;
629
630 # For some reason base_dir in the form c:/a/b/c
631 # This leads to confusion later on, so turn it back into
632 # the more usual Windows form
633 $base_dir =~ s/\//\\/g;
634 my $long_base_dir = Win32::GetLongPathName($base_dir);
635 my $long_full_path = Win32::GetLongPathName($filename_full_path);
636
637 $file = filename_within_directory($long_full_path,$long_base_dir);
638 $file = encode("utf8",$file) if ($gli);
639 }
640
641 return $file;
642}
643
644
645sub upgrade_if_dos_filename
646{
647 my ($filename_full_path,$and_encode) = @_;
648
649 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
650 # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
651 # to its long (Windows) version
652 my $long_filename = Win32::GetLongPathName($filename_full_path);
653 if (defined $long_filename) {
654 $filename_full_path = $long_filename;
655 }
656 # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
657 $filename_full_path =~ s/^(.):/\u$1:/;
658 if ((defined $and_encode) && ($and_encode)) {
659 $filename_full_path = encode("utf8",$filename_full_path);
660 }
661 }
662
663 return $filename_full_path;
664}
665
666
667sub downgrade_if_dos_filename
668{
669 my ($filename_full_path) = @_;
670
671 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
672 require Win32;
673
674 # Ensure the given long Windows filename is in a form that can
675 # be opened by Perl => convert it to a short DOS-like filename
676
677 my $short_filename = Win32::GetShortPathName($filename_full_path);
678 if (defined $short_filename) {
679 $filename_full_path = $short_filename;
680 }
681 # Make sure initial drive letter is lower-case (to fit in
682 # with rest of Greenstone)
683 $filename_full_path =~ s/^(.):/\u$1:/;
684 }
685
686 return $filename_full_path;
687}
688
689sub block_filename
690{
691 my ($block_hash,$filename) = @_;
692
693 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
694
695 # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
696 my $lower_filename = lc($filename);
697 $block_hash->{'file_blocks'}->{$lower_filename} = 1;
698# my $lower_drive = $filename;
699# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
700
701# my $upper_drive = $filename;
702# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
703#
704# $block_hash->{'file_blocks'}->{$lower_drive} = 1;
705# $block_hash->{'file_blocks'}->{$upper_drive} = 1;
706 }
707 else {
708 $block_hash->{'file_blocks'}->{$filename} = 1;
709 }
710}
711
712
713sub filename_is_absolute
714{
715 warnings::warnif("deprecated", "util::filename_is_absolute() is deprecated, using FileUtils::isFilenameAbsolute() instead");
716 return &FileUtils::isFilenameAbsolute(@_);
717}
718
719
720## @method make_absolute()
721#
722# Ensure the given file path is absolute in respect to the given base path.
723#
724# @param $base_dir A string denoting the base path the given dir must be
725# absolute to.
726# @param $dir The directory to be made absolute as a string. Note that the
727# dir may already be absolute, in which case it will remain
728# unchanged.
729# @return The now absolute form of the directory as a string.
730#
731# @author John Thompson, DL Consulting Ltd.
732# @copy 2006 DL Consulting Ltd.
733#
734#used in buildcol.pl, doesn't work for all cases --kjdon
735sub make_absolute {
736
737 my ($base_dir, $dir) = @_;
738### print STDERR "dir = $dir\n";
739 $dir =~ s/[\\\/]+/\//g;
740 $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
741 $dir =~ s|^/tmp_mnt||;
742 1 while($dir =~ s|/[^/]*/\.\./|/|g);
743 $dir =~ s|/[.][.]?/|/|g;
744 $dir =~ tr|/|/|s;
745### print STDERR "dir = $dir\n";
746
747 return $dir;
748}
749## make_absolute() ##
750
751sub get_dirsep {
752
753 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
754 return "\\";
755 } else {
756 return "\/";
757 }
758}
759
760sub get_os_dirsep {
761
762 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
763 return "\\\\";
764 } else {
765 return "\\\/";
766 }
767}
768
769sub get_re_dirsep {
770
771 return "\\\\|\\\/";
772}
773
774
775sub get_dirsep_tail {
776 my ($filename) = @_;
777
778 # returns last part of directory or filename
779 # On unix e.g. a/b.d => b.d
780 # a/b/c => c
781
782 my $dirsep = get_re_dirsep();
783 my @dirs = split (/$dirsep/, $filename);
784 my $tail = pop @dirs;
785
786 # - caused problems under windows
787 #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
788
789 return $tail;
790}
791
792
793# if this is running on windows we want binaries to end in
794# .exe, otherwise they don't have to end in any extension
795sub get_os_exe {
796 return ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
797 return "";
798}
799
800
801# test to see whether this is a big or little endian machine
802sub is_little_endian
803{
804 # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
805 # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
806 # Otherwise, it's little endian
807
808 #return 0 if $^O =~ /^darwin$/i;
809 #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
810
811 # Going back to stating exactly whether the machine is little endian
812 # or big endian, without any special case for Macs. Since for rata it comes
813 # back with little endian and for shuttle with bigendian.
814 return (ord(substr(pack("s",1), 0, 1)) == 1);
815}
816
817
818# will return the collection name if successful, "" otherwise
819sub use_collection {
820 my ($collection, $collectdir) = @_;
821
822 if (!defined $collectdir || $collectdir eq "") {
823 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect");
824 }
825
826 # get and check the collection
827 if (!defined($collection) || $collection eq "") {
828 if (defined $ENV{'GSDLCOLLECTION'}) {
829 $collection = $ENV{'GSDLCOLLECTION'};
830 } else {
831 print STDOUT "No collection specified\n";
832 return "";
833 }
834 }
835
836 if ($collection eq "modelcol") {
837 print STDOUT "You can't use modelcol.\n";
838 return "";
839 }
840
841 # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
842 # are defined
843 $ENV{'GSDLCOLLECTION'} = $collection;
844 $ENV{'GSDLCOLLECTDIR'} = &FileUtils::filenameConcatenate($collectdir, $collection);
845
846 # make sure this collection exists
847 if (!-e $ENV{'GSDLCOLLECTDIR'}) {
848 print STDOUT "Invalid collection ($collection).\n";
849 return "";
850 }
851
852 # everything is ready to go
853 return $collection;
854}
855
856sub get_current_collection_name {
857 return $ENV{'GSDLCOLLECTION'};
858}
859
860
861# will return the collection name if successful, "" otherwise.
862# Like use_collection (above) but for greenstone 3 (taking account of site level)
863
864sub use_site_collection {
865 my ($site, $collection, $collectdir) = @_;
866
867 if (!defined $collectdir || $collectdir eq "") {
868 die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
869 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, "sites", $site, "collect");
870 }
871
872 # collectdir explicitly set by this point (using $site variable if required).
873 # Can call "old" gsdl2 use_collection now.
874
875 return use_collection($collection,$collectdir);
876}
877
878
879
880sub locate_config_file
881{
882 my ($file) = @_;
883
884 my $locations = locate_config_files($file);
885
886 return shift @$locations; # returns undef if 'locations' is empty
887}
888
889
890sub locate_config_files
891{
892 my ($file) = @_;
893
894 my @locations = ();
895
896 if (-e $file) {
897 # Clearly specified (most likely full filename)
898 # No need to hunt in 'etc' directories, return value unchanged
899 push(@locations,$file);
900 }
901 else {
902 # Check for collection specific one before looking in global GSDL 'etc'
903 if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
904 my $test_collect_etc_filename
905 = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"etc", $file);
906
907 if (-e $test_collect_etc_filename) {
908 push(@locations,$test_collect_etc_filename);
909 }
910 }
911 my $test_main_etc_filename
912 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"etc", $file);
913 if (-e $test_main_etc_filename) {
914 push(@locations,$test_main_etc_filename);
915 }
916 }
917
918 return \@locations;
919}
920
921
922sub hyperlink_text
923{
924 my ($text) = @_;
925
926 $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
927 $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
928
929 return $text;
930}
931
932
933# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
934# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
935sub is_dir_empty {
936 warnings::warnif("deprecated", "util::is_dir_empty() is deprecated, using FileUtils::isDirectoryEmpty() instead");
937 return &FileUtils::isDirectoryEmpty(@_);
938}
939
940# Returns the given filename converted using either URL encoding or base64
941# encoding, as specified by $rename_method. If the given filename has no suffix
942# (if it is just the tailname), then $no_suffix should be some defined value.
943# rename_method can be url, none, base64
944sub rename_file {
945 my ($filename, $rename_method, $no_suffix) = @_;
946
947 if(!$filename) { # undefined or empty string
948 return $filename;
949 }
950
951 if (!$rename_method) {
952 print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
953 # Debugging information
954 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
955 # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
956 $rename_method = "url";
957 } elsif($rename_method eq "none") {
958 return $filename; # would have already been renamed
959 }
960
961 # No longer replace spaces with underscores, since underscores mess with incremental rebuild
962 ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
963 ###$filename =~ s/ /_/g;
964
965 my ($tailname,$dirname,$suffix);
966 if($no_suffix) { # given a tailname, no suffix
967 ($tailname,$dirname) = File::Basename::fileparse($filename);
968 }
969 else {
970 ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
971 }
972 if (!$suffix) {
973 $suffix = "";
974 }
975 # This breaks GLI matching extracted metadata to files in Enrich panel, as
976 # original is eg .JPG while gsdlsourcefilename ends up .jpg
977 # Not sure why it was done in first place...
978 #else {
979 #$suffix = lc($suffix);
980 #}
981
982 if ($rename_method eq "url") {
983 $tailname = &unicode::url_encode($tailname);
984 }
985 elsif ($rename_method eq "base64") {
986 $tailname = &unicode::base64_encode($tailname);
987 $tailname =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
988 }
989
990 $filename = "$tailname$suffix";
991 $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
992
993 return $filename;
994}
995
996
997# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
998sub rename_ldb_or_bdb_file {
999 my ($filename_no_ext) = @_;
1000
1001 my $new_filename = "$filename_no_ext.gdb";
1002 return if (-f $new_filename); # if the file has the right extension, don't need to do anything
1003 # try ldb
1004 my $old_filename = "$filename_no_ext.ldb";
1005
1006 if (-f $old_filename) {
1007 print STDERR "Renaming $old_filename to $new_filename\n";
1008 rename ($old_filename, $new_filename)
1009 || print STDERR "Rename failed: $!\n";
1010 return;
1011 }
1012 # try bdb
1013 $old_filename = "$filename_no_ext.bdb";
1014 if (-f $old_filename) {
1015 print STDERR "Renaming $old_filename to $new_filename\n";
1016 rename ($old_filename, $new_filename)
1017 || print STDERR "Rename failed: $!\n";
1018 return;
1019 }
1020}
1021
1022sub os_dir() {
1023
1024 my $gsdlarch = "";
1025 if(defined $ENV{'GSDLARCH'}) {
1026 $gsdlarch = $ENV{'GSDLARCH'};
1027 }
1028 return $ENV{'GSDLOS'}.$gsdlarch;
1029}
1030
1031# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1032# By default, /greenstone3 for GS3 or /greenstone for GS2.
1033sub get_greenstone_url_prefix() {
1034 # if already set on a previous occasion, just return that
1035 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1036 return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1037
1038 my ($configfile, $urlprefix, $defaultUrlprefix);
1039 my @propertynames = ();
1040
1041 if($ENV{'GSDL3SRCHOME'}) {
1042 $defaultUrlprefix = "/greenstone3";
1043 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1044 push(@propertynames, qw/path\s*\=/);
1045 } else {
1046 $defaultUrlprefix = "/greenstone";
1047 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1048 push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1049 }
1050
1051 $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1052
1053 if(!$urlprefix) { # no values found for URL prefix, use default values
1054 $urlprefix = $defaultUrlprefix;
1055 } else {
1056 #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1057 $urlprefix =~ s/^\///; # remove the starting slash
1058 my @dirs = split(/(\\|\/)/, $urlprefix);
1059 $urlprefix = shift(@dirs);
1060
1061 if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1062 $urlprefix = "/$urlprefix";
1063 }
1064 }
1065
1066 # set for the future
1067 $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1068# print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1069 return $urlprefix;
1070}
1071
1072
1073# Given a config file (xml or java properties file) and a list/array of regular expressions
1074# that represent property names to match on, this function will return the value for the 1st
1075# matching property name. If the return value is undefined, no matching property was found.
1076sub extract_propvalue_from_file() {
1077 my ($configfile, $propertynames) = @_;
1078
1079 my $value;
1080 unless(open(FIN, "<$configfile")) {
1081 print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1082 return $value; # not initialised
1083 }
1084
1085 # Read the entire file at once, as one single line, then close it
1086 my $filecontents;
1087 {
1088 local $/ = undef;
1089 $filecontents = <FIN>;
1090 }
1091 close(FIN);
1092
1093 foreach my $regex (@$propertynames) {
1094 ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1095 if($value) {
1096 $value =~ s/^\"//; # remove any startquotes
1097 $value =~ s/\".*$//; # remove the 1st endquotes (if any) followed by any xml
1098 last; # found value for a matching property, break from loop
1099 }
1100 }
1101
1102 return $value;
1103}
1104
1105# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1106# given that perllib is in @INC in order to invoke this subroutine.
1107# Call as follows -- after setting up INC to include perllib and
1108# after setting up GSDLHOME and GSDLOS:
1109#
1110# require util;
1111# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1112#
1113sub setup_greenstone_env() {
1114 my ($GSDLHOME, $GSDLOS) = @_;
1115
1116 #my %env_map = ();
1117 # Get the localised ENV settings of running a localised source setup.bash
1118 # and put it into the ENV here. Need to clear GSDLHOME before running setup
1119 #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1120 my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";
1121 if($GSDLOS =~ m/windows/i) {
1122 #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1123 $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1124 }
1125 if (!open(PIN, "$perl_command |")) {
1126 print STDERR ("Unable to execute command: $perl_command. $!\n");
1127 }
1128
1129 while (defined (my $perl_output_line = <PIN>)) {
1130 my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1131 #$env_map{$key}=$value;
1132 $ENV{$key}=$value;
1133 }
1134 close (PIN);
1135
1136 # If any keys in $ENV don't occur in Greenstone's localised env
1137 # (stored in $env_map), delete those entries from $ENV
1138 #foreach $key (keys %ENV) {
1139 # if(!defined $env_map{$key}) {
1140 # print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{'$key'}\n";
1141 # delete $ENV{$key}; # del $ENV(key, value) pair
1142 # }
1143 #}
1144 #undef %env_map;
1145}
1146
1147sub get_perl_exec() {
1148 my $perl_exec = $^X; # may return just "perl"
1149
1150 if($ENV{'PERLPATH'}) {
1151 # OR: # $perl_exec = &FileUtils::filenameConcatenate($ENV{'PERLPATH'},"perl");
1152 if($ENV{'GSDLOS'} =~ m/windows/) {
1153 $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1154 } else {
1155 $perl_exec = "$ENV{'PERLPATH'}/perl";
1156 }
1157 } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1158 # containing the full path to the current perl executable we're using
1159 $perl_exec = $Config{perlpath}; # configured path for perl
1160 if (!-e $perl_exec) { # may not point to location on this machine
1161 $perl_exec = $^X; # may return just "perl"
1162 if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1163 print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";
1164 }
1165 }
1166 }
1167
1168 return $perl_exec;
1169}
1170
1171# returns the path to the java command in the JRE included with GS (if any),
1172# quoted to safeguard any spaces in this path, otherwise a simple java
1173# command is returned which assumes and will try for a system java.
1174sub get_java_command {
1175 my $java = "java";
1176 if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1177 # after running setup.bat or from GLI which also runs setup.bat
1178 my $java_bin = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"packages","jre","bin");
1179 if(-d $java_bin) {
1180 $java = &FileUtils::filenameConcatenate($java_bin,"java");
1181 $java = "\"".$java."\""; # quoted to preserve spaces in path
1182 }
1183 }
1184 return $java;
1185}
1186
1187
1188# Given the qualified collection name (colgroup/collection),
1189# returns the collection and colgroup parts
1190sub get_collection_parts {
1191 # http://perldoc.perl.org/File/Basename.html
1192 # my($filename, $directories, $suffix) = fileparse($path);
1193 # "$directories contains everything up to and including the last directory separator in the $path
1194 # including the volume (if applicable). The remainder of the $path is the $filename."
1195 #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);
1196
1197 my $qualified_collection = shift(@_);
1198
1199 # Since activate.pl can be launched from the command-line, including by a user,
1200 # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1201 # Also allow for the accidental inclusion of multiple slashes
1202 my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1203
1204 if(!defined $collection) {
1205 $collection = $colgroup;
1206 $colgroup = "";
1207 }
1208 return ($collection, $colgroup);
1209}
1210
1211# work out the "collectdir/collection" location
1212sub resolve_collection_dir {
1213 my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1214
1215 my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);
1216
1217 if (!defined $collect_dir || !$collect_dir) { # if undefined or empty string
1218 $collect_dir = &util::get_working_collect_dir($site);
1219 }
1220
1221 return &FileUtils::filenameConcatenate($collect_dir,$colgroup,$collection);
1222}
1223
1224# work out the full path to "collect" of this greenstone 2/3 installation
1225sub get_working_collect_dir {
1226 my ($site) = @_;
1227
1228 if (defined $ENV{'GSDLCOLLECTDIR'}) { # a predefined collect dir exists
1229 return $ENV{'GSDLCOLLECTDIR'};
1230 }
1231
1232 elsif (defined $site && $site) { # site non-empty, so get default collect dir for GS3
1233 return &FileUtils::filenameConcatenate(&util::get_site_dir($site),"collect");
1234 }
1235
1236 else { # get default collect dir for GS2
1237 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
1238 }
1239}
1240
1241# returns path to current GS3 site.
1242# Returns "" if $site parameter is empty string, undefined or otherwise false
1243sub get_site_dir {
1244 my ($site) = @_;
1245
1246 if (defined $ENV{'GSDL3HOME'}) {
1247 return &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'},"sites",$site); # web folder
1248 }
1249 elsif (defined $ENV{'GSDL3SRCHOME'}) {
1250 return &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'},"web","sites",$site);
1251 }
1252 else { # site has no meaning, return "" (covers the case of this subroutine having been called for a GS2 situation)
1253 return "";
1254 }
1255}
1256
1257# returns path to GSDL3HOME if GS3, or GSDLHOME if GS2
1258sub get_gs_home {
1259 my ($site) = @_;
1260
1261 if (defined $site && $site) { # not undefined or empty string, so GS3 case
1262 if (defined $ENV{'GSDL3HOME'}) {
1263 return $ENV{'GSDL3HOME'};
1264 } else {
1265 return &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'},"web");
1266 }
1267 } else {
1268 return $ENV{'GSDLHOME'};
1269 }
1270}
1271
1272sub is_abs_path {
1273 my ($path) = @_;
1274
1275 return 1 if($path =~ m@^/@); # full paths begin with forward slash on linux/mac
1276 return 1 if($path =~ m@^([a-zA-Z]\:|\\)@); # full paths begin with drive letter colon for Win or \ for volume, http://stackoverflow.com/questions/13011013/get-only-volume-name-from-filepath
1277
1278 return 0;
1279
1280 # File::Spec loads the module appropriate for the OS, but abs paths in archiveinf-doc.xml may be generated on another OS to this one
1281 #return File::Spec->file_name_is_absolute( $path );
1282}
1283
1284# returns an array of the FULL PATHS to the specific collection given, to the general collect_dir,
1285# to the site directory if gs3 and to the greenstone home folder (GSDLHOME or GSDL3HOME).
1286# These elements are returned in this ORDER, in order to allow matching from longest to shortest path.
1287sub get_common_gs_paths {
1288
1289 my ($site, $qualified_collection_name) = @_;
1290
1291 my $collect_dir = &util::get_working_collect_dir($site);
1292 my $collection_path = "";
1293 if($qualified_collection_name ne "") {
1294 $collection_path = &util::resolve_collection_dir($collect_dir, $qualified_collection_name, $site);
1295 }
1296
1297 my $site_dir = &util::get_site_dir($site); # returns "" for GS2
1298 my $gs_home = &util::get_gs_home(); # GSDLHOME or GSDL3HOME folder
1299
1300 return ($collection_path, $collect_dir, $site_dir, $gs_home); # list in this order: from longest to shortest path
1301 #my @gs_paths = ($collection_path, $collect_dir, $site_dir, $gs_home); # list in this order: from longest to shortest path
1302 #return @gs_paths;
1303}
1304
1305# This subroutine is for improving portability of Greenstone collections from one OS to another,
1306# to be used to convert absolute paths going into db files into paths with placeholders instead.
1307# This sub works with util::get_common_gs_paths and takes a path to a greenstone file and, if it's
1308# an absolute path, then it will replace the longest matching greenstone-path prefix of the given
1309# path with a placeholder to match.
1310# The Greenstone-path prefixes that can be matched are the following common Greenstone paths:
1311# the path to the current (specific) collection, the path to the general GS collect directory,
1312# the path to the site directory if GS3, else the path to the GSDLHOME/GSDL3HOME folder.
1313# The longest matching prefix will be replaced with the equivalent placeholder:
1314# @THISCOLLECTPATH@, else @COLLECTHOME@, else @SITEHOME@, else @GSDLHOME@.
1315sub abspath_to_placeholders {
1316 my $path = shift(@_); # path to convert from absolute to one with placeholders
1317
1318 my ($collection_path, $collect_dir, $site_dir, $gs_home) = @_;
1319
1320 return $path unless is_abs_path($path); # path is relative
1321
1322 # now we know we're dealing with absolute paths and have to replace gs prefixes with placeholders
1323
1324 my @gs_paths = ($collection_path, $collect_dir, $site_dir, $gs_home); # list in this order: from longest to shortest path
1325
1326 my %placeholder_map = ($gs_home => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1327 $site_dir => '@SITEHOME@',
1328 $collect_dir => '@COLLECTHOME@',
1329 $collection_path => '@THISCOLLECTPATH@'
1330 );
1331
1332
1333 # The sequence of elements in @gs_paths matters
1334 # Need to loop starting from the *longest* matching path (the path to the specific collection)
1335 # to the shortest matching path (the path to gsdlhome/gsdl3home folder):
1336
1337 foreach my $gs_path (@gs_paths) {
1338 my $re_path = &util::filename_to_regex($gs_path); # escape for regex
1339
1340 next if($re_path eq ""); # site can be empty
1341
1342 if($path =~ m/^$re_path/) { # case sensitive or not for OS?
1343
1344 my $placeholder = $placeholder_map{$gs_path}; # get the placeholder to replace the matched path with
1345
1346 $path =~ s/^$re_path/$placeholder/; #case sensitive or not?
1347 #$path =~ s/^[\\\/]//; # remove gs_path's trailing separator left behind at the start of the path
1348 last; # done
1349 }
1350 }
1351
1352 return $path;
1353}
1354
1355# Function that does the reverse of the util::abspath_to_placeholders subroutine
1356# Once again, call this with the values returned from util::get_common_gs_paths
1357sub placeholders_to_abspath {
1358 my $path = shift(@_); # path that can contain placeholders to convert to resolved absolute path
1359
1360 my ($collection_path, $collect_dir, $site_dir, $gs_home) = @_;
1361
1362 return $path if($path !~ m/@/); # path contains no placeholders
1363
1364 # replace placeholders with gs prefixes
1365
1366 my @placeholders = ('@THISCOLLECTPATH@', '@COLLECTHOME@', '@SITEHOME@', '@GSDLHOME@'); # order of paths not crucial in this case,
1367 # but listed here from longest to shortest once placeholders are have been resolved
1368
1369 my %placeholder_to_gspath_map = ('@GSDLHOME@' => $gs_home, # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1370 '@SITEHOME@' => $site_dir,
1371 '@COLLECTHOME@' => $collect_dir,
1372 '@THISCOLLECTPATH@' => $collection_path
1373 );
1374
1375
1376 foreach my $placeholder (@placeholders) {
1377 my $gs_path = $placeholder_to_gspath_map{$placeholder};
1378
1379 next if($gs_path eq ""); # no value for sitehome for GS2
1380
1381 #my $sep = ($ENV{'GSDLOS'} eq "windows") ? "\\" : "/";
1382 #$gs_path = $gs_path . "$sep" if ($gs_path !~ m/$sep$/); # add in any trailing separator to the path prefix
1383
1384 if($path =~ m/^$placeholder/) {
1385 $path =~ s/^$placeholder/$gs_path/;
1386 last; # done
1387 }
1388 }
1389
1390 return $path;
1391}
1392
1393# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1394# a directory containing sequentially numbered images.
1395sub create_itemfile
1396{
1397 my ($output_dir, $convert_basename, $convert_to) = @_;
1398 my $page_num = "";
1399
1400 opendir(DIR, $output_dir) || die "can't opendir $output_dir: $!";
1401 my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1402 closedir DIR;
1403
1404 # Sort files in the directory by page_num
1405 sub page_number {
1406 my ($dir) = @_;
1407 my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
1408# my ($pagenum) =($dir =~ m/(\d+)(\.(jpg|gif|png))?$/i); # this works but is not as safe/strict about input filepatterns as the above
1409
1410 $pagenum = 1 unless defined $pagenum;
1411 return $pagenum;
1412 }
1413
1414 # sort the files in the directory in the order of page_num rather than lexically.
1415 @dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1416
1417 # work out if the numbering of the now sorted image files starts at 0 or not
1418 # by checking the number of the first _image_ file (skipping item files)
1419 my $starts_at_0 = 0;
1420 my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1421 if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1422 $starts_at_0 = 1;
1423 }
1424
1425 my $item_file = &FileUtils::filenameConcatenate($output_dir, $convert_basename.".item");
1426 my $item_fh;
1427 &FileUtils::openFileHandle($item_file, 'w', \$item_fh);
1428 print $item_fh "<PagedDocument>\n";
1429
1430 foreach my $file (@dir_files){
1431 if ($file !~ /\.item/i){
1432 $page_num = page_number($file);
1433 $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1434 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1435 }
1436 }
1437
1438 print $item_fh "</PagedDocument>\n";
1439 &FileUtils::closeFileHandle($item_file, \$item_fh);
1440 return $item_file;
1441}
1442
1443
1444## @function augmentINC()
1445#
1446# Prepend a path (if it exists) onto INC but only if it isn't already in INC
1447# @param $new_path The path to add
1448# @author jmt12
1449#
1450sub augmentINC
1451{
1452 my ($new_path) = @_;
1453 my $did_add_path = 0;
1454 # might need to be replaced with FileUtils::directoryExists() call eventually
1455 if (-d $new_path)
1456 {
1457 my $did_find_path = 0;
1458 foreach my $existing_path (@INC)
1459 {
1460 if ($existing_path eq $new_path)
1461 {
1462 $did_find_path = 1;
1463 last;
1464 }
1465 }
1466 if (!$did_find_path)
1467 {
1468 unshift(@INC, $new_path);
1469 $did_add_path = 1;
1470 }
1471 }
1472 return $did_add_path;
1473}
1474## augmentINC()
1475
1476
14771;
Note: See TracBrowser for help on using the repository browser.