source: main/trunk/greenstone2/perllib/util.pm@ 28211

Last change on this file since 28211 was 28211, checked in by ak19, 11 years ago

No more absolute paths in archiveinf-doc.gdb and archiveinf-src.gdb

  • Property svn:keywords set to Author Date Id Revision
File size: 47.2 KB
Line 
1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package util;
27
28use strict;
29use FileUtils;
30
31use Encode;
32use File::Copy;
33use File::Basename;
34# Config for getting the perlpath in the recommended way, though it uses paths that are
35# hard-coded into the Config file that's generated upon configuring and compiling perl.
36# $^X works better in some cases to return the path to perl used to launch the script,
37# but if launched with plain "perl" (no full-path), that will be just what it returns.
38use Config;
39# New module for file related utility functions - intended as a
40# placeholder for an extension that allows a variety of different
41# filesystems (FTP, HTTP, SAMBA, WEBDav, HDFS etc)
42use FileUtils;
43
44# removes files (but not directories)
45sub rm {
46 warnings::warnif("deprecated", "util::rm() is deprecated, using FileUtils::removeFiles() instead");
47 return &FileUtils::removeFiles(@_);
48}
49
50# recursive removal
51sub filtered_rm_r {
52 warnings::warnif("deprecated", "util::filtered_rm_r() is deprecated, using FileUtils::removeFilesFiltered() instead");
53 return &FileUtils::removeFilesFiltered(@_);
54}
55
56# recursive removal
57sub rm_r {
58 warnings::warnif("deprecated", "util::rm_r() is deprecated, using FileUtils::removeFilesRecursive() instead");
59 return &FileUtils::removeFilesRecursive(@_);
60}
61
62# moves a file or a group of files
63sub mv {
64 warnings::warnif("deprecated", "util::mv() is deprecated, using FileUtils::moveFiles() instead");
65 return &FileUtils::moveFiles(@_);
66}
67
68# Move the contents of source directory into target directory
69# (as opposed to merely replacing target dir with the src dir)
70# This can overwrite any files with duplicate names in the target
71# but other files and folders in the target will continue to exist
72sub mv_dir_contents {
73 warnings::warnif("deprecated", "util::mv_dir_contents() is deprecated, using FileUtils::moveDirectoryContents() instead");
74 return &FileUtils::moveDirectoryContents(@_);
75}
76
77# copies a file or a group of files
78sub cp {
79 warnings::warnif("deprecated", "util::cp() is deprecated, using FileUtils::copyFiles() instead");
80 return &FileUtils::copyFiles(@_);
81}
82
83# recursively copies a file or group of files
84# syntax: cp_r (sourcefiles, destination directory)
85# destination must be a directory - to copy one file to
86# another use cp instead
87sub cp_r {
88 warnings::warnif("deprecated", "util::cp_r() is deprecated, using FileUtils::copyFilesrecursive() instead");
89 return &FileUtils::copyFilesRecursive(@_);
90}
91
92# recursively copies a file or group of files
93# syntax: cp_r (sourcefiles, destination directory)
94# destination must be a directory - to copy one file to
95# another use cp instead
96sub cp_r_nosvn {
97 warnings::warnif("deprecated", "util::cp_r_nosvn() is deprecated, using FileUtils::copyFilesRecursiveNoSVN() instead");
98 return &FileUtils::copyFilesRecursiveNoSVN(@_);
99}
100
101# copies a directory and its contents, excluding subdirectories, into a new directory
102sub cp_r_toplevel {
103 warnings::warnif("deprecated", "util::cp_r_toplevel() is deprecated, using FileUtils::recursiveCopyTopLevel() instead");
104 return &FileUtils::recursiveCopyTopLevel(@_);
105}
106
107sub mk_dir {
108 warnings::warnif("deprecated", "util::mk_dir() is deprecated, using FileUtils::makeDirectory() instead");
109 return &FileUtils::makeDirectory(@_);
110}
111
112# in case anyone cares - I did some testing (using perls Benchmark module)
113# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
114# slightly faster (surprisingly) - Stefan.
115sub mk_all_dir {
116 warnings::warnif("deprecated", "util::mk_all_dir() is deprecated, using FileUtils::makeAllDirectories() instead");
117 return &FileUtils::makeAllDirectories(@_);
118}
119
120# make hard link to file if supported by OS, otherwise copy the file
121sub hard_link {
122 warnings::warnif("deprecated", "util::hard_link() is deprecated, using FileUtils::hardLink() instead");
123 return &FileUtils::hardLink(@_);
124}
125
126# make soft link to file if supported by OS, otherwise copy file
127sub soft_link {
128 warnings::warnif("deprecated", "util::soft_link() is deprecated, using FileUtils::softLink() instead");
129 return &FileUtils::softLink(@_);
130}
131
132# Primarily for filenames generated by processing
133# content of HTML files (which are mapped to UTF-8 internally)
134#
135# To turn this into an octet string that really exists on the file
136# system:
137# 1. don't need to do anything special for Unix-based systems
138# (as underlying file system is byte-code)
139# 2. need to map to short DOS filenames for Windows
140
141sub utf8_to_real_filename
142{
143 my ($utf8_filename) = @_;
144
145 my $real_filename;
146
147 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
148 require Win32;
149
150 print STDERR "***** utf8 filename = $utf8_filename\n\n\n";
151
152 my $unicode_filename = decode("utf8",$utf8_filename);
153 $real_filename = Win32::GetShortPathName($unicode_filename);
154 }
155 else {
156 $real_filename = $utf8_filename;
157 }
158
159 return $real_filename;
160}
161
162sub fd_exists {
163 warnings::warnif("deprecated", "util::fd_exists() is deprecated, using FileUtils::fileTest() instead");
164 return &FileUtils::fileTest(@_);
165}
166
167sub file_exists {
168 warnings::warnif("deprecated", "util::file_exists() is deprecated, using FileUtils::fileExists() instead");
169 return &FileUtils::fileExists(@_);
170}
171
172sub dir_exists {
173 warnings::warnif("deprecated", "util::dir_exists() is deprecated, using FileUtils::directoryExists() instead");
174 return &FileUtils::directoryExists(@_);
175}
176
177# updates a copy of a directory in some other part of the filesystem
178# verbosity settings are: 0=low, 1=normal, 2=high
179# both $fromdir and $todir should be absolute paths
180sub cachedir {
181 warnings::warnif("deprecated", "util::cachedir() is deprecated, using FileUtils::synchronizeDirectories() instead");
182 return &FileUtils::synchronizeDirectories(@_);
183}
184
185# this function returns -1 if either file is not found
186# assumes that $file1 and $file2 are absolute file names or
187# in the current directory
188# $file2 is allowed to be newer than $file1
189sub differentfiles {
190 warnings::warnif("deprecated", "util::differentfiles() is deprecated, using FileUtils::differentFiles() instead");
191 return &FileUtils::differentFiles(@_);
192}
193
194
195sub get_tmp_filename
196{
197 my $file_ext = shift(@_) || undef;
198
199 my $opt_dot_file_ext = "";
200 if (defined $file_ext) {
201 if ($file_ext !~ m/\./) {
202 # no dot, so needs one added in at start
203 $opt_dot_file_ext = ".$file_ext"
204 }
205 else {
206 # allow for "extensions" such as _metadata.txt to be handled
207 # gracefully
208 $opt_dot_file_ext = $file_ext;
209 }
210 }
211
212 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
213 &FileUtils::makeAllDirectories ($tmpdir) unless -e $tmpdir;
214
215 my $count = 1000;
216 my $rand = int(rand $count);
217 my $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
218
219 while (-e $full_tmp_filename) {
220 $rand = int(rand $count);
221 $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
222 $count++;
223 }
224
225 return $full_tmp_filename;
226}
227
228# These 2 are "static" variables used by the get_timestamped_tmp_folder() subroutine below and
229# belong with that function. They help ensure the timestamped tmp folders generated are unique.
230my $previous_timestamp = undef;
231my $previous_timestamp_f = 0; # frequency
232
233sub get_timestamped_tmp_folder
234{
235
236 my $tmp_dirname;
237 if(defined $ENV{'GSDLCOLLECTDIR'}) {
238 $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
239 } elsif(defined $ENV{'GSDLHOME'}) {
240 $tmp_dirname = $ENV{'GSDLHOME'};
241 } else {
242 return undef;
243 }
244
245 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "tmp");
246 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
247
248 # add the timestamp into the path otherwise we can run into problems
249 # if documents have the same name
250 my $timestamp = time;
251
252 if (!defined $previous_timestamp || ($timestamp > $previous_timestamp)) {
253 $previous_timestamp_f = 0;
254 $previous_timestamp = $timestamp;
255 } else {
256 $previous_timestamp_f++;
257 }
258
259 my $time_tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, $timestamp);
260 $tmp_dirname = $time_tmp_dirname;
261 my $i = $previous_timestamp_f;
262
263 if($previous_timestamp_f > 0) {
264 $tmp_dirname = $time_tmp_dirname."_".$i;
265 $i++;
266 }
267 while (-e $tmp_dirname) {
268 $tmp_dirname = $time_tmp_dirname."_".$i;
269 $i++;
270 }
271 &FileUtils::makeDirectory($tmp_dirname);
272
273 return $tmp_dirname;
274}
275
276sub get_timestamped_tmp_filename_in_collection
277{
278
279 my ($input_filename, $output_ext) = @_;
280 # derive tmp filename from input filename
281 my ($tailname, $dirname, $suffix)
282 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
283
284 # softlink to collection tmp dir
285 my $tmp_dirname = &util::get_timestamped_tmp_folder();
286 $tmp_dirname = $dirname unless defined $tmp_dirname;
287
288 # following two steps copied from ConvertBinaryFile
289 # do we need them?? can't use them as is, as they use plugin methods.
290
291 #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
292
293 # URLEncode this since htmls with images where the html filename is utf8 don't seem
294 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
295 # files on the filesystem.
296 #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
297 if (defined $output_ext) {
298 $output_ext = ".$output_ext"; # add the dot
299 } else {
300 $output_ext = $suffix;
301 }
302 $output_ext= lc($output_ext);
303 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$output_ext");
304
305 return $tmp_filename;
306}
307
308sub get_toplevel_tmp_dir
309{
310 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
311}
312
313
314sub filename_to_regex {
315 my $filename = shift (@_);
316
317 # need to make single backslashes double so that regex works
318 $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);
319
320 # note that the first part of a substitution is a regex, so RE chars need to be escaped,
321 # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
322 $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
323 $filename =~ s@\(@\\(@g; # escape brackets
324 $filename =~ s@\)@\\)@g; # escape brackets
325 $filename =~ s@\[@\\[@g; # escape brackets
326 $filename =~ s@\]@\\]@g; # escape brackets
327
328 return $filename;
329}
330
331sub unregex_filename {
332 my $filename = shift (@_);
333
334 # need to put doubled backslashes for regex back to single
335 $filename =~ s/\\\./\./g; # remove RE syntax for .
336 $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
337 $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
338 $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
339 $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
340
341 # \\ goes to \
342 # This is the last step in reverse mirroring the order of steps in filename_to_regex()
343 $filename =~ s/\\\\/\\/g; # remove RE syntax for \
344 return $filename;
345}
346
347sub filename_cat {
348 # I've disabled this warning for now, as every Greenstone perl
349 # script seems to make use of this function and so you drown in a
350 # sea of deprecated warnings [jmt12]
351# warnings::warnif("deprecated", "util::filename_cat() is deprecated, using FileUtils::filenameConcatenate() instead");
352 return &FileUtils::filenameConcatenate(@_);
353}
354
355
356sub pathname_cat {
357 my $first_path = shift(@_);
358 my (@pathnames) = @_;
359
360 # If first_path is not null or empty, then add it back into the list
361 if (defined $first_path && $first_path =~ /\S/) {
362 unshift(@pathnames, $first_path);
363 }
364
365 my $join_char;
366 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
367 $join_char = ";";
368 } else {
369 $join_char = ":";
370 }
371
372 my $pathname = join($join_char, @pathnames);
373
374 # remove duplicate slashes
375 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
376 $pathname =~ s/[\\\/]+/\\/g;
377 } else {
378 $pathname =~ s/[\/]+/\//g;
379 # DB: want a pathname abc\de.html to remain like this
380 }
381
382 return $pathname;
383}
384
385
386sub tidy_up_oid {
387 my ($OID) = @_;
388 if ($OID =~ /\./) {
389 print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
390 $OID =~ s/\.//g; #remove any periods
391 }
392 if ($OID =~ /^\s.*\s$/) {
393 print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
394 # remove starting and trailing whitespace
395 $OID =~ s/^\s+//;
396 $OID =~ s/\s+$//;
397 }
398 if ($OID =~ /^[\d]*$/) {
399 print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
400 $OID = "D" . $OID;
401 }
402
403 return $OID;
404}
405
406sub envvar_prepend {
407 my ($var,$val) = @_;
408
409 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
410 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
411
412 # do not prepend any value/path that's already in the environment variable
413
414 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
415 if (!defined($ENV{$var})) {
416 $ENV{$var} = "$val";
417 }
418 elsif($ENV{$var} !~ m/$escaped_val/) {
419 $ENV{$var} = "$val".$pathsep.$ENV{$var};
420 }
421}
422
423sub envvar_append {
424 my ($var,$val) = @_;
425
426 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
427 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
428
429 # do not append any value/path that's already in the environment variable
430
431 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
432 if (!defined($ENV{$var})) {
433 $ENV{$var} = "$val";
434 }
435 elsif($ENV{$var} !~ m/$escaped_val/) {
436 $ENV{$var} = $ENV{$var}.$pathsep."$val";
437 }
438}
439
440
441# splits a filename into a prefix and a tail extension using the tail_re, or
442# if that fails, splits on the file_extension . (dot)
443sub get_prefix_and_tail_by_regex {
444
445 my ($filename,$tail_re) = @_;
446
447 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
448 if ((!defined $file_prefix) || (!defined $file_ext)) {
449 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
450 }
451
452 return ($file_prefix,$file_ext);
453}
454
455# get full path and file only path from a base_dir (which may be empty) and
456# file (which may contain directories)
457sub get_full_filenames {
458 my ($base_dir, $file) = @_;
459
460 my $filename_full_path = $file;
461 # add on directory if present
462 $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\S/;
463
464 my $filename_no_path = $file;
465
466 # remove directory if present
467 $filename_no_path =~ s/^.*[\/\\]//;
468 return ($filename_full_path, $filename_no_path);
469}
470
471# returns the path of a file without the filename -- ie. the directory the file is in
472sub filename_head {
473 my $filename = shift(@_);
474
475 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
476 $filename =~ s/[^\\\\]*$//;
477 }
478 else {
479 $filename =~ s/[^\\\/]*$//;
480 }
481
482 return $filename;
483}
484
485
486
487# returns 1 if filename1 and filename2 point to the same
488# file or directory
489sub filenames_equal {
490 my ($filename1, $filename2) = @_;
491
492 # use filename_cat to clean up trailing slashes and
493 # multiple slashes
494 $filename1 = &FileUtils::filenameConcatenate($filename1);
495 $filename2 = &FileUtils::filenameConcatenate($filename2);
496
497 # filenames not case sensitive on windows
498 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
499 $filename1 =~ tr/[A-Z]/[a-z]/;
500 $filename2 =~ tr/[A-Z]/[a-z]/;
501 }
502 return 1 if $filename1 eq $filename2;
503 return 0;
504}
505
506# If filename is relative to within_dir, returns the relative path of filename to that directory
507# with slashes in the filename returned as they were in the original (absolute) filename.
508sub filename_within_directory
509{
510 my ($filename,$within_dir) = @_;
511
512 if ($within_dir !~ m/[\/\\]$/) {
513 my $dirsep = &util::get_dirsep();
514 $within_dir .= $dirsep;
515 }
516
517 $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets
518 if ($filename =~ m/^$within_dir(.*)$/) {
519 $filename = $1;
520 }
521
522 return $filename;
523}
524
525# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
526# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
527# The subpath returned will also be a URL type filename.
528sub filename_within_directory_url_format
529{
530 my ($filename,$within_dir) = @_;
531
532 # convert parameters only to / slashes if Windows
533
534 my $filename_urlformat = &filepath_to_url_format($filename);
535 my $within_dir_urlformat = &filepath_to_url_format($within_dir);
536
537 #if ($within_dir_urlformat !~ m/\/$/) {
538 # make sure directory ends with a slash
539 #$within_dir_urlformat .= "/";
540 #}
541
542 my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
543
544 #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
545
546 # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
547 if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
548 $filename_urlformat = $1;
549 }
550
551 return $filename_urlformat;
552}
553
554# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
555# since on Linux it doesn't represent a file separator but an escape char).
556sub filepath_to_url_format
557{
558 my ($filepath) = @_;
559 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
560 # Only need to worry about Windows, as Unix style directories already in url-format
561 # Convert Windows style \ => /
562 $filepath =~ s@\\@/@g;
563 }
564 return $filepath;
565}
566
567# regex filepaths on windows may include \\ as path separator. Convert \\ to /
568sub filepath_regex_to_url_format
569{
570 my ($filepath) = @_;
571 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
572 # Only need to worry about Windows, as Unix style directories already in url-format
573 # Convert Windows style \\ => /
574 $filepath =~ s@\\\\@/@g;
575 }
576 return $filepath;
577
578}
579
580# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
581# and ignores trailing /
582# returns (file, dirs) dirs will be empty if no subdirs
583sub url_fileparse
584{
585 my ($filepath) = @_;
586 # remove trailing /
587 $filepath =~ s@/$@@;
588 if ($filepath !~ m@/@) {
589 return ($filepath, "");
590 }
591 my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
592 return ($file, $dirs);
593
594}
595
596
597sub filename_within_collection
598{
599 my ($filename) = @_;
600
601 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
602
603 if (defined $collect_dir) {
604
605 # if from within GSDLCOLLECTDIR, then remove directory prefix
606 # so source_filename is realative to it. This is done to aid
607 # portability, i.e. the collection can be moved to somewhere
608 # else on the file system and the archives directory will still
609 # work. This is needed, for example in the applet version of
610 # GLI where GSDLHOME/collect on the server will be different to
611 # the collect directory of the remove user. Of course,
612 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
613 # it back into a full pathname.
614
615 $filename = filename_within_directory($filename,$collect_dir);
616 }
617
618 return $filename;
619}
620
621sub prettyprint_file
622{
623 my ($base_dir,$file,$gli) = @_;
624
625 my $filename_full_path = &FileUtils::filenameConcatenate($base_dir,$file);
626
627 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
628 require Win32;
629
630 # For some reason base_dir in the form c:/a/b/c
631 # This leads to confusion later on, so turn it back into
632 # the more usual Windows form
633 $base_dir =~ s/\//\\/g;
634 my $long_base_dir = Win32::GetLongPathName($base_dir);
635 my $long_full_path = Win32::GetLongPathName($filename_full_path);
636
637 $file = filename_within_directory($long_full_path,$long_base_dir);
638 $file = encode("utf8",$file) if ($gli);
639 }
640
641 return $file;
642}
643
644
645sub upgrade_if_dos_filename
646{
647 my ($filename_full_path,$and_encode) = @_;
648
649 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
650 # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
651 # to its long (Windows) version
652 my $long_filename = Win32::GetLongPathName($filename_full_path);
653 if (defined $long_filename) {
654 $filename_full_path = $long_filename;
655 }
656 # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
657 $filename_full_path =~ s/^(.):/\u$1:/;
658 if ((defined $and_encode) && ($and_encode)) {
659 $filename_full_path = encode("utf8",$filename_full_path);
660 }
661 }
662
663 return $filename_full_path;
664}
665
666
667sub downgrade_if_dos_filename
668{
669 my ($filename_full_path) = @_;
670
671 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
672 require Win32;
673
674 # Ensure the given long Windows filename is in a form that can
675 # be opened by Perl => convert it to a short DOS-like filename
676
677 my $short_filename = Win32::GetShortPathName($filename_full_path);
678 if (defined $short_filename) {
679 $filename_full_path = $short_filename;
680 }
681 # Make sure initial drive letter is lower-case (to fit in
682 # with rest of Greenstone)
683 $filename_full_path =~ s/^(.):/\u$1:/;
684 }
685
686 return $filename_full_path;
687}
688
689sub block_filename
690{
691 my ($block_hash,$filename) = @_;
692
693 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
694
695 # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
696 my $lower_filename = lc($filename);
697 $block_hash->{'file_blocks'}->{$lower_filename} = 1;
698# my $lower_drive = $filename;
699# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
700
701# my $upper_drive = $filename;
702# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
703#
704# $block_hash->{'file_blocks'}->{$lower_drive} = 1;
705# $block_hash->{'file_blocks'}->{$upper_drive} = 1;
706 }
707 else {
708 $block_hash->{'file_blocks'}->{$filename} = 1;
709 }
710}
711
712
713sub filename_is_absolute
714{
715 warnings::warnif("deprecated", "util::filename_is_absolute() is deprecated, using FileUtils::isFilenameAbsolute() instead");
716 return &FileUtils::isFilenameAbsolute(@_);
717}
718
719
720## @method make_absolute()
721#
722# Ensure the given file path is absolute in respect to the given base path.
723#
724# @param $base_dir A string denoting the base path the given dir must be
725# absolute to.
726# @param $dir The directory to be made absolute as a string. Note that the
727# dir may already be absolute, in which case it will remain
728# unchanged.
729# @return The now absolute form of the directory as a string.
730#
731# @author John Thompson, DL Consulting Ltd.
732# @copy 2006 DL Consulting Ltd.
733#
734#used in buildcol.pl, doesn't work for all cases --kjdon
735sub make_absolute {
736
737 my ($base_dir, $dir) = @_;
738### print STDERR "dir = $dir\n";
739 $dir =~ s/[\\\/]+/\//g;
740 $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
741 $dir =~ s|^/tmp_mnt||;
742 1 while($dir =~ s|/[^/]*/\.\./|/|g);
743 $dir =~ s|/[.][.]?/|/|g;
744 $dir =~ tr|/|/|s;
745### print STDERR "dir = $dir\n";
746
747 return $dir;
748}
749## make_absolute() ##
750
751sub get_dirsep {
752
753 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
754 return "\\";
755 } else {
756 return "\/";
757 }
758}
759
760sub get_os_dirsep {
761
762 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
763 return "\\\\";
764 } else {
765 return "\\\/";
766 }
767}
768
769sub get_re_dirsep {
770
771 return "\\\\|\\\/";
772}
773
774
775sub get_dirsep_tail {
776 my ($filename) = @_;
777
778 # returns last part of directory or filename
779 # On unix e.g. a/b.d => b.d
780 # a/b/c => c
781
782 my $dirsep = get_re_dirsep();
783 my @dirs = split (/$dirsep/, $filename);
784 my $tail = pop @dirs;
785
786 # - caused problems under windows
787 #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
788
789 return $tail;
790}
791
792
793# if this is running on windows we want binaries to end in
794# .exe, otherwise they don't have to end in any extension
795sub get_os_exe {
796 return ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
797 return "";
798}
799
800
801# test to see whether this is a big or little endian machine
802sub is_little_endian
803{
804 # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
805 # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
806 # Otherwise, it's little endian
807
808 #return 0 if $^O =~ /^darwin$/i;
809 #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
810
811 # Going back to stating exactly whether the machine is little endian
812 # or big endian, without any special case for Macs. Since for rata it comes
813 # back with little endian and for shuttle with bigendian.
814 return (ord(substr(pack("s",1), 0, 1)) == 1);
815}
816
817
818# will return the collection name if successful, "" otherwise
819sub use_collection {
820 my ($collection, $collectdir) = @_;
821
822 if (!defined $collectdir || $collectdir eq "") {
823 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect");
824 }
825
826 if (!defined $ENV{'GREENSTONEHOME'}) { # for GS3, would have been defined in use_site_collection, to GSDL3HOME
827 $ENV{'GREENSTONEHOME'} = $ENV{'GSDLHOME'};
828 }
829
830 # get and check the collection
831 if (!defined($collection) || $collection eq "") {
832 if (defined $ENV{'GSDLCOLLECTION'}) {
833 $collection = $ENV{'GSDLCOLLECTION'};
834 } else {
835 print STDOUT "No collection specified\n";
836 return "";
837 }
838 }
839
840 if ($collection eq "modelcol") {
841 print STDOUT "You can't use modelcol.\n";
842 return "";
843 }
844
845 # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
846 # are defined
847 $ENV{'GSDLCOLLECTION'} = $collection;
848 $ENV{'GSDLCOLLECTHOME'} = $collectdir;
849 $ENV{'GSDLCOLLECTDIR'} = &FileUtils::filenameConcatenate($collectdir, $collection);
850
851 # make sure this collection exists
852 if (!-e $ENV{'GSDLCOLLECTDIR'}) {
853 print STDOUT "Invalid collection ($collection).\n";
854 return "";
855 }
856
857 # everything is ready to go
858 return $collection;
859}
860
861sub get_current_collection_name {
862 return $ENV{'GSDLCOLLECTION'};
863}
864
865
866# will return the collection name if successful, "" otherwise.
867# Like use_collection (above) but for greenstone 3 (taking account of site level)
868
869sub use_site_collection {
870 my ($site, $collection, $collectdir) = @_;
871
872 if (!defined $collectdir || $collectdir eq "") {
873 die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
874 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, "sites", $site, "collect");
875 }
876
877 if (defined $ENV{'GSDL3HOME'}) {
878 $ENV{'GREENSTONEHOME'} = $ENV{'GSDL3HOME'};
879 $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
880 } elsif (defined $ENV{'GSDL3SRCHOME'}) {
881 $ENV{'GREENSTONEHOME'} = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "web");
882 $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
883 } else {
884 print STDERR "*** util::use_site_collection(). Warning: Neither GSDL3HOME nor GSDL3SRCHOME set.\n";
885 }
886
887 # collectdir explicitly set by this point (using $site variable if required).
888 # Can call "old" gsdl2 use_collection now.
889
890 return use_collection($collection,$collectdir);
891}
892
893
894
895sub locate_config_file
896{
897 my ($file) = @_;
898
899 my $locations = locate_config_files($file);
900
901 return shift @$locations; # returns undef if 'locations' is empty
902}
903
904
905sub locate_config_files
906{
907 my ($file) = @_;
908
909 my @locations = ();
910
911 if (-e $file) {
912 # Clearly specified (most likely full filename)
913 # No need to hunt in 'etc' directories, return value unchanged
914 push(@locations,$file);
915 }
916 else {
917 # Check for collection specific one before looking in global GSDL 'etc'
918 if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
919 my $test_collect_etc_filename
920 = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"etc", $file);
921
922 if (-e $test_collect_etc_filename) {
923 push(@locations,$test_collect_etc_filename);
924 }
925 }
926 my $test_main_etc_filename
927 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"etc", $file);
928 if (-e $test_main_etc_filename) {
929 push(@locations,$test_main_etc_filename);
930 }
931 }
932
933 return \@locations;
934}
935
936
937sub hyperlink_text
938{
939 my ($text) = @_;
940
941 $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
942 $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
943
944 return $text;
945}
946
947
948# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
949# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
950sub is_dir_empty {
951 warnings::warnif("deprecated", "util::is_dir_empty() is deprecated, using FileUtils::isDirectoryEmpty() instead");
952 return &FileUtils::isDirectoryEmpty(@_);
953}
954
955# Returns the given filename converted using either URL encoding or base64
956# encoding, as specified by $rename_method. If the given filename has no suffix
957# (if it is just the tailname), then $no_suffix should be some defined value.
958# rename_method can be url, none, base64
959sub rename_file {
960 my ($filename, $rename_method, $no_suffix) = @_;
961
962 if(!$filename) { # undefined or empty string
963 return $filename;
964 }
965
966 if (!$rename_method) {
967 print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
968 # Debugging information
969 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
970 # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
971 $rename_method = "url";
972 } elsif($rename_method eq "none") {
973 return $filename; # would have already been renamed
974 }
975
976 # No longer replace spaces with underscores, since underscores mess with incremental rebuild
977 ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
978 ###$filename =~ s/ /_/g;
979
980 my ($tailname,$dirname,$suffix);
981 if($no_suffix) { # given a tailname, no suffix
982 ($tailname,$dirname) = File::Basename::fileparse($filename);
983 }
984 else {
985 ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
986 }
987 if (!$suffix) {
988 $suffix = "";
989 }
990 # This breaks GLI matching extracted metadata to files in Enrich panel, as
991 # original is eg .JPG while gsdlsourcefilename ends up .jpg
992 # Not sure why it was done in first place...
993 #else {
994 #$suffix = lc($suffix);
995 #}
996
997 if ($rename_method eq "url") {
998 $tailname = &unicode::url_encode($tailname);
999 }
1000 elsif ($rename_method eq "base64") {
1001 $tailname = &unicode::base64_encode($tailname);
1002 $tailname =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
1003 }
1004
1005 $filename = "$tailname$suffix";
1006 $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
1007
1008 return $filename;
1009}
1010
1011
1012# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
1013sub rename_ldb_or_bdb_file {
1014 my ($filename_no_ext) = @_;
1015
1016 my $new_filename = "$filename_no_ext.gdb";
1017 return if (-f $new_filename); # if the file has the right extension, don't need to do anything
1018 # try ldb
1019 my $old_filename = "$filename_no_ext.ldb";
1020
1021 if (-f $old_filename) {
1022 print STDERR "Renaming $old_filename to $new_filename\n";
1023 rename ($old_filename, $new_filename)
1024 || print STDERR "Rename failed: $!\n";
1025 return;
1026 }
1027 # try bdb
1028 $old_filename = "$filename_no_ext.bdb";
1029 if (-f $old_filename) {
1030 print STDERR "Renaming $old_filename to $new_filename\n";
1031 rename ($old_filename, $new_filename)
1032 || print STDERR "Rename failed: $!\n";
1033 return;
1034 }
1035}
1036
1037sub os_dir() {
1038
1039 my $gsdlarch = "";
1040 if(defined $ENV{'GSDLARCH'}) {
1041 $gsdlarch = $ENV{'GSDLARCH'};
1042 }
1043 return $ENV{'GSDLOS'}.$gsdlarch;
1044}
1045
1046# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1047# By default, /greenstone3 for GS3 or /greenstone for GS2.
1048sub get_greenstone_url_prefix() {
1049 # if already set on a previous occasion, just return that
1050 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1051 return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1052
1053 my ($configfile, $urlprefix, $defaultUrlprefix);
1054 my @propertynames = ();
1055
1056 if($ENV{'GSDL3SRCHOME'}) {
1057 $defaultUrlprefix = "/greenstone3";
1058 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1059 push(@propertynames, qw/path\s*\=/);
1060 } else {
1061 $defaultUrlprefix = "/greenstone";
1062 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1063 push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1064 }
1065
1066 $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1067
1068 if(!$urlprefix) { # no values found for URL prefix, use default values
1069 $urlprefix = $defaultUrlprefix;
1070 } else {
1071 #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1072 $urlprefix =~ s/^\///; # remove the starting slash
1073 my @dirs = split(/(\\|\/)/, $urlprefix);
1074 $urlprefix = shift(@dirs);
1075
1076 if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1077 $urlprefix = "/$urlprefix";
1078 }
1079 }
1080
1081 # set for the future
1082 $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1083# print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1084 return $urlprefix;
1085}
1086
1087
1088# Given a config file (xml or java properties file) and a list/array of regular expressions
1089# that represent property names to match on, this function will return the value for the 1st
1090# matching property name. If the return value is undefined, no matching property was found.
1091sub extract_propvalue_from_file() {
1092 my ($configfile, $propertynames) = @_;
1093
1094 my $value;
1095 unless(open(FIN, "<$configfile")) {
1096 print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1097 return $value; # not initialised
1098 }
1099
1100 # Read the entire file at once, as one single line, then close it
1101 my $filecontents;
1102 {
1103 local $/ = undef;
1104 $filecontents = <FIN>;
1105 }
1106 close(FIN);
1107
1108 foreach my $regex (@$propertynames) {
1109 ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1110 if($value) {
1111 $value =~ s/^\"//; # remove any startquotes
1112 $value =~ s/\".*$//; # remove the 1st endquotes (if any) followed by any xml
1113 last; # found value for a matching property, break from loop
1114 }
1115 }
1116
1117 return $value;
1118}
1119
1120# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1121# given that perllib is in @INC in order to invoke this subroutine.
1122# Call as follows -- after setting up INC to include perllib and
1123# after setting up GSDLHOME and GSDLOS:
1124#
1125# require util;
1126# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1127#
1128sub setup_greenstone_env() {
1129 my ($GSDLHOME, $GSDLOS) = @_;
1130
1131 #my %env_map = ();
1132 # Get the localised ENV settings of running a localised source setup.bash
1133 # and put it into the ENV here. Need to clear GSDLHOME before running setup
1134 #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1135 my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";
1136 if($GSDLOS =~ m/windows/i) {
1137 #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1138 $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1139 }
1140 if (!open(PIN, "$perl_command |")) {
1141 print STDERR ("Unable to execute command: $perl_command. $!\n");
1142 }
1143
1144 while (defined (my $perl_output_line = <PIN>)) {
1145 my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1146 #$env_map{$key}=$value;
1147 $ENV{$key}=$value;
1148 }
1149 close (PIN);
1150
1151 # If any keys in $ENV don't occur in Greenstone's localised env
1152 # (stored in $env_map), delete those entries from $ENV
1153 #foreach $key (keys %ENV) {
1154 # if(!defined $env_map{$key}) {
1155 # print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{'$key'}\n";
1156 # delete $ENV{$key}; # del $ENV(key, value) pair
1157 # }
1158 #}
1159 #undef %env_map;
1160}
1161
1162sub get_perl_exec() {
1163 my $perl_exec = $^X; # may return just "perl"
1164
1165 if($ENV{'PERLPATH'}) {
1166 # OR: # $perl_exec = &FileUtils::filenameConcatenate($ENV{'PERLPATH'},"perl");
1167 if($ENV{'GSDLOS'} =~ m/windows/) {
1168 $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1169 } else {
1170 $perl_exec = "$ENV{'PERLPATH'}/perl";
1171 }
1172 } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1173 # containing the full path to the current perl executable we're using
1174 $perl_exec = $Config{perlpath}; # configured path for perl
1175 if (!-e $perl_exec) { # may not point to location on this machine
1176 $perl_exec = $^X; # may return just "perl"
1177 if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1178 print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";
1179 }
1180 }
1181 }
1182
1183 return $perl_exec;
1184}
1185
1186# returns the path to the java command in the JRE included with GS (if any),
1187# quoted to safeguard any spaces in this path, otherwise a simple java
1188# command is returned which assumes and will try for a system java.
1189sub get_java_command {
1190 my $java = "java";
1191 if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1192 # after running setup.bat or from GLI which also runs setup.bat
1193 my $java_bin = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"packages","jre","bin");
1194 if(-d $java_bin) {
1195 $java = &FileUtils::filenameConcatenate($java_bin,"java");
1196 $java = "\"".$java."\""; # quoted to preserve spaces in path
1197 }
1198 }
1199 return $java;
1200}
1201
1202
1203# Given the qualified collection name (colgroup/collection),
1204# returns the collection and colgroup parts
1205sub get_collection_parts {
1206 # http://perldoc.perl.org/File/Basename.html
1207 # my($filename, $directories, $suffix) = fileparse($path);
1208 # "$directories contains everything up to and including the last directory separator in the $path
1209 # including the volume (if applicable). The remainder of the $path is the $filename."
1210 #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);
1211
1212 my $qualified_collection = shift(@_);
1213
1214 # Since activate.pl can be launched from the command-line, including by a user,
1215 # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1216 # Also allow for the accidental inclusion of multiple slashes
1217 my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1218
1219 if(!defined $collection) {
1220 $collection = $colgroup;
1221 $colgroup = "";
1222 }
1223 return ($collection, $colgroup);
1224}
1225
1226# work out the "collectdir/collection" location
1227sub resolve_collection_dir {
1228 my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1229
1230 if (defined $ENV{'GSDLCOLLECTDIR'}) { # a predefined collection dir exists
1231 return $ENV{'GSDLCOLLECTDIR'};
1232 }
1233
1234 my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);
1235
1236 if (!defined $collect_dir || !$collect_dir) { # if undefined or empty string
1237 $collect_dir = &util::get_working_collect_dir($site);
1238 }
1239
1240 return &FileUtils::filenameConcatenate($collect_dir,$colgroup,$collection);
1241}
1242
1243# work out the full path to "collect" of this greenstone 2/3 installation
1244sub get_working_collect_dir {
1245 my ($site) = @_;
1246
1247 if (defined $ENV{'GSDLCOLLECTHOME'}) { # a predefined collect dir exists
1248 return $ENV{'GSDLCOLLECTHOME'};
1249 }
1250
1251 if (defined $site && $site) { # site non-empty, so get default collect dir for GS3
1252
1253 if (defined $ENV{'GSDL3HOME'}) {
1254 return &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'},"sites",$site,"collect"); # web folder
1255 }
1256 elsif (defined $ENV{'GSDL3SRCHOME'}) {
1257 return &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'},"web","sites",$site,"collect");
1258 }
1259 }
1260
1261 elsif (defined $ENV{'SITEHOME'}) { #&FileUtils::filenameConcatenate(&util::get_site_dir($site),"collect");
1262 return &FileUtils::filenameConcatenate($ENV{'SITEHOME'},"collect");
1263 }
1264
1265 else { # get default collect dir for GS2
1266 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
1267 }
1268}
1269
1270sub is_abs_path_this_os {
1271 my ($path) = @_;
1272
1273 # File::Spec loads the module appropriate for the OS
1274 return &FileUtils::isFilenameAbsolute(@_); #return File::Spec->file_name_is_absolute( $path );
1275}
1276
1277
1278sub is_abs_path_any_os {
1279 my ($path) = @_;
1280
1281 # We can have filenames in our DBs that were produced on other OS, so this method exists
1282 # to help identify absolute paths in such cases.
1283
1284 return 1 if($path =~ m@^/@); # full paths begin with forward slash on linux/mac
1285 return 1 if($path =~ m@^([a-zA-Z]\:|\\)@); # full paths begin with drive letter colon for Win or \ for volume, http://stackoverflow.com/questions/13011013/get-only-volume-name-from-filepath
1286
1287 return 0;
1288}
1289
1290
1291# This subroutine is for improving portability of Greenstone collections from one OS to another,
1292# to be used to convert absolute paths going into db files into paths with placeholders instead.
1293# This sub works with util::get_common_gs_paths and takes a path to a greenstone file and, if it's
1294# an absolute path, then it will replace the longest matching greenstone-path prefix of the given
1295# path with a placeholder to match.
1296# The Greenstone-path prefixes that can be matched are the following common Greenstone paths:
1297# the path to the current (specific) collection, the path to the general GS collect directory,
1298# the path to the site directory if GS3, else the path to the GSDLHOME/GSDL3HOME folder.
1299# The longest matching prefix will be replaced with the equivalent placeholder:
1300# @THISCOLLECTPATH@, else @COLLECTHOME@, else @SITEHOME@, else @GSDLHOME@.
1301sub abspath_to_placeholders {
1302 my $path = shift(@_); # path to convert from absolute to one with placeholders
1303
1304 return $path unless is_abs_path_any_os($path); # path is relative
1305
1306 # now we know we're dealing with absolute paths and have to replace gs prefixes with placeholders
1307
1308 my $site = $ENV{'SITEHOME'} ? $ENV{'SITEHOME'} : "";
1309 my @gs_paths = ($ENV{'GSDLCOLLECTDIR'}, $ENV{'GSDLCOLLECTHOME'}, $site, $ENV{'GREENSTONEHOME'}); # list in this order: from longest to shortest path
1310
1311 my %placeholder_map = ($ENV{'GREENSTONEHOME'} => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1312 $site => '@SITEHOME@',
1313 $ENV{'GSDLCOLLECTHOME'} => '@COLLECTHOME@',
1314 $ENV{'GSDLCOLLECTDIR'} => '@THISCOLLECTPATH@'
1315 );
1316
1317
1318 # The sequence of elements in @gs_paths matters
1319 # Need to loop starting from the *longest* matching path (the path to the specific collection)
1320 # to the shortest matching path (the path to gsdlhome/gsdl3home folder):
1321
1322 foreach my $gs_path (@gs_paths) {
1323 next if($gs_path eq ""); # site can be empty
1324
1325 my $re_path = &util::filename_to_regex($gs_path); # escape for regex
1326
1327 if($path =~ m/^$re_path/i) { # case sensitive or not for OS?
1328
1329 my $placeholder = $placeholder_map{$gs_path}; # get the placeholder to replace the matched path with
1330
1331 $path =~ s/^$re_path/$placeholder/; #case sensitive or not?
1332 #$path =~ s/^[\\\/]//; # remove gs_path's trailing separator left behind at the start of the path
1333 last; # done
1334 }
1335 }
1336
1337 return $path;
1338}
1339
1340# Function that does the reverse of the util::abspath_to_placeholders subroutine
1341# Once again, call this with the values returned from util::get_common_gs_paths
1342sub placeholders_to_abspath {
1343 my $path = shift(@_); # path that can contain placeholders to convert to resolved absolute path
1344
1345 return $path if($path !~ m/@/); # path contains no placeholders
1346
1347 # replace placeholders with gs prefixes
1348 my $site = $ENV{'SITEHOME'} ? $ENV{'SITEHOME'} : "";
1349 my @placeholders = ('@THISCOLLECTPATH@', '@COLLECTHOME@', '@SITEHOME@', '@GSDLHOME@'); # order of paths not crucial in this case,
1350 # but listed here from longest to shortest once placeholders are have been resolved
1351
1352 my %placeholder_to_gspath_map = ('@GSDLHOME@' => $ENV{'GREENSTONEHOME'}, # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1353 '@SITEHOME@' => $site,
1354 '@COLLECTHOME@' => $ENV{'GSDLCOLLECTHOME'},
1355 '@THISCOLLECTPATH@' => $ENV{'GSDLCOLLECTDIR'}
1356 );
1357
1358
1359 foreach my $placeholder (@placeholders) {
1360 my $gs_path = $placeholder_to_gspath_map{$placeholder};
1361
1362 next if($gs_path eq ""); # no value for sitehome for GS2
1363
1364 #my $sep = ($ENV{'GSDLOS'} eq "windows") ? "\\" : "/";
1365 #$gs_path = $gs_path . "$sep" if ($gs_path !~ m/$sep$/); # add in any trailing separator to the path prefix
1366
1367 if($path =~ m/^$placeholder/) {
1368 $path =~ s/^$placeholder/$gs_path/;
1369 last; # done
1370 }
1371 }
1372
1373 return $path;
1374}
1375
1376# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1377# a directory containing sequentially numbered images.
1378sub create_itemfile
1379{
1380 my ($output_dir, $convert_basename, $convert_to) = @_;
1381 my $page_num = "";
1382
1383 opendir(DIR, $output_dir) || die "can't opendir $output_dir: $!";
1384 my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1385 closedir DIR;
1386
1387 # Sort files in the directory by page_num
1388 sub page_number {
1389 my ($dir) = @_;
1390 my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
1391# my ($pagenum) =($dir =~ m/(\d+)(\.(jpg|gif|png))?$/i); # this works but is not as safe/strict about input filepatterns as the above
1392
1393 $pagenum = 1 unless defined $pagenum;
1394 return $pagenum;
1395 }
1396
1397 # sort the files in the directory in the order of page_num rather than lexically.
1398 @dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1399
1400 # work out if the numbering of the now sorted image files starts at 0 or not
1401 # by checking the number of the first _image_ file (skipping item files)
1402 my $starts_at_0 = 0;
1403 my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1404 if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1405 $starts_at_0 = 1;
1406 }
1407
1408 my $item_file = &FileUtils::filenameConcatenate($output_dir, $convert_basename.".item");
1409 my $item_fh;
1410 &FileUtils::openFileHandle($item_file, 'w', \$item_fh);
1411 print $item_fh "<PagedDocument>\n";
1412
1413 foreach my $file (@dir_files){
1414 if ($file !~ /\.item/i){
1415 $page_num = page_number($file);
1416 $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1417 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1418 }
1419 }
1420
1421 print $item_fh "</PagedDocument>\n";
1422 &FileUtils::closeFileHandle($item_file, \$item_fh);
1423 return $item_file;
1424}
1425
1426
1427## @function augmentINC()
1428#
1429# Prepend a path (if it exists) onto INC but only if it isn't already in INC
1430# @param $new_path The path to add
1431# @author jmt12
1432#
1433sub augmentINC
1434{
1435 my ($new_path) = @_;
1436 my $did_add_path = 0;
1437 # might need to be replaced with FileUtils::directoryExists() call eventually
1438 if (-d $new_path)
1439 {
1440 my $did_find_path = 0;
1441 foreach my $existing_path (@INC)
1442 {
1443 if ($existing_path eq $new_path)
1444 {
1445 $did_find_path = 1;
1446 last;
1447 }
1448 }
1449 if (!$did_find_path)
1450 {
1451 unshift(@INC, $new_path);
1452 $did_add_path = 1;
1453 }
1454 }
1455 return $did_add_path;
1456}
1457## augmentINC()
1458
1459
14601;
Note: See TracBrowser for help on using the repository browser.