source: main/trunk/greenstone2/perllib/util.pm@ 28236

Last change on this file since 28236 was 28236, checked in by ak19, 11 years ago

Added a parameter to specify whether paths on windows should be converted to long filenames or short (default is short) before getting being replaced by placeholders or when placeholders are replaced by absolute paths.

  • Property svn:keywords set to Author Date Id Revision
File size: 49.3 KB
Line 
1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package util;
27
28use strict;
29use FileUtils;
30
31use Encode;
32use File::Copy;
33use File::Basename;
34# Config for getting the perlpath in the recommended way, though it uses paths that are
35# hard-coded into the Config file that's generated upon configuring and compiling perl.
36# $^X works better in some cases to return the path to perl used to launch the script,
37# but if launched with plain "perl" (no full-path), that will be just what it returns.
38use Config;
39# New module for file related utility functions - intended as a
40# placeholder for an extension that allows a variety of different
41# filesystems (FTP, HTTP, SAMBA, WEBDav, HDFS etc)
42use FileUtils;
43
44if ($ENV{'GSDLOS'} =~ /^windows$/i) {
45 require Win32; # for working out Windows Long Filenames from Win 8.3 short filenames
46}
47
48# removes files (but not directories)
49sub rm {
50 warnings::warnif("deprecated", "util::rm() is deprecated, using FileUtils::removeFiles() instead");
51 return &FileUtils::removeFiles(@_);
52}
53
54# recursive removal
55sub filtered_rm_r {
56 warnings::warnif("deprecated", "util::filtered_rm_r() is deprecated, using FileUtils::removeFilesFiltered() instead");
57 return &FileUtils::removeFilesFiltered(@_);
58}
59
60# recursive removal
61sub rm_r {
62 warnings::warnif("deprecated", "util::rm_r() is deprecated, using FileUtils::removeFilesRecursive() instead");
63 return &FileUtils::removeFilesRecursive(@_);
64}
65
66# moves a file or a group of files
67sub mv {
68 warnings::warnif("deprecated", "util::mv() is deprecated, using FileUtils::moveFiles() instead");
69 return &FileUtils::moveFiles(@_);
70}
71
72# Move the contents of source directory into target directory
73# (as opposed to merely replacing target dir with the src dir)
74# This can overwrite any files with duplicate names in the target
75# but other files and folders in the target will continue to exist
76sub mv_dir_contents {
77 warnings::warnif("deprecated", "util::mv_dir_contents() is deprecated, using FileUtils::moveDirectoryContents() instead");
78 return &FileUtils::moveDirectoryContents(@_);
79}
80
81# copies a file or a group of files
82sub cp {
83 warnings::warnif("deprecated", "util::cp() is deprecated, using FileUtils::copyFiles() instead");
84 return &FileUtils::copyFiles(@_);
85}
86
87# recursively copies a file or group of files
88# syntax: cp_r (sourcefiles, destination directory)
89# destination must be a directory - to copy one file to
90# another use cp instead
91sub cp_r {
92 warnings::warnif("deprecated", "util::cp_r() is deprecated, using FileUtils::copyFilesrecursive() instead");
93 return &FileUtils::copyFilesRecursive(@_);
94}
95
96# recursively copies a file or group of files
97# syntax: cp_r (sourcefiles, destination directory)
98# destination must be a directory - to copy one file to
99# another use cp instead
100sub cp_r_nosvn {
101 warnings::warnif("deprecated", "util::cp_r_nosvn() is deprecated, using FileUtils::copyFilesRecursiveNoSVN() instead");
102 return &FileUtils::copyFilesRecursiveNoSVN(@_);
103}
104
105# copies a directory and its contents, excluding subdirectories, into a new directory
106sub cp_r_toplevel {
107 warnings::warnif("deprecated", "util::cp_r_toplevel() is deprecated, using FileUtils::recursiveCopyTopLevel() instead");
108 return &FileUtils::recursiveCopyTopLevel(@_);
109}
110
111sub mk_dir {
112 warnings::warnif("deprecated", "util::mk_dir() is deprecated, using FileUtils::makeDirectory() instead");
113 return &FileUtils::makeDirectory(@_);
114}
115
116# in case anyone cares - I did some testing (using perls Benchmark module)
117# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
118# slightly faster (surprisingly) - Stefan.
119sub mk_all_dir {
120 warnings::warnif("deprecated", "util::mk_all_dir() is deprecated, using FileUtils::makeAllDirectories() instead");
121 return &FileUtils::makeAllDirectories(@_);
122}
123
124# make hard link to file if supported by OS, otherwise copy the file
125sub hard_link {
126 warnings::warnif("deprecated", "util::hard_link() is deprecated, using FileUtils::hardLink() instead");
127 return &FileUtils::hardLink(@_);
128}
129
130# make soft link to file if supported by OS, otherwise copy file
131sub soft_link {
132 warnings::warnif("deprecated", "util::soft_link() is deprecated, using FileUtils::softLink() instead");
133 return &FileUtils::softLink(@_);
134}
135
136# Primarily for filenames generated by processing
137# content of HTML files (which are mapped to UTF-8 internally)
138#
139# To turn this into an octet string that really exists on the file
140# system:
141# 1. don't need to do anything special for Unix-based systems
142# (as underlying file system is byte-code)
143# 2. need to map to short DOS filenames for Windows
144
145sub utf8_to_real_filename
146{
147 my ($utf8_filename) = @_;
148
149 my $real_filename;
150
151 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
152 require Win32;
153
154 print STDERR "***** utf8 filename = $utf8_filename\n\n\n";
155
156 my $unicode_filename = decode("utf8",$utf8_filename);
157 $real_filename = Win32::GetShortPathName($unicode_filename);
158 }
159 else {
160 $real_filename = $utf8_filename;
161 }
162
163 return $real_filename;
164}
165
166sub fd_exists {
167 warnings::warnif("deprecated", "util::fd_exists() is deprecated, using FileUtils::fileTest() instead");
168 return &FileUtils::fileTest(@_);
169}
170
171sub file_exists {
172 warnings::warnif("deprecated", "util::file_exists() is deprecated, using FileUtils::fileExists() instead");
173 return &FileUtils::fileExists(@_);
174}
175
176sub dir_exists {
177 warnings::warnif("deprecated", "util::dir_exists() is deprecated, using FileUtils::directoryExists() instead");
178 return &FileUtils::directoryExists(@_);
179}
180
181# updates a copy of a directory in some other part of the filesystem
182# verbosity settings are: 0=low, 1=normal, 2=high
183# both $fromdir and $todir should be absolute paths
184sub cachedir {
185 warnings::warnif("deprecated", "util::cachedir() is deprecated, using FileUtils::synchronizeDirectories() instead");
186 return &FileUtils::synchronizeDirectories(@_);
187}
188
189# this function returns -1 if either file is not found
190# assumes that $file1 and $file2 are absolute file names or
191# in the current directory
192# $file2 is allowed to be newer than $file1
193sub differentfiles {
194 warnings::warnif("deprecated", "util::differentfiles() is deprecated, using FileUtils::differentFiles() instead");
195 return &FileUtils::differentFiles(@_);
196}
197
198
199sub get_tmp_filename
200{
201 my $file_ext = shift(@_) || undef;
202
203 my $opt_dot_file_ext = "";
204 if (defined $file_ext) {
205 if ($file_ext !~ m/\./) {
206 # no dot, so needs one added in at start
207 $opt_dot_file_ext = ".$file_ext"
208 }
209 else {
210 # allow for "extensions" such as _metadata.txt to be handled
211 # gracefully
212 $opt_dot_file_ext = $file_ext;
213 }
214 }
215
216 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
217 &FileUtils::makeAllDirectories ($tmpdir) unless -e $tmpdir;
218
219 my $count = 1000;
220 my $rand = int(rand $count);
221 my $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
222
223 while (-e $full_tmp_filename) {
224 $rand = int(rand $count);
225 $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
226 $count++;
227 }
228
229 return $full_tmp_filename;
230}
231
232# These 2 are "static" variables used by the get_timestamped_tmp_folder() subroutine below and
233# belong with that function. They help ensure the timestamped tmp folders generated are unique.
234my $previous_timestamp = undef;
235my $previous_timestamp_f = 0; # frequency
236
237sub get_timestamped_tmp_folder
238{
239
240 my $tmp_dirname;
241 if(defined $ENV{'GSDLCOLLECTDIR'}) {
242 $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
243 } elsif(defined $ENV{'GSDLHOME'}) {
244 $tmp_dirname = $ENV{'GSDLHOME'};
245 } else {
246 return undef;
247 }
248
249 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "tmp");
250 &FileUtils::makeDirectory($tmp_dirname) if (!-e $tmp_dirname);
251
252 # add the timestamp into the path otherwise we can run into problems
253 # if documents have the same name
254 my $timestamp = time;
255
256 if (!defined $previous_timestamp || ($timestamp > $previous_timestamp)) {
257 $previous_timestamp_f = 0;
258 $previous_timestamp = $timestamp;
259 } else {
260 $previous_timestamp_f++;
261 }
262
263 my $time_tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, $timestamp);
264 $tmp_dirname = $time_tmp_dirname;
265 my $i = $previous_timestamp_f;
266
267 if($previous_timestamp_f > 0) {
268 $tmp_dirname = $time_tmp_dirname."_".$i;
269 $i++;
270 }
271 while (-e $tmp_dirname) {
272 $tmp_dirname = $time_tmp_dirname."_".$i;
273 $i++;
274 }
275 &FileUtils::makeDirectory($tmp_dirname);
276
277 return $tmp_dirname;
278}
279
280sub get_timestamped_tmp_filename_in_collection
281{
282
283 my ($input_filename, $output_ext) = @_;
284 # derive tmp filename from input filename
285 my ($tailname, $dirname, $suffix)
286 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
287
288 # softlink to collection tmp dir
289 my $tmp_dirname = &util::get_timestamped_tmp_folder();
290 $tmp_dirname = $dirname unless defined $tmp_dirname;
291
292 # following two steps copied from ConvertBinaryFile
293 # do we need them?? can't use them as is, as they use plugin methods.
294
295 #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
296
297 # URLEncode this since htmls with images where the html filename is utf8 don't seem
298 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
299 # files on the filesystem.
300 #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
301 if (defined $output_ext) {
302 $output_ext = ".$output_ext"; # add the dot
303 } else {
304 $output_ext = $suffix;
305 }
306 $output_ext= lc($output_ext);
307 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$output_ext");
308
309 return $tmp_filename;
310}
311
312sub get_toplevel_tmp_dir
313{
314 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
315}
316
317
318sub filename_to_regex {
319 my $filename = shift (@_);
320
321 # need to make single backslashes double so that regex works
322 $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);
323
324 # note that the first part of a substitution is a regex, so RE chars need to be escaped,
325 # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
326 $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
327 $filename =~ s@\(@\\(@g; # escape brackets
328 $filename =~ s@\)@\\)@g; # escape brackets
329 $filename =~ s@\[@\\[@g; # escape brackets
330 $filename =~ s@\]@\\]@g; # escape brackets
331
332 return $filename;
333}
334
335sub unregex_filename {
336 my $filename = shift (@_);
337
338 # need to put doubled backslashes for regex back to single
339 $filename =~ s/\\\./\./g; # remove RE syntax for .
340 $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
341 $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
342 $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
343 $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
344
345 # \\ goes to \
346 # This is the last step in reverse mirroring the order of steps in filename_to_regex()
347 $filename =~ s/\\\\/\\/g; # remove RE syntax for \
348 return $filename;
349}
350
351sub filename_cat {
352 # I've disabled this warning for now, as every Greenstone perl
353 # script seems to make use of this function and so you drown in a
354 # sea of deprecated warnings [jmt12]
355# warnings::warnif("deprecated", "util::filename_cat() is deprecated, using FileUtils::filenameConcatenate() instead");
356 return &FileUtils::filenameConcatenate(@_);
357}
358
359
360sub pathname_cat {
361 my $first_path = shift(@_);
362 my (@pathnames) = @_;
363
364 # If first_path is not null or empty, then add it back into the list
365 if (defined $first_path && $first_path =~ /\S/) {
366 unshift(@pathnames, $first_path);
367 }
368
369 my $join_char;
370 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
371 $join_char = ";";
372 } else {
373 $join_char = ":";
374 }
375
376 my $pathname = join($join_char, @pathnames);
377
378 # remove duplicate slashes
379 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
380 $pathname =~ s/[\\\/]+/\\/g;
381 } else {
382 $pathname =~ s/[\/]+/\//g;
383 # DB: want a pathname abc\de.html to remain like this
384 }
385
386 return $pathname;
387}
388
389
390sub tidy_up_oid {
391 my ($OID) = @_;
392 if ($OID =~ /\./) {
393 print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
394 $OID =~ s/\.//g; #remove any periods
395 }
396 if ($OID =~ /^\s.*\s$/) {
397 print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
398 # remove starting and trailing whitespace
399 $OID =~ s/^\s+//;
400 $OID =~ s/\s+$//;
401 }
402 if ($OID =~ /^[\d]*$/) {
403 print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
404 $OID = "D" . $OID;
405 }
406
407 return $OID;
408}
409
410sub envvar_prepend {
411 my ($var,$val) = @_;
412
413 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
414 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
415
416 # do not prepend any value/path that's already in the environment variable
417
418 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
419 if (!defined($ENV{$var})) {
420 $ENV{$var} = "$val";
421 }
422 elsif($ENV{$var} !~ m/$escaped_val/) {
423 $ENV{$var} = "$val".$pathsep.$ENV{$var};
424 }
425}
426
427sub envvar_append {
428 my ($var,$val) = @_;
429
430 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
431 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
432
433 # do not append any value/path that's already in the environment variable
434
435 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
436 if (!defined($ENV{$var})) {
437 $ENV{$var} = "$val";
438 }
439 elsif($ENV{$var} !~ m/$escaped_val/) {
440 $ENV{$var} = $ENV{$var}.$pathsep."$val";
441 }
442}
443
444
445# splits a filename into a prefix and a tail extension using the tail_re, or
446# if that fails, splits on the file_extension . (dot)
447sub get_prefix_and_tail_by_regex {
448
449 my ($filename,$tail_re) = @_;
450
451 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
452 if ((!defined $file_prefix) || (!defined $file_ext)) {
453 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
454 }
455
456 return ($file_prefix,$file_ext);
457}
458
459# get full path and file only path from a base_dir (which may be empty) and
460# file (which may contain directories)
461sub get_full_filenames {
462 my ($base_dir, $file) = @_;
463
464 my $filename_full_path = $file;
465 # add on directory if present
466 $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\S/;
467
468 my $filename_no_path = $file;
469
470 # remove directory if present
471 $filename_no_path =~ s/^.*[\/\\]//;
472 return ($filename_full_path, $filename_no_path);
473}
474
475# returns the path of a file without the filename -- ie. the directory the file is in
476sub filename_head {
477 my $filename = shift(@_);
478
479 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
480 $filename =~ s/[^\\\\]*$//;
481 }
482 else {
483 $filename =~ s/[^\\\/]*$//;
484 }
485
486 return $filename;
487}
488
489
490
491# returns 1 if filename1 and filename2 point to the same
492# file or directory
493sub filenames_equal {
494 my ($filename1, $filename2) = @_;
495
496 # use filename_cat to clean up trailing slashes and
497 # multiple slashes
498 $filename1 = &FileUtils::filenameConcatenate($filename1);
499 $filename2 = &FileUtils::filenameConcatenate($filename2);
500
501 # filenames not case sensitive on windows
502 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
503 $filename1 =~ tr/[A-Z]/[a-z]/;
504 $filename2 =~ tr/[A-Z]/[a-z]/;
505 }
506 return 1 if $filename1 eq $filename2;
507 return 0;
508}
509
510# If filename is relative to within_dir, returns the relative path of filename to that directory
511# with slashes in the filename returned as they were in the original (absolute) filename.
512sub filename_within_directory
513{
514 my ($filename,$within_dir) = @_;
515
516 if ($within_dir !~ m/[\/\\]$/) {
517 my $dirsep = &util::get_dirsep();
518 $within_dir .= $dirsep;
519 }
520
521 $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets
522 if ($filename =~ m/^$within_dir(.*)$/) {
523 $filename = $1;
524 }
525
526 return $filename;
527}
528
529# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
530# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
531# The subpath returned will also be a URL type filename.
532sub filename_within_directory_url_format
533{
534 my ($filename,$within_dir) = @_;
535
536 # convert parameters only to / slashes if Windows
537
538 my $filename_urlformat = &filepath_to_url_format($filename);
539 my $within_dir_urlformat = &filepath_to_url_format($within_dir);
540
541 #if ($within_dir_urlformat !~ m/\/$/) {
542 # make sure directory ends with a slash
543 #$within_dir_urlformat .= "/";
544 #}
545
546 my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
547
548 #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
549
550 # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
551 if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
552 $filename_urlformat = $1;
553 }
554
555 return $filename_urlformat;
556}
557
558# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
559# since on Linux it doesn't represent a file separator but an escape char).
560sub filepath_to_url_format
561{
562 my ($filepath) = @_;
563 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
564 # Only need to worry about Windows, as Unix style directories already in url-format
565 # Convert Windows style \ => /
566 $filepath =~ s@\\@/@g;
567 }
568 return $filepath;
569}
570
571# regex filepaths on windows may include \\ as path separator. Convert \\ to /
572sub filepath_regex_to_url_format
573{
574 my ($filepath) = @_;
575 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
576 # Only need to worry about Windows, as Unix style directories already in url-format
577 # Convert Windows style \\ => /
578 $filepath =~ s@\\\\@/@g;
579 }
580 return $filepath;
581
582}
583
584# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
585# and ignores trailing /
586# returns (file, dirs) dirs will be empty if no subdirs
587sub url_fileparse
588{
589 my ($filepath) = @_;
590 # remove trailing /
591 $filepath =~ s@/$@@;
592 if ($filepath !~ m@/@) {
593 return ($filepath, "");
594 }
595 my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
596 return ($file, $dirs);
597
598}
599
600
601sub filename_within_collection
602{
603 my ($filename) = @_;
604
605 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
606
607 if (defined $collect_dir) {
608
609 # if from within GSDLCOLLECTDIR, then remove directory prefix
610 # so source_filename is realative to it. This is done to aid
611 # portability, i.e. the collection can be moved to somewhere
612 # else on the file system and the archives directory will still
613 # work. This is needed, for example in the applet version of
614 # GLI where GSDLHOME/collect on the server will be different to
615 # the collect directory of the remove user. Of course,
616 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
617 # it back into a full pathname.
618
619 $filename = filename_within_directory($filename,$collect_dir);
620 }
621
622 return $filename;
623}
624
625sub prettyprint_file
626{
627 my ($base_dir,$file,$gli) = @_;
628
629 my $filename_full_path = &FileUtils::filenameConcatenate($base_dir,$file);
630
631 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
632 require Win32;
633
634 # For some reason base_dir in the form c:/a/b/c
635 # This leads to confusion later on, so turn it back into
636 # the more usual Windows form
637 $base_dir =~ s/\//\\/g;
638 my $long_base_dir = Win32::GetLongPathName($base_dir);
639 my $long_full_path = Win32::GetLongPathName($filename_full_path);
640
641 $file = filename_within_directory($long_full_path,$long_base_dir);
642 $file = encode("utf8",$file) if ($gli);
643 }
644
645 return $file;
646}
647
648
649sub upgrade_if_dos_filename
650{
651 my ($filename_full_path,$and_encode) = @_;
652
653 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
654 # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
655 # to its long (Windows) version
656 my $long_filename = Win32::GetLongPathName($filename_full_path);
657 if (defined $long_filename) {
658 $filename_full_path = $long_filename;
659 }
660 # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
661 $filename_full_path =~ s/^(.):/\u$1:/;
662 if ((defined $and_encode) && ($and_encode)) {
663 $filename_full_path = encode("utf8",$filename_full_path);
664 }
665 }
666
667 return $filename_full_path;
668}
669
670
671sub downgrade_if_dos_filename
672{
673 my ($filename_full_path) = @_;
674
675 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
676 require Win32;
677
678 # Ensure the given long Windows filename is in a form that can
679 # be opened by Perl => convert it to a short DOS-like filename
680
681 my $short_filename = Win32::GetShortPathName($filename_full_path);
682 if (defined $short_filename) {
683 $filename_full_path = $short_filename;
684 }
685 # Make sure initial drive letter is lower-case (to fit in
686 # with rest of Greenstone)
687 $filename_full_path =~ s/^(.):/\u$1:/;
688 }
689
690 return $filename_full_path;
691}
692
693sub block_filename
694{
695 my ($block_hash,$filename) = @_;
696
697 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
698
699 # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
700 my $lower_filename = lc($filename);
701 $block_hash->{'file_blocks'}->{$lower_filename} = 1;
702# my $lower_drive = $filename;
703# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
704
705# my $upper_drive = $filename;
706# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
707#
708# $block_hash->{'file_blocks'}->{$lower_drive} = 1;
709# $block_hash->{'file_blocks'}->{$upper_drive} = 1;
710 }
711 else {
712 $block_hash->{'file_blocks'}->{$filename} = 1;
713 }
714}
715
716
717sub filename_is_absolute
718{
719 warnings::warnif("deprecated", "util::filename_is_absolute() is deprecated, using FileUtils::isFilenameAbsolute() instead");
720 return &FileUtils::isFilenameAbsolute(@_);
721}
722
723
724## @method make_absolute()
725#
726# Ensure the given file path is absolute in respect to the given base path.
727#
728# @param $base_dir A string denoting the base path the given dir must be
729# absolute to.
730# @param $dir The directory to be made absolute as a string. Note that the
731# dir may already be absolute, in which case it will remain
732# unchanged.
733# @return The now absolute form of the directory as a string.
734#
735# @author John Thompson, DL Consulting Ltd.
736# @copy 2006 DL Consulting Ltd.
737#
738#used in buildcol.pl, doesn't work for all cases --kjdon
739sub make_absolute {
740
741 my ($base_dir, $dir) = @_;
742### print STDERR "dir = $dir\n";
743 $dir =~ s/[\\\/]+/\//g;
744 $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
745 $dir =~ s|^/tmp_mnt||;
746 1 while($dir =~ s|/[^/]*/\.\./|/|g);
747 $dir =~ s|/[.][.]?/|/|g;
748 $dir =~ tr|/|/|s;
749### print STDERR "dir = $dir\n";
750
751 return $dir;
752}
753## make_absolute() ##
754
755sub get_dirsep {
756
757 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
758 return "\\";
759 } else {
760 return "\/";
761 }
762}
763
764sub get_os_dirsep {
765
766 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
767 return "\\\\";
768 } else {
769 return "\\\/";
770 }
771}
772
773sub get_re_dirsep {
774
775 return "\\\\|\\\/";
776}
777
778
779sub get_dirsep_tail {
780 my ($filename) = @_;
781
782 # returns last part of directory or filename
783 # On unix e.g. a/b.d => b.d
784 # a/b/c => c
785
786 my $dirsep = get_re_dirsep();
787 my @dirs = split (/$dirsep/, $filename);
788 my $tail = pop @dirs;
789
790 # - caused problems under windows
791 #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
792
793 return $tail;
794}
795
796
797# if this is running on windows we want binaries to end in
798# .exe, otherwise they don't have to end in any extension
799sub get_os_exe {
800 return ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
801 return "";
802}
803
804
805# test to see whether this is a big or little endian machine
806sub is_little_endian
807{
808 # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
809 # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
810 # Otherwise, it's little endian
811
812 #return 0 if $^O =~ /^darwin$/i;
813 #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
814
815 # Going back to stating exactly whether the machine is little endian
816 # or big endian, without any special case for Macs. Since for rata it comes
817 # back with little endian and for shuttle with bigendian.
818 return (ord(substr(pack("s",1), 0, 1)) == 1);
819}
820
821
822# will return the collection name if successful, "" otherwise
823sub use_collection {
824 my ($collection, $collectdir) = @_;
825
826 if (!defined $collectdir || $collectdir eq "") {
827 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect");
828 }
829
830 if (!defined $ENV{'GREENSTONEHOME'}) { # for GS3, would have been defined in use_site_collection, to GSDL3HOME
831 $ENV{'GREENSTONEHOME'} = $ENV{'GSDLHOME'};
832 }
833
834 # get and check the collection
835 if (!defined($collection) || $collection eq "") {
836 if (defined $ENV{'GSDLCOLLECTION'}) {
837 $collection = $ENV{'GSDLCOLLECTION'};
838 } else {
839 print STDOUT "No collection specified\n";
840 return "";
841 }
842 }
843
844 if ($collection eq "modelcol") {
845 print STDOUT "You can't use modelcol.\n";
846 return "";
847 }
848
849 # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
850 # are defined
851 $ENV{'GSDLCOLLECTION'} = $collection;
852 $ENV{'GSDLCOLLECTHOME'} = $collectdir;
853 $ENV{'GSDLCOLLECTDIR'} = &FileUtils::filenameConcatenate($collectdir, $collection);
854
855 # make sure this collection exists
856 if (!-e $ENV{'GSDLCOLLECTDIR'}) {
857 print STDOUT "Invalid collection ($collection).\n";
858 return "";
859 }
860
861 # everything is ready to go
862 return $collection;
863}
864
865sub get_current_collection_name {
866 return $ENV{'GSDLCOLLECTION'};
867}
868
869
870# will return the collection name if successful, "" otherwise.
871# Like use_collection (above) but for greenstone 3 (taking account of site level)
872
873sub use_site_collection {
874 my ($site, $collection, $collectdir) = @_;
875
876 if (!defined $collectdir || $collectdir eq "") {
877 die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
878 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, "sites", $site, "collect");
879 }
880
881 if (defined $ENV{'GSDL3HOME'}) {
882 $ENV{'GREENSTONEHOME'} = $ENV{'GSDL3HOME'};
883 $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
884 } elsif (defined $ENV{'GSDL3SRCHOME'}) {
885 $ENV{'GREENSTONEHOME'} = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "web");
886 $ENV{'SITEHOME'} = &FileUtils::filenameConcatenate($ENV{'GREENSTONEHOME'}, "sites", $site);
887 } else {
888 print STDERR "*** util::use_site_collection(). Warning: Neither GSDL3HOME nor GSDL3SRCHOME set.\n";
889 }
890
891 # collectdir explicitly set by this point (using $site variable if required).
892 # Can call "old" gsdl2 use_collection now.
893
894 return use_collection($collection,$collectdir);
895}
896
897
898
899sub locate_config_file
900{
901 my ($file) = @_;
902
903 my $locations = locate_config_files($file);
904
905 return shift @$locations; # returns undef if 'locations' is empty
906}
907
908
909sub locate_config_files
910{
911 my ($file) = @_;
912
913 my @locations = ();
914
915 if (-e $file) {
916 # Clearly specified (most likely full filename)
917 # No need to hunt in 'etc' directories, return value unchanged
918 push(@locations,$file);
919 }
920 else {
921 # Check for collection specific one before looking in global GSDL 'etc'
922 if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
923 my $test_collect_etc_filename
924 = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"etc", $file);
925
926 if (-e $test_collect_etc_filename) {
927 push(@locations,$test_collect_etc_filename);
928 }
929 }
930 my $test_main_etc_filename
931 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"etc", $file);
932 if (-e $test_main_etc_filename) {
933 push(@locations,$test_main_etc_filename);
934 }
935 }
936
937 return \@locations;
938}
939
940
941sub hyperlink_text
942{
943 my ($text) = @_;
944
945 $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
946 $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
947
948 return $text;
949}
950
951
952# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
953# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
954sub is_dir_empty {
955 warnings::warnif("deprecated", "util::is_dir_empty() is deprecated, using FileUtils::isDirectoryEmpty() instead");
956 return &FileUtils::isDirectoryEmpty(@_);
957}
958
959# Returns the given filename converted using either URL encoding or base64
960# encoding, as specified by $rename_method. If the given filename has no suffix
961# (if it is just the tailname), then $no_suffix should be some defined value.
962# rename_method can be url, none, base64
963sub rename_file {
964 my ($filename, $rename_method, $no_suffix) = @_;
965
966 if(!$filename) { # undefined or empty string
967 return $filename;
968 }
969
970 if (!$rename_method) {
971 print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
972 # Debugging information
973 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
974 # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
975 $rename_method = "url";
976 } elsif($rename_method eq "none") {
977 return $filename; # would have already been renamed
978 }
979
980 # No longer replace spaces with underscores, since underscores mess with incremental rebuild
981 ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
982 ###$filename =~ s/ /_/g;
983
984 my ($tailname,$dirname,$suffix);
985 if($no_suffix) { # given a tailname, no suffix
986 ($tailname,$dirname) = File::Basename::fileparse($filename);
987 }
988 else {
989 ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
990 }
991 if (!$suffix) {
992 $suffix = "";
993 }
994 # This breaks GLI matching extracted metadata to files in Enrich panel, as
995 # original is eg .JPG while gsdlsourcefilename ends up .jpg
996 # Not sure why it was done in first place...
997 #else {
998 #$suffix = lc($suffix);
999 #}
1000
1001 if ($rename_method eq "url") {
1002 $tailname = &unicode::url_encode($tailname);
1003 }
1004 elsif ($rename_method eq "base64") {
1005 $tailname = &unicode::base64_encode($tailname);
1006 $tailname =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
1007 }
1008
1009 $filename = "$tailname$suffix";
1010 $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
1011
1012 return $filename;
1013}
1014
1015
1016# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
1017sub rename_ldb_or_bdb_file {
1018 my ($filename_no_ext) = @_;
1019
1020 my $new_filename = "$filename_no_ext.gdb";
1021 return if (-f $new_filename); # if the file has the right extension, don't need to do anything
1022 # try ldb
1023 my $old_filename = "$filename_no_ext.ldb";
1024
1025 if (-f $old_filename) {
1026 print STDERR "Renaming $old_filename to $new_filename\n";
1027 rename ($old_filename, $new_filename)
1028 || print STDERR "Rename failed: $!\n";
1029 return;
1030 }
1031 # try bdb
1032 $old_filename = "$filename_no_ext.bdb";
1033 if (-f $old_filename) {
1034 print STDERR "Renaming $old_filename to $new_filename\n";
1035 rename ($old_filename, $new_filename)
1036 || print STDERR "Rename failed: $!\n";
1037 return;
1038 }
1039}
1040
1041sub os_dir() {
1042
1043 my $gsdlarch = "";
1044 if(defined $ENV{'GSDLARCH'}) {
1045 $gsdlarch = $ENV{'GSDLARCH'};
1046 }
1047 return $ENV{'GSDLOS'}.$gsdlarch;
1048}
1049
1050# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1051# By default, /greenstone3 for GS3 or /greenstone for GS2.
1052sub get_greenstone_url_prefix() {
1053 # if already set on a previous occasion, just return that
1054 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1055 return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1056
1057 my ($configfile, $urlprefix, $defaultUrlprefix);
1058 my @propertynames = ();
1059
1060 if($ENV{'GSDL3SRCHOME'}) {
1061 $defaultUrlprefix = "/greenstone3";
1062 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1063 push(@propertynames, qw/path\s*\=/);
1064 } else {
1065 $defaultUrlprefix = "/greenstone";
1066 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1067 push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1068 }
1069
1070 $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1071
1072 if(!$urlprefix) { # no values found for URL prefix, use default values
1073 $urlprefix = $defaultUrlprefix;
1074 } else {
1075 #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1076 $urlprefix =~ s/^\///; # remove the starting slash
1077 my @dirs = split(/(\\|\/)/, $urlprefix);
1078 $urlprefix = shift(@dirs);
1079
1080 if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1081 $urlprefix = "/$urlprefix";
1082 }
1083 }
1084
1085 # set for the future
1086 $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1087# print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1088 return $urlprefix;
1089}
1090
1091
1092# Given a config file (xml or java properties file) and a list/array of regular expressions
1093# that represent property names to match on, this function will return the value for the 1st
1094# matching property name. If the return value is undefined, no matching property was found.
1095sub extract_propvalue_from_file() {
1096 my ($configfile, $propertynames) = @_;
1097
1098 my $value;
1099 unless(open(FIN, "<$configfile")) {
1100 print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1101 return $value; # not initialised
1102 }
1103
1104 # Read the entire file at once, as one single line, then close it
1105 my $filecontents;
1106 {
1107 local $/ = undef;
1108 $filecontents = <FIN>;
1109 }
1110 close(FIN);
1111
1112 foreach my $regex (@$propertynames) {
1113 ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1114 if($value) {
1115 $value =~ s/^\"//; # remove any startquotes
1116 $value =~ s/\".*$//; # remove the 1st endquotes (if any) followed by any xml
1117 last; # found value for a matching property, break from loop
1118 }
1119 }
1120
1121 return $value;
1122}
1123
1124# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1125# given that perllib is in @INC in order to invoke this subroutine.
1126# Call as follows -- after setting up INC to include perllib and
1127# after setting up GSDLHOME and GSDLOS:
1128#
1129# require util;
1130# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1131#
1132sub setup_greenstone_env() {
1133 my ($GSDLHOME, $GSDLOS) = @_;
1134
1135 #my %env_map = ();
1136 # Get the localised ENV settings of running a localised source setup.bash
1137 # and put it into the ENV here. Need to clear GSDLHOME before running setup
1138 #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1139 my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";
1140 if($GSDLOS =~ m/windows/i) {
1141 #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1142 $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1143 }
1144 if (!open(PIN, "$perl_command |")) {
1145 print STDERR ("Unable to execute command: $perl_command. $!\n");
1146 }
1147
1148 while (defined (my $perl_output_line = <PIN>)) {
1149 my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1150 #$env_map{$key}=$value;
1151 $ENV{$key}=$value;
1152 }
1153 close (PIN);
1154
1155 # If any keys in $ENV don't occur in Greenstone's localised env
1156 # (stored in $env_map), delete those entries from $ENV
1157 #foreach $key (keys %ENV) {
1158 # if(!defined $env_map{$key}) {
1159 # print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{'$key'}\n";
1160 # delete $ENV{$key}; # del $ENV(key, value) pair
1161 # }
1162 #}
1163 #undef %env_map;
1164}
1165
1166sub get_perl_exec() {
1167 my $perl_exec = $^X; # may return just "perl"
1168
1169 if($ENV{'PERLPATH'}) {
1170 # OR: # $perl_exec = &FileUtils::filenameConcatenate($ENV{'PERLPATH'},"perl");
1171 if($ENV{'GSDLOS'} =~ m/windows/) {
1172 $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1173 } else {
1174 $perl_exec = "$ENV{'PERLPATH'}/perl";
1175 }
1176 } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1177 # containing the full path to the current perl executable we're using
1178 $perl_exec = $Config{perlpath}; # configured path for perl
1179 if (!-e $perl_exec) { # may not point to location on this machine
1180 $perl_exec = $^X; # may return just "perl"
1181 if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1182 print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";
1183 }
1184 }
1185 }
1186
1187 return $perl_exec;
1188}
1189
1190# returns the path to the java command in the JRE included with GS (if any),
1191# quoted to safeguard any spaces in this path, otherwise a simple java
1192# command is returned which assumes and will try for a system java.
1193sub get_java_command {
1194 my $java = "java";
1195 if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1196 # after running setup.bat or from GLI which also runs setup.bat
1197 my $java_bin = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"packages","jre","bin");
1198 if(-d $java_bin) {
1199 $java = &FileUtils::filenameConcatenate($java_bin,"java");
1200 $java = "\"".$java."\""; # quoted to preserve spaces in path
1201 }
1202 }
1203 return $java;
1204}
1205
1206
1207# Given the qualified collection name (colgroup/collection),
1208# returns the collection and colgroup parts
1209sub get_collection_parts {
1210 # http://perldoc.perl.org/File/Basename.html
1211 # my($filename, $directories, $suffix) = fileparse($path);
1212 # "$directories contains everything up to and including the last directory separator in the $path
1213 # including the volume (if applicable). The remainder of the $path is the $filename."
1214 #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);
1215
1216 my $qualified_collection = shift(@_);
1217
1218 # Since activate.pl can be launched from the command-line, including by a user,
1219 # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1220 # Also allow for the accidental inclusion of multiple slashes
1221 my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1222
1223 if(!defined $collection) {
1224 $collection = $colgroup;
1225 $colgroup = "";
1226 }
1227 return ($collection, $colgroup);
1228}
1229
1230# work out the "collectdir/collection" location
1231sub resolve_collection_dir {
1232 my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1233
1234 if (defined $ENV{'GSDLCOLLECTDIR'}) { # a predefined collection dir exists
1235 return $ENV{'GSDLCOLLECTDIR'};
1236 }
1237
1238 my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);
1239
1240 if (!defined $collect_dir || !$collect_dir) { # if undefined or empty string
1241 $collect_dir = &util::get_working_collect_dir($site);
1242 }
1243
1244 return &FileUtils::filenameConcatenate($collect_dir,$colgroup,$collection);
1245}
1246
1247# work out the full path to "collect" of this greenstone 2/3 installation
1248sub get_working_collect_dir {
1249 my ($site) = @_;
1250
1251 if (defined $ENV{'GSDLCOLLECTHOME'}) { # a predefined collect dir exists
1252 return $ENV{'GSDLCOLLECTHOME'};
1253 }
1254
1255 if (defined $site && $site) { # site non-empty, so get default collect dir for GS3
1256
1257 if (defined $ENV{'GSDL3HOME'}) {
1258 return &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'},"sites",$site,"collect"); # web folder
1259 }
1260 elsif (defined $ENV{'GSDL3SRCHOME'}) {
1261 return &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'},"web","sites",$site,"collect");
1262 }
1263 }
1264
1265 elsif (defined $ENV{'SITEHOME'}) {
1266 return &FileUtils::filenameConcatenate($ENV{'SITEHOME'},"collect");
1267 }
1268
1269 else { # get default collect dir for GS2
1270 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect");
1271 }
1272}
1273
1274sub is_abs_path_any_os {
1275 my ($path) = @_;
1276
1277 # We can have filenames in our DBs that were produced on other OS, so this method exists
1278 # to help identify absolute paths in such cases.
1279
1280 return 1 if($path =~ m@^/@); # full paths begin with forward slash on linux/mac
1281 return 1 if($path =~ m@^([a-zA-Z]\:|\\)@); # full paths begin with drive letter colon for Win or \ for volume, http://stackoverflow.com/questions/13011013/get-only-volume-name-from-filepath
1282
1283 return 0;
1284}
1285
1286
1287# This subroutine is for improving portability of Greenstone collections from one OS to another,
1288# to be used to convert absolute paths going into db files into paths with placeholders instead.
1289# This sub works with util::get_common_gs_paths and takes a path to a greenstone file and, if it's
1290# an absolute path, then it will replace the longest matching greenstone-path prefix of the given
1291# path with a placeholder to match.
1292# The Greenstone-path prefixes that can be matched are the following common Greenstone paths:
1293# the path to the current (specific) collection, the path to the general GS collect directory,
1294# the path to the site directory if GS3, else the path to the GSDLHOME/GSDL3HOME folder.
1295# The longest matching prefix will be replaced with the equivalent placeholder:
1296# @THISCOLLECTPATH@, else @COLLECTHOME@, else @SITEHOME@, else @GSDLHOME@.
1297sub abspath_to_placeholders {
1298 my $path = shift(@_); # path to convert from absolute to one with placeholders
1299 my $opt_long_or_short_winfilenames = shift(@_) || "short"; # whether we want to force use of long file names even on windows, default uses short
1300
1301 return $path unless is_abs_path_any_os($path); # path is relative
1302
1303 if ($opt_long_or_short_winfilenames eq "long") {
1304 $path = &util::upgrade_if_dos_filename($path); # will only do something on windows
1305 }
1306
1307 # now we know we're dealing with absolute paths and have to replace gs prefixes with placeholders
1308 my @gs_paths = ($ENV{'GSDLCOLLECTDIR'}, $ENV{'GSDLCOLLECTHOME'}, $ENV{'SITEHOME'}, $ENV{'GREENSTONEHOME'}); # list in this order: from longest to shortest path
1309
1310 my %placeholder_map = ($ENV{'GREENSTONEHOME'} => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1311 $ENV{'GSDLCOLLECTHOME'} => '@COLLECTHOME@',
1312 $ENV{'GSDLCOLLECTDIR'} => '@THISCOLLECTPATH@'
1313 );
1314 $placeholder_map{$ENV{'SITEHOME'}} = '@SITEHOME@' if defined $ENV{'SITEHOME'};
1315
1316 $path = &util::_abspath_to_placeholders($path, \@gs_paths, \%placeholder_map);
1317
1318 if ($ENV{'GSDLOS'} =~ /^windows$/i && $opt_long_or_short_winfilenames eq "short") {
1319 # for windows need to look for matches on short file names too
1320 # matched paths are again to be replaced with the usual placeholders
1321
1322 my $gsdlcollectdir = &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTDIR'});
1323 my $gsdlcollecthome = &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTHOME'});
1324 my $sitehome = (defined $ENV{'SITEHOME'}) ? &util::downgrade_if_dos_filename($ENV{'SITEHOME'}) : undef;
1325 my $greenstonehome = &util::downgrade_if_dos_filename($ENV{'GREENSTONEHOME'});
1326
1327 @gs_paths = ($gsdlcollectdir, $gsdlcollecthome, $sitehome, $greenstonehome); # order matters
1328
1329 %placeholder_map = ($greenstonehome => '@GSDLHOME@', # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1330 $gsdlcollecthome => '@COLLECTHOME@',
1331 $gsdlcollectdir => '@THISCOLLECTPATH@'
1332 );
1333 $placeholder_map{$sitehome} = '@SITEHOME@' if defined $sitehome;
1334
1335 $path = &util::_abspath_to_placeholders($path, \@gs_paths, \%placeholder_map);
1336 }
1337
1338 return $path;
1339}
1340
1341sub _abspath_to_placeholders {
1342 my ($path, $gs_paths_ref, $placeholder_map_ref) = @_;
1343
1344 # The sequence of elements in @gs_paths matters
1345 # Need to loop starting from the *longest* matching path (the path to the specific collection)
1346 # to the shortest matching path (the path to gsdlhome/gsdl3home folder):
1347
1348 foreach my $gs_path (@$gs_paths_ref) {
1349 next if(!defined $gs_path); # site undefined for GS2
1350
1351 my $re_path = &util::filename_to_regex($gs_path); # escape for regex
1352
1353 if($path =~ m/^$re_path/i) { # case sensitive or not for OS?
1354
1355 my $placeholder = $placeholder_map_ref->{$gs_path}; # get the placeholder to replace the matched path with
1356
1357 $path =~ s/^$re_path/$placeholder/; #case sensitive or not?
1358 #$path =~ s/^[\\\/]//; # remove gs_path's trailing separator left behind at the start of the path
1359 last; # done
1360 }
1361 }
1362
1363 return $path;
1364}
1365
1366# Function that does the reverse of the util::abspath_to_placeholders subroutine
1367# Once again, call this with the values returned from util::get_common_gs_paths
1368sub placeholders_to_abspath {
1369 my $path = shift(@_); # path that can contain placeholders to convert to resolved absolute path
1370 my $opt_long_or_short_winfilenames = shift(@_) || "short"; # whether we want to force use of long file names even on windows, default uses short
1371
1372 return $path if($path !~ m/@/); # path contains no placeholders
1373
1374 # replace placeholders with gs prefixes
1375 my @placeholders = ('@THISCOLLECTPATH@', '@COLLECTHOME@', '@SITEHOME@', '@GSDLHOME@'); # order of paths not crucial in this case,
1376 # but listed here from longest to shortest once placeholders are have been resolved
1377
1378 # can't use double-quotes around at-sign, else perl tries to evaluate it as referring to an array
1379 my %placeholder_to_gspath_map;
1380 if ($ENV{'GSDLOS'} =~ /^windows$/i && $opt_long_or_short_winfilenames eq "short") {
1381 # always replace placeholders with short file names of the absolute paths on windows?
1382 %placeholder_to_gspath_map = ('@GSDLHOME@' => &util::downgrade_if_dos_filename($ENV{'GREENSTONEHOME'}),
1383 '@COLLECTHOME@' => &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTHOME'}),
1384 '@THISCOLLECTPATH@' => &util::downgrade_if_dos_filename($ENV{'GSDLCOLLECTDIR'})
1385 );
1386 $placeholder_to_gspath_map{'@SITEHOME@'} = &util::downgrade_if_dos_filename($ENV{'SITEHOME'}) if defined $ENV{'SITEHOME'};
1387 } else {
1388 %placeholder_to_gspath_map = ('@GSDLHOME@' => $ENV{'GREENSTONEHOME'},
1389 '@SITEHOME@' => $ENV{'SITEHOME'}, # can be undef
1390 '@COLLECTHOME@' => $ENV{'GSDLCOLLECTHOME'},
1391 '@THISCOLLECTPATH@' => $ENV{'GSDLCOLLECTDIR'}
1392 ); # $placeholder_to_gspath_map{'@SITEHOME@'} = $ENV{'SITEHOME'} if defined $ENV{'SITEHOME'};
1393 }
1394
1395 foreach my $placeholder (@placeholders) {
1396 my $gs_path = $placeholder_to_gspath_map{$placeholder};
1397
1398 next if(!defined $gs_path); # sitehome for GS2 is undefined
1399
1400 if($path =~ m/^$placeholder/) {
1401 $path =~ s/^$placeholder/$gs_path/;
1402 last; # done
1403 }
1404 }
1405
1406 return $path;
1407}
1408
1409# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1410# a directory containing sequentially numbered images.
1411sub create_itemfile
1412{
1413 my ($output_dir, $convert_basename, $convert_to) = @_;
1414 my $page_num = "";
1415
1416 opendir(DIR, $output_dir) || die "can't opendir $output_dir: $!";
1417 my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1418 closedir DIR;
1419
1420 # Sort files in the directory by page_num
1421 sub page_number {
1422 my ($dir) = @_;
1423 my ($pagenum) =($dir =~ m/^.*?[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
1424# my ($pagenum) =($dir =~ m/(\d+)(\.(jpg|gif|png))?$/i); # this works but is not as safe/strict about input filepatterns as the above
1425
1426 $pagenum = 1 unless defined $pagenum;
1427 return $pagenum;
1428 }
1429
1430 # sort the files in the directory in the order of page_num rather than lexically.
1431 @dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1432
1433 # work out if the numbering of the now sorted image files starts at 0 or not
1434 # by checking the number of the first _image_ file (skipping item files)
1435 my $starts_at_0 = 0;
1436 my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1437 if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1438 $starts_at_0 = 1;
1439 }
1440
1441 my $item_file = &FileUtils::filenameConcatenate($output_dir, $convert_basename.".item");
1442 my $item_fh;
1443 &FileUtils::openFileHandle($item_file, 'w', \$item_fh);
1444 print $item_fh "<PagedDocument>\n";
1445
1446 foreach my $file (@dir_files){
1447 if ($file !~ /\.item/i){
1448 $page_num = page_number($file);
1449 $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1450 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1451 }
1452 }
1453
1454 print $item_fh "</PagedDocument>\n";
1455 &FileUtils::closeFileHandle($item_file, \$item_fh);
1456 return $item_file;
1457}
1458
1459
1460## @function augmentINC()
1461#
1462# Prepend a path (if it exists) onto INC but only if it isn't already in INC
1463# @param $new_path The path to add
1464# @author jmt12
1465#
1466sub augmentINC
1467{
1468 my ($new_path) = @_;
1469 my $did_add_path = 0;
1470 # might need to be replaced with FileUtils::directoryExists() call eventually
1471 if (-d $new_path)
1472 {
1473 my $did_find_path = 0;
1474 foreach my $existing_path (@INC)
1475 {
1476 if ($existing_path eq $new_path)
1477 {
1478 $did_find_path = 1;
1479 last;
1480 }
1481 }
1482 if (!$did_find_path)
1483 {
1484 unshift(@INC, $new_path);
1485 $did_add_path = 1;
1486 }
1487 }
1488 return $did_add_path;
1489}
1490## augmentINC()
1491
1492
14931;
Note: See TracBrowser for help on using the repository browser.