source: main/trunk/greenstone2/perllib/util.pm@ 27303

Last change on this file since 27303 was 27303, checked in by jmt12, 11 years ago

Replacing hardcoded additions to INC and PATH environment variables with conditional ones - this allows us to use the order of values in these variables for precedence, thus allows better support for extensions that override classifiers, plugins etc. ENV and PATH functions already exists in util, but augmentINC() is a new function

  • Property svn:keywords set to Author Date Id Revision
File size: 40.5 KB
Line 
1###########################################################################
2#
3# util.pm -- various useful utilities
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package util;
27
28use strict;
29
30use Encode;
31use File::Copy;
32use File::Basename;
33# Config for getting the perlpath in the recommended way, though it uses paths that are
34# hard-coded into the Config file that's generated upon configuring and compiling perl.
35# $^X works better in some cases to return the path to perl used to launch the script,
36# but if launched with plain "perl" (no full-path), that will be just what it returns.
37use Config;
38# New module for file related utility functions - intended as a
39# placeholder for an extension that allows a variety of different
40# filesystems (FTP, HTTP, SAMBA, WEBDav, HDFS etc)
41use FileUtils;
42
43# removes files (but not directories)
44sub rm {
45 warnings::warnif("deprecated", "util::rm() is deprecated, using FileUtils::removeFiles() instead");
46 return &FileUtils::removeFiles(@_);
47}
48
49# recursive removal
50sub filtered_rm_r {
51 warnings::warnif("deprecated", "util::filtered_rm_r() is deprecated, using FileUtils::removeFilesFiltered() instead");
52 return &FileUtils::removeFilesFiltered(@_);
53}
54
55# recursive removal
56sub rm_r {
57 warnings::warnif("deprecated", "util::rm_r() is deprecated, using FileUtils::recursiveRemoveFiles() instead");
58 return &FileUtils::removeFilesRecursive(@_);
59}
60
61# moves a file or a group of files
62sub mv {
63 warnings::warnif("deprecated", "util::mv() is deprecated, using FileUtils::moveFiles() instead");
64 return &FileUtils::moveFiles(@_);
65}
66
67# Move the contents of source directory into target directory
68# (as opposed to merely replacing target dir with the src dir)
69# This can overwrite any files with duplicate names in the target
70# but other files and folders in the target will continue to exist
71sub mv_dir_contents {
72 warnings::warnif("deprecated", "util::mv_dir_contents() is deprecated, using FileUtils::moveDirectoryContents() instead");
73 return &FileUtils::moveDirectoryContents(@_);
74}
75
76# copies a file or a group of files
77sub cp {
78 warnings::warnif("deprecated", "util::cp() is deprecated, using FileUtils::copyFiles() instead");
79 return &FileUtils::copyFiles(@_);
80}
81
82# recursively copies a file or group of files
83# syntax: cp_r (sourcefiles, destination directory)
84# destination must be a directory - to copy one file to
85# another use cp instead
86sub cp_r {
87 warnings::warnif("deprecated", "util::cp_r() is deprecated, using FileUtils::copyFilesrecursive() instead");
88 return &FileUtils::copyFilesRecursive(@_);
89}
90
91# recursively copies a file or group of files
92# syntax: cp_r (sourcefiles, destination directory)
93# destination must be a directory - to copy one file to
94# another use cp instead
95sub cp_r_nosvn {
96 warnings::warnif("deprecated", "util::cp_r_nosvn() is deprecated, using FileUtils::copyFilesRecursiveNoSVN() instead");
97 return &FileUtils::copyFilesRecursiveNoSVN(@_);
98}
99
100# copies a directory and its contents, excluding subdirectories, into a new directory
101sub cp_r_toplevel {
102 warnings::warnif("deprecated", "util::cp_r_toplevel() is deprecated, using FileUtils::recursiveCopyTopLevel() instead");
103 return &FileUtils::recursiveCopyTopLevel(@_);
104}
105
106sub mk_dir {
107 warnings::warnif("deprecated", "util::mk_dir() is deprecated, using FileUtils::makeDirectory() instead");
108 return &FileUtils::makeDirectory(@_);
109}
110
111# in case anyone cares - I did some testing (using perls Benchmark module)
112# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
113# slightly faster (surprisingly) - Stefan.
114sub mk_all_dir {
115 warnings::warnif("deprecated", "util::mk_all_dir() is deprecated, using FileUtils::makeAllDirectories() instead");
116 return &FileUtils::makeAllDirectories(@_);
117}
118
119# make hard link to file if supported by OS, otherwise copy the file
120sub hard_link {
121 warnings::warnif("deprecated", "util::hard_link() is deprecated, using FileUtils::hardLink() instead");
122 return &FileUtils::hardLink(@_);
123}
124
125# make soft link to file if supported by OS, otherwise copy file
126sub soft_link {
127 warnings::warnif("deprecated", "util::soft_link() is deprecated, using FileUtils::softLink() instead");
128 return &FileUtils::softLink(@_);
129}
130
131# Primarily for filenames generated by processing
132# content of HTML files (which are mapped to UTF-8 internally)
133#
134# To turn this into an octet string that really exists on the file
135# system:
136# 1. don't need to do anything special for Unix-based systems
137# (as underlying file system is byte-code)
138# 2. need to map to short DOS filenames for Windows
139
140sub utf8_to_real_filename
141{
142 my ($utf8_filename) = @_;
143
144 my $real_filename;
145
146 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
147 require Win32;
148
149 print STDERR "***** utf8 filename = $utf8_filename\n\n\n";
150
151 my $unicode_filename = decode("utf8",$utf8_filename);
152 $real_filename = Win32::GetShortPathName($unicode_filename);
153 }
154 else {
155 $real_filename = $utf8_filename;
156 }
157
158 return $real_filename;
159}
160
161sub fd_exists {
162 warnings::warnif("deprecated", "util::fd_exists() is deprecated, using FileUtils::fileTest() instead");
163 return &FileUtils::fileTest(@_);
164}
165
166sub file_exists {
167 warnings::warnif("deprecated", "util::file_exists() is deprecated, using FileUtils::fileExists() instead");
168 return &FileUtils::fileExists(@_);
169}
170
171sub dir_exists {
172 warnings::warnif("deprecated", "util::dir_exists() is deprecated, using FileUtils::directoryExists() instead");
173 return &FileUtils::directoryExists(@_);
174}
175
176# updates a copy of a directory in some other part of the filesystem
177# verbosity settings are: 0=low, 1=normal, 2=high
178# both $fromdir and $todir should be absolute paths
179sub cachedir {
180 warnings::warnif("deprecated", "util::cachedir() is deprecated, using FileUtils::synchronizeDirectories() instead");
181 return &FileUtils::synchronizeDirectories(@_);
182}
183
184# this function returns -1 if either file is not found
185# assumes that $file1 and $file2 are absolute file names or
186# in the current directory
187# $file2 is allowed to be newer than $file1
188sub differentfiles {
189 warnings::warnif("deprecated", "util::differentfiles() is deprecated, using FileUtils::differentFiles() instead");
190 return &FileUtils::differentFiles(@_);
191}
192
193
194sub get_tmp_filename
195{
196 my $file_ext = shift(@_) || undef;
197
198 my $opt_dot_file_ext = "";
199 if (defined $file_ext) {
200 if ($file_ext !~ m/\./) {
201 # no dot, so needs one added in at start
202 $opt_dot_file_ext = ".$file_ext"
203 }
204 else {
205 # allow for "extensions" such as _metadata.txt to be handled
206 # gracefully
207 $opt_dot_file_ext = $file_ext;
208 }
209 }
210
211 my $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
212 &mk_all_dir ($tmpdir) unless -e $tmpdir;
213
214 my $count = 1000;
215 my $rand = int(rand $count);
216 my $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
217
218 while (-e $full_tmp_filename) {
219 $rand = int(rand $count);
220 $full_tmp_filename = &FileUtils::filenameConcatenate($tmpdir, "F$rand$opt_dot_file_ext");
221 $count++;
222 }
223
224 return $full_tmp_filename;
225}
226
227sub get_timestamped_tmp_folder
228{
229
230 my $tmp_dirname;
231 if(defined $ENV{'GSDLCOLLECTDIR'}) {
232 $tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
233 } elsif(defined $ENV{'GSDLHOME'}) {
234 $tmp_dirname = $ENV{'GSDLHOME'};
235 } else {
236 return undef;
237 }
238
239 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "tmp");
240 &util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
241
242 # add the timestamp into the path otherwise we can run into problems
243 # if documents have the same name
244 my $timestamp = time;
245 my $time_tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, $timestamp);
246 $tmp_dirname = $time_tmp_dirname;
247 my $i = 1;
248 while (-e $tmp_dirname) {
249 $tmp_dirname = "$time_tmp_dirname$i";
250 $i++;
251 }
252 &util::mk_dir($tmp_dirname);
253
254 return $tmp_dirname;
255}
256
257sub get_timestamped_tmp_filename_in_collection
258{
259
260 my ($input_filename, $output_ext) = @_;
261 # derive tmp filename from input filename
262 my ($tailname, $dirname, $suffix)
263 = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
264
265 # softlink to collection tmp dir
266 my $tmp_dirname = &util::get_timestamped_tmp_folder();
267 $tmp_dirname = $dirname unless defined $tmp_dirname;
268
269 # following two steps copied from ConvertBinaryFile
270 # do we need them?? can't use them as is, as they use plugin methods.
271
272 #$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
273
274 # URLEncode this since htmls with images where the html filename is utf8 don't seem
275 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
276 # files on the filesystem.
277 #$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
278 if (defined $output_ext) {
279 $output_ext = ".$output_ext"; # add the dot
280 } else {
281 $output_ext = $suffix;
282 }
283 $output_ext= lc($output_ext);
284 my $tmp_filename = &FileUtils::filenameConcatenate($tmp_dirname, "$tailname$output_ext");
285
286 return $tmp_filename;
287}
288
289sub get_toplevel_tmp_dir
290{
291 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "tmp");
292}
293
294
295sub filename_to_regex {
296 my $filename = shift (@_);
297
298 # need to make single backslashes double so that regex works
299 $filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);
300
301 # note that the first part of a substitution is a regex, so RE chars need to be escaped,
302 # the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
303 $filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
304 $filename =~ s@\(@\\(@g; # escape brackets
305 $filename =~ s@\)@\\)@g; # escape brackets
306 $filename =~ s@\[@\\[@g; # escape brackets
307 $filename =~ s@\]@\\]@g; # escape brackets
308
309 return $filename;
310}
311
312sub unregex_filename {
313 my $filename = shift (@_);
314
315 # need to put doubled backslashes for regex back to single
316 $filename =~ s/\\\./\./g; # remove RE syntax for .
317 $filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
318 $filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
319 $filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
320 $filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
321
322 # \\ goes to \
323 # This is the last step in reverse mirroring the order of steps in filename_to_regex()
324 $filename =~ s/\\\\/\\/g; # remove RE syntax for \
325 return $filename;
326}
327
328sub filename_cat {
329 # I've disabled this warning for now, as every Greenstone perl
330 # script seems to make use of this function and so you drown in a
331 # sea of deprecated warnings [jmt12]
332# warnings::warnif("deprecated", "util::filename_cat() is deprecated, using FileUtils::filenameConcatenate() instead");
333 return &FileUtils::filenameConcatenate(@_);
334}
335
336
337sub pathname_cat {
338 my $first_path = shift(@_);
339 my (@pathnames) = @_;
340
341 # If first_path is not null or empty, then add it back into the list
342 if (defined $first_path && $first_path =~ /\S/) {
343 unshift(@pathnames, $first_path);
344 }
345
346 my $join_char;
347 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
348 $join_char = ";";
349 } else {
350 $join_char = ":";
351 }
352
353 my $pathname = join($join_char, @pathnames);
354
355 # remove duplicate slashes
356 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
357 $pathname =~ s/[\\\/]+/\\/g;
358 } else {
359 $pathname =~ s/[\/]+/\//g;
360 # DB: want a pathname abc\de.html to remain like this
361 }
362
363 return $pathname;
364}
365
366
367sub tidy_up_oid {
368 my ($OID) = @_;
369 if ($OID =~ /\./) {
370 print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
371 $OID =~ s/\.//g; #remove any periods
372 }
373 if ($OID =~ /^\s.*\s$/) {
374 print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
375 # remove starting and trailing whitespace
376 $OID =~ s/^\s+//;
377 $OID =~ s/\s+$//;
378 }
379 if ($OID =~ /^[\d]*$/) {
380 print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
381 $OID = "D" . $OID;
382 }
383
384 return $OID;
385}
386
387sub envvar_prepend {
388 my ($var,$val) = @_;
389
390 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
391 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
392
393 # do not prepend any value/path that's already in the environment variable
394
395 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
396 if (!defined($ENV{$var})) {
397 $ENV{$var} = "$val";
398 }
399 elsif($ENV{$var} !~ m/$escaped_val/) {
400 $ENV{$var} = "$val".$pathsep.$ENV{$var};
401 }
402}
403
404sub envvar_append {
405 my ($var,$val) = @_;
406
407 # 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
408 my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
409
410 # do not append any value/path that's already in the environment variable
411
412 my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
413 if (!defined($ENV{$var})) {
414 $ENV{$var} = "$val";
415 }
416 elsif($ENV{$var} !~ m/$escaped_val/) {
417 $ENV{$var} = $ENV{$var}.$pathsep."$val";
418 }
419}
420
421
422# splits a filename into a prefix and a tail extension using the tail_re, or
423# if that fails, splits on the file_extension . (dot)
424sub get_prefix_and_tail_by_regex {
425
426 my ($filename,$tail_re) = @_;
427
428 my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
429 if ((!defined $file_prefix) || (!defined $file_ext)) {
430 ($file_prefix,$file_ext) = ($filename =~ m/^(.*)(\..*?)$/);
431 }
432
433 return ($file_prefix,$file_ext);
434}
435
436# get full path and file only path from a base_dir (which may be empty) and
437# file (which may contain directories)
438sub get_full_filenames {
439 my ($base_dir, $file) = @_;
440
441 my $filename_full_path = $file;
442 # add on directory if present
443 $filename_full_path = &FileUtils::filenameConcatenate($base_dir, $file) if $base_dir =~ /\S/;
444
445 my $filename_no_path = $file;
446
447 # remove directory if present
448 $filename_no_path =~ s/^.*[\/\\]//;
449 return ($filename_full_path, $filename_no_path);
450}
451
452# returns the path of a file without the filename -- ie. the directory the file is in
453sub filename_head {
454 my $filename = shift(@_);
455
456 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
457 $filename =~ s/[^\\\\]*$//;
458 }
459 else {
460 $filename =~ s/[^\\\/]*$//;
461 }
462
463 return $filename;
464}
465
466
467
468# returns 1 if filename1 and filename2 point to the same
469# file or directory
470sub filenames_equal {
471 my ($filename1, $filename2) = @_;
472
473 # use filename_cat to clean up trailing slashes and
474 # multiple slashes
475 $filename1 = &FileUtils::filenameConcatenate($filename1);
476 $filename2 = &FileUtils::filenameConcatenate($filename2);
477
478 # filenames not case sensitive on windows
479 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
480 $filename1 =~ tr/[A-Z]/[a-z]/;
481 $filename2 =~ tr/[A-Z]/[a-z]/;
482 }
483 return 1 if $filename1 eq $filename2;
484 return 0;
485}
486
487# If filename is relative to within_dir, returns the relative path of filename to that directory
488# with slashes in the filename returned as they were in the original (absolute) filename.
489sub filename_within_directory
490{
491 my ($filename,$within_dir) = @_;
492
493 if ($within_dir !~ m/[\/\\]$/) {
494 my $dirsep = &util::get_dirsep();
495 $within_dir .= $dirsep;
496 }
497
498 $within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets
499 if ($filename =~ m/^$within_dir(.*)$/) {
500 $filename = $1;
501 }
502
503 return $filename;
504}
505
506# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
507# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
508# The subpath returned will also be a URL type filename.
509sub filename_within_directory_url_format
510{
511 my ($filename,$within_dir) = @_;
512
513 # convert parameters only to / slashes if Windows
514
515 my $filename_urlformat = &filepath_to_url_format($filename);
516 my $within_dir_urlformat = &filepath_to_url_format($within_dir);
517
518 #if ($within_dir_urlformat !~ m/\/$/) {
519 # make sure directory ends with a slash
520 #$within_dir_urlformat .= "/";
521 #}
522
523 my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
524
525 #print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
526
527 # dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
528 if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)*(.*)$/) {
529 $filename_urlformat = $1;
530 }
531
532 return $filename_urlformat;
533}
534
535# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
536# since on Linux it doesn't represent a file separator but an escape char).
537sub filepath_to_url_format
538{
539 my ($filepath) = @_;
540 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
541 # Only need to worry about Windows, as Unix style directories already in url-format
542 # Convert Windows style \ => /
543 $filepath =~ s@\\@/@g;
544 }
545 return $filepath;
546}
547
548# regex filepaths on windows may include \\ as path separator. Convert \\ to /
549sub filepath_regex_to_url_format
550{
551 my ($filepath) = @_;
552 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
553 # Only need to worry about Windows, as Unix style directories already in url-format
554 # Convert Windows style \\ => /
555 $filepath =~ s@\\\\@/@g;
556 }
557 return $filepath;
558
559}
560
561# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
562# and ignores trailing /
563# returns (file, dirs) dirs will be empty if no subdirs
564sub url_fileparse
565{
566 my ($filepath) = @_;
567 # remove trailing /
568 $filepath =~ s@/$@@;
569 if ($filepath !~ m@/@) {
570 return ($filepath, "");
571 }
572 my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
573 return ($file, $dirs);
574
575}
576
577
578sub filename_within_collection
579{
580 my ($filename) = @_;
581
582 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
583
584 if (defined $collect_dir) {
585
586 # if from within GSDLCOLLECTDIR, then remove directory prefix
587 # so source_filename is realative to it. This is done to aid
588 # portability, i.e. the collection can be moved to somewhere
589 # else on the file system and the archives directory will still
590 # work. This is needed, for example in the applet version of
591 # GLI where GSDLHOME/collect on the server will be different to
592 # the collect directory of the remove user. Of course,
593 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
594 # it back into a full pathname.
595
596 $filename = filename_within_directory($filename,$collect_dir);
597 }
598
599 return $filename;
600}
601
602sub prettyprint_file
603{
604 my ($base_dir,$file,$gli) = @_;
605
606 my $filename_full_path = &FileUtils::filenameConcatenate($base_dir,$file);
607
608 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
609 require Win32;
610
611 # For some reason base_dir in the form c:/a/b/c
612 # This leads to confusion later on, so turn it back into
613 # the more usual Windows form
614 $base_dir =~ s/\//\\/g;
615 my $long_base_dir = Win32::GetLongPathName($base_dir);
616 my $long_full_path = Win32::GetLongPathName($filename_full_path);
617
618 $file = filename_within_directory($long_full_path,$long_base_dir);
619 $file = encode("utf8",$file) if ($gli);
620 }
621
622 return $file;
623}
624
625
626sub upgrade_if_dos_filename
627{
628 my ($filename_full_path,$and_encode) = @_;
629
630 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
631 # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
632 # to its long (Windows) version
633 my $long_filename = Win32::GetLongPathName($filename_full_path);
634 if (defined $long_filename) {
635 $filename_full_path = $long_filename;
636 }
637 # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
638 $filename_full_path =~ s/^(.):/\u$1:/;
639 if ((defined $and_encode) && ($and_encode)) {
640 $filename_full_path = encode("utf8",$filename_full_path);
641 }
642 }
643
644 return $filename_full_path;
645}
646
647
648sub downgrade_if_dos_filename
649{
650 my ($filename_full_path) = @_;
651
652 if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
653 require Win32;
654
655 # Ensure the given long Windows filename is in a form that can
656 # be opened by Perl => convert it to a short DOS-like filename
657
658 my $short_filename = Win32::GetShortPathName($filename_full_path);
659 if (defined $short_filename) {
660 $filename_full_path = $short_filename;
661 }
662 # Make sure initial drive letter is lower-case (to fit in
663 # with rest of Greenstone)
664 $filename_full_path =~ s/^(.):/\u$1:/;
665 }
666
667 return $filename_full_path;
668}
669
670sub block_filename
671{
672 my ($block_hash,$filename) = @_;
673
674 if ($ENV{'GSDLOS'} =~ m/^windows$/) {
675
676 # lower case the entire thing, eg for cover.jpg when its actually cover.JPG
677 my $lower_filename = lc($filename);
678 $block_hash->{'file_blocks'}->{$lower_filename} = 1;
679# my $lower_drive = $filename;
680# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
681
682# my $upper_drive = $filename;
683# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
684#
685# $block_hash->{'file_blocks'}->{$lower_drive} = 1;
686# $block_hash->{'file_blocks'}->{$upper_drive} = 1;
687 }
688 else {
689 $block_hash->{'file_blocks'}->{$filename} = 1;
690 }
691}
692
693
694sub filename_is_absolute
695{
696 warnings::warnif("deprecated", "util::filename_is_absolute() is deprecated, using FileUtils::isFilenameAbsolute() instead");
697 return &FileUtils::isFilenameAbsolute(@_);
698}
699
700
701## @method make_absolute()
702#
703# Ensure the given file path is absolute in respect to the given base path.
704#
705# @param $base_dir A string denoting the base path the given dir must be
706# absolute to.
707# @param $dir The directory to be made absolute as a string. Note that the
708# dir may already be absolute, in which case it will remain
709# unchanged.
710# @return The now absolute form of the directory as a string.
711#
712# @author John Thompson, DL Consulting Ltd.
713# @copy 2006 DL Consulting Ltd.
714#
715#used in buildcol.pl, doesn't work for all cases --kjdon
716sub make_absolute {
717
718 my ($base_dir, $dir) = @_;
719### print STDERR "dir = $dir\n";
720 $dir =~ s/[\\\/]+/\//g;
721 $dir = $base_dir . "/$dir" unless ($dir =~ m|^(\w:)?/|);
722 $dir =~ s|^/tmp_mnt||;
723 1 while($dir =~ s|/[^/]*/\.\./|/|g);
724 $dir =~ s|/[.][.]?/|/|g;
725 $dir =~ tr|/|/|s;
726### print STDERR "dir = $dir\n";
727
728 return $dir;
729}
730## make_absolute() ##
731
732sub get_dirsep {
733
734 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
735 return "\\";
736 } else {
737 return "\/";
738 }
739}
740
741sub get_os_dirsep {
742
743 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
744 return "\\\\";
745 } else {
746 return "\\\/";
747 }
748}
749
750sub get_re_dirsep {
751
752 return "\\\\|\\\/";
753}
754
755
756sub get_dirsep_tail {
757 my ($filename) = @_;
758
759 # returns last part of directory or filename
760 # On unix e.g. a/b.d => b.d
761 # a/b/c => c
762
763 my $dirsep = get_re_dirsep();
764 my @dirs = split (/$dirsep/, $filename);
765 my $tail = pop @dirs;
766
767 # - caused problems under windows
768 #my ($tail) = ($filename =~ m/^(?:.*?$dirsep)?(.*?)$/);
769
770 return $tail;
771}
772
773
774# if this is running on windows we want binaries to end in
775# .exe, otherwise they don't have to end in any extension
776sub get_os_exe {
777 return ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
778 return "";
779}
780
781
782# test to see whether this is a big or little endian machine
783sub is_little_endian
784{
785 # To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
786 # If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
787 # Otherwise, it's little endian
788
789 #return 0 if $^O =~ /^darwin$/i;
790 #return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
791
792 # Going back to stating exactly whether the machine is little endian
793 # or big endian, without any special case for Macs. Since for rata it comes
794 # back with little endian and for shuttle with bigendian.
795 return (ord(substr(pack("s",1), 0, 1)) == 1);
796}
797
798
799# will return the collection name if successful, "" otherwise
800sub use_collection {
801 my ($collection, $collectdir) = @_;
802
803 if (!defined $collectdir || $collectdir eq "") {
804 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "collect");
805 }
806
807 # get and check the collection
808 if (!defined($collection) || $collection eq "") {
809 if (defined $ENV{'GSDLCOLLECTION'}) {
810 $collection = $ENV{'GSDLCOLLECTION'};
811 } else {
812 print STDOUT "No collection specified\n";
813 return "";
814 }
815 }
816
817 if ($collection eq "modelcol") {
818 print STDOUT "You can't use modelcol.\n";
819 return "";
820 }
821
822 # make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
823 # are defined
824 $ENV{'GSDLCOLLECTION'} = $collection;
825 $ENV{'GSDLCOLLECTDIR'} = &FileUtils::filenameConcatenate($collectdir, $collection);
826
827 # make sure this collection exists
828 if (!-e $ENV{'GSDLCOLLECTDIR'}) {
829 print STDOUT "Invalid collection ($collection).\n";
830 return "";
831 }
832
833 # everything is ready to go
834 return $collection;
835}
836
837sub get_current_collection_name {
838 return $ENV{'GSDLCOLLECTION'};
839}
840
841
842# will return the collection name if successful, "" otherwise.
843# Like use_collection (above) but for greenstone 3 (taking account of site level)
844
845sub use_site_collection {
846 my ($site, $collection, $collectdir) = @_;
847
848 if (!defined $collectdir || $collectdir eq "") {
849 die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
850 $collectdir = &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'}, "sites", $site, "collect");
851 }
852
853 # collectdir explicitly set by this point (using $site variable if required).
854 # Can call "old" gsdl2 use_collection now.
855
856 return use_collection($collection,$collectdir);
857}
858
859
860
861sub locate_config_file
862{
863 my ($file) = @_;
864
865 my $locations = locate_config_files($file);
866
867 return shift @$locations; # returns undef if 'locations' is empty
868}
869
870
871sub locate_config_files
872{
873 my ($file) = @_;
874
875 my @locations = ();
876
877 if (-e $file) {
878 # Clearly specified (most likely full filename)
879 # No need to hunt in 'etc' directories, return value unchanged
880 push(@locations,$file);
881 }
882 else {
883 # Check for collection specific one before looking in global GSDL 'etc'
884 if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
885 my $test_collect_etc_filename
886 = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},"etc", $file);
887
888 if (-e $test_collect_etc_filename) {
889 push(@locations,$test_collect_etc_filename);
890 }
891 }
892 my $test_main_etc_filename
893 = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"etc", $file);
894 if (-e $test_main_etc_filename) {
895 push(@locations,$test_main_etc_filename);
896 }
897 }
898
899 return \@locations;
900}
901
902
903sub hyperlink_text
904{
905 my ($text) = @_;
906
907 $text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
908 $text =~ s/(^|\s+)(www\.(\w|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
909
910 return $text;
911}
912
913
914# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
915# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
916sub is_dir_empty {
917 warnings::warnif("deprecated", "util::is_dir_empty() is deprecated, using FileUtils::isDirectoryEmpty() instead");
918 return &FileUtils::isDirectoryEmpty(@_);
919}
920
921# Returns the given filename converted using either URL encoding or base64
922# encoding, as specified by $rename_method. If the given filename has no suffix
923# (if it is just the tailname), then $no_suffix should be some defined value.
924# rename_method can be url, none, base64
925sub rename_file {
926 my ($filename, $rename_method, $no_suffix) = @_;
927
928 if(!$filename) { # undefined or empty string
929 return $filename;
930 }
931
932 if (!$rename_method) {
933 print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
934 # Debugging information
935 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
936 # print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
937 $rename_method = "url";
938 } elsif($rename_method eq "none") {
939 return $filename; # would have already been renamed
940 }
941
942 # No longer replace spaces with underscores, since underscores mess with incremental rebuild
943 ### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
944 ###$filename =~ s/ /_/g;
945
946 my ($tailname,$dirname,$suffix);
947 if($no_suffix) { # given a tailname, no suffix
948 ($tailname,$dirname) = File::Basename::fileparse($filename);
949 }
950 else {
951 ($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
952 }
953 if (!$suffix) {
954 $suffix = "";
955 }
956 # This breaks GLI matching extracted metadata to files in Enrich panel, as
957 # original is eg .JPG while gsdlsourcefilename ends up .jpg
958 # Not sure why it was done in first place...
959 #else {
960 #$suffix = lc($suffix);
961 #}
962
963 if ($rename_method eq "url") {
964 $tailname = &unicode::url_encode($tailname);
965 }
966 elsif ($rename_method eq "base64") {
967 $tailname = &unicode::base64_encode($tailname);
968 $tailname =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
969 }
970
971 $filename = "$tailname$suffix";
972 $filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
973
974 return $filename;
975}
976
977
978# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
979sub rename_ldb_or_bdb_file {
980 my ($filename_no_ext) = @_;
981
982 my $new_filename = "$filename_no_ext.gdb";
983 return if (-f $new_filename); # if the file has the right extension, don't need to do anything
984 # try ldb
985 my $old_filename = "$filename_no_ext.ldb";
986
987 if (-f $old_filename) {
988 print STDERR "Renaming $old_filename to $new_filename\n";
989 rename ($old_filename, $new_filename)
990 || print STDERR "Rename failed: $!\n";
991 return;
992 }
993 # try bdb
994 $old_filename = "$filename_no_ext.bdb";
995 if (-f $old_filename) {
996 print STDERR "Renaming $old_filename to $new_filename\n";
997 rename ($old_filename, $new_filename)
998 || print STDERR "Rename failed: $!\n";
999 return;
1000 }
1001}
1002
1003sub os_dir() {
1004
1005 my $gsdlarch = "";
1006 if(defined $ENV{'GSDLARCH'}) {
1007 $gsdlarch = $ENV{'GSDLARCH'};
1008 }
1009 return $ENV{'GSDLOS'}.$gsdlarch;
1010}
1011
1012# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1013# By default, /greenstone3 for GS3 or /greenstone for GS2.
1014sub get_greenstone_url_prefix() {
1015 # if already set on a previous occasion, just return that
1016 # (Don't want to keep repeating this: cost of re-opening and scanning files.)
1017 return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1018
1019 my ($configfile, $urlprefix, $defaultUrlprefix);
1020 my @propertynames = ();
1021
1022 if($ENV{'GSDL3SRCHOME'}) {
1023 $defaultUrlprefix = "/greenstone3";
1024 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1025 push(@propertynames, qw/path\s*\=/);
1026 } else {
1027 $defaultUrlprefix = "/greenstone";
1028 $configfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1029 push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1030 }
1031
1032 $urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1033
1034 if(!$urlprefix) { # no values found for URL prefix, use default values
1035 $urlprefix = $defaultUrlprefix;
1036 } else {
1037 #gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1038 $urlprefix =~ s/^\///; # remove the starting slash
1039 my @dirs = split(/(\\|\/)/, $urlprefix);
1040 $urlprefix = shift(@dirs);
1041
1042 if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1043 $urlprefix = "/$urlprefix";
1044 }
1045 }
1046
1047 # set for the future
1048 $ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1049# print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1050 return $urlprefix;
1051}
1052
1053
1054# Given a config file (xml or java properties file) and a list/array of regular expressions
1055# that represent property names to match on, this function will return the value for the 1st
1056# matching property name. If the return value is undefined, no matching property was found.
1057sub extract_propvalue_from_file() {
1058 my ($configfile, $propertynames) = @_;
1059
1060 my $value;
1061 unless(open(FIN, "<$configfile")) {
1062 print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1063 return $value; # not initialised
1064 }
1065
1066 # Read the entire file at once, as one single line, then close it
1067 my $filecontents;
1068 {
1069 local $/ = undef;
1070 $filecontents = <FIN>;
1071 }
1072 close(FIN);
1073
1074 foreach my $regex (@$propertynames) {
1075 ($value) = $filecontents=~ m/$regex\s*(\S*)/s; # read value of the property given by regex up to the 1st space
1076 if($value) {
1077 $value =~ s/^\"//; # remove any startquotes
1078 $value =~ s/\".*$//; # remove the 1st endquotes (if any) followed by any xml
1079 last; # found value for a matching property, break from loop
1080 }
1081 }
1082
1083 return $value;
1084}
1085
1086# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1087# given that perllib is in @INC in order to invoke this subroutine.
1088# Call as follows -- after setting up INC to include perllib and
1089# after setting up GSDLHOME and GSDLOS:
1090#
1091# require util;
1092# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1093#
1094sub setup_greenstone_env() {
1095 my ($GSDLHOME, $GSDLOS) = @_;
1096
1097 #my %env_map = ();
1098 # Get the localised ENV settings of running a localised source setup.bash
1099 # and put it into the ENV here. Need to clear GSDLHOME before running setup
1100 #my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1101 my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";
1102 if($GSDLOS =~ m/windows/i) {
1103 #$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1104 $perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1105 }
1106 if (!open(PIN, "$perl_command |")) {
1107 print STDERR ("Unable to execute command: $perl_command. $!\n");
1108 }
1109
1110 while (defined (my $perl_output_line = <PIN>)) {
1111 my($key,$value) = ($perl_output_line =~ m/^([^=]*)[=](.*)$/);
1112 #$env_map{$key}=$value;
1113 $ENV{$key}=$value;
1114 }
1115 close (PIN);
1116
1117 # If any keys in $ENV don't occur in Greenstone's localised env
1118 # (stored in $env_map), delete those entries from $ENV
1119 #foreach $key (keys %ENV) {
1120 # if(!defined $env_map{$key}) {
1121 # print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{'$key'}\n";
1122 # delete $ENV{$key}; # del $ENV(key, value) pair
1123 # }
1124 #}
1125 #undef %env_map;
1126}
1127
1128sub get_perl_exec() {
1129 my $perl_exec = $^X; # may return just "perl"
1130
1131 if($ENV{'PERLPATH'}) {
1132 # OR: # $perl_exec = &FileUtils::filenameConcatenate($ENV{'PERLPATH'},"perl");
1133 if($ENV{'GSDLOS'} =~ m/windows/) {
1134 $perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1135 } else {
1136 $perl_exec = "$ENV{'PERLPATH'}/perl";
1137 }
1138 } else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1139 # containing the full path to the current perl executable we're using
1140 $perl_exec = $Config{perlpath}; # configured path for perl
1141 if (!-e $perl_exec) { # may not point to location on this machine
1142 $perl_exec = $^X; # may return just "perl"
1143 if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1144 print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";
1145 }
1146 }
1147 }
1148
1149 return $perl_exec;
1150}
1151
1152# returns the path to the java command in the JRE included with GS (if any),
1153# quoted to safeguard any spaces in this path, otherwise a simple java
1154# command is returned which assumes and will try for a system java.
1155sub get_java_command {
1156 my $java = "java";
1157 if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1158 # after running setup.bat or from GLI which also runs setup.bat
1159 my $java_bin = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"packages","jre","bin");
1160 if(-d $java_bin) {
1161 $java = &FileUtils::filenameConcatenate($java_bin,"java");
1162 $java = "\"".$java."\""; # quoted to preserve spaces in path
1163 }
1164 }
1165 return $java;
1166}
1167
1168
1169# Given the qualified collection name (colgroup/collection),
1170# returns the collection and colgroup parts
1171sub get_collection_parts {
1172 # http://perldoc.perl.org/File/Basename.html
1173 # my($filename, $directories, $suffix) = fileparse($path);
1174 # "$directories contains everything up to and including the last directory separator in the $path
1175 # including the volume (if applicable). The remainder of the $path is the $filename."
1176 #my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);
1177
1178 my $qualified_collection = shift(@_);
1179
1180 # Since activate.pl can be launched from the command-line, including by a user,
1181 # best not to assume colgroup uses URL-style slashes as would be the case with GLI
1182 # Also allow for the accidental inclusion of multiple slashes
1183 my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1184
1185 if(!defined $collection) {
1186 $collection = $colgroup;
1187 $colgroup = "";
1188 }
1189 return ($collection, $colgroup);
1190}
1191
1192# work out the "collectdir/collection" location
1193sub resolve_collection_dir {
1194 my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1195
1196 my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);
1197
1198 if (defined $collect_dir && $collect_dir) { # ensure not empty string either
1199 return &FileUtils::filenameConcatenate($collect_dir,$colgroup, $collection);
1200 }
1201 elsif (defined($ENV{'GSDLCOLLECTDIR'})) {
1202 return $ENV{'GSDLCOLLECTDIR'};
1203 }
1204 else {
1205 if (defined $site) {
1206 return &FileUtils::filenameConcatenate($ENV{'GSDL3HOME'},"sites",$site,"collect",$colgroup, $collection);
1207 }
1208 else {
1209 return &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"collect",$colgroup, $collection);
1210 }
1211 }
1212}
1213
1214# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1215# a directory containing sequentially numbered images.
1216sub create_itemfile
1217{
1218 my ($output_dir, $convert_basename, $convert_to) = @_;
1219 my $page_num = "";
1220
1221 opendir(DIR, $output_dir) || die "can't opendir $output_dir: $!";
1222 my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1223 closedir DIR;
1224
1225 # Sort files in the directory by page_num
1226 sub page_number {
1227 my ($dir) = @_;
1228 my ($pagenum) =($dir =~ m/^.*[-\.]?(\d+)(\.(jpg|gif|png))?$/i);
1229
1230 $pagenum = 1 unless defined $pagenum;
1231 return $pagenum;
1232 }
1233
1234 # sort the files in the directory in the order of page_num rather than lexically.
1235 @dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1236
1237 # work out if the numbering of the now sorted image files starts at 0 or not
1238 # by checking the number of the first _image_ file (skipping item files)
1239 my $starts_at_0 = 0;
1240 my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1241 if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1242 $starts_at_0 = 1;
1243 }
1244
1245 my $item_file = &FileUtils::filenameConcatenate($output_dir, $convert_basename.".item");
1246 my $item_fh;
1247 &FileUtils::openFileHandle($item_file, 'w', \$item_fh);
1248 print $item_fh "<PagedDocument>\n";
1249
1250 foreach my $file (@dir_files){
1251 if ($file !~ /\.item/i){
1252 $page_num = page_number($file);
1253 $page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1254 print $item_fh " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1255 }
1256 }
1257
1258 print $item_fh "</PagedDocument>\n";
1259 &FileUtils::closeFileHandle($item_file, \$item_fh);
1260 return $item_file;
1261}
1262
1263
1264# /** @function augmentINC()
1265# * Prepend a path (if it exists) onto INC but only if it isn't already in
1266# * INC
1267# * @param $new_path The path to add as necessary
1268# * @author jmt12
1269# */
1270sub augmentINC
1271{
1272 my ($new_path) = @_;
1273 my $did_add_path = 0;
1274 # will need to be replaced with FileUtils::directoryExists() call eventually
1275 if (-d $new_path)
1276 {
1277 my $did_find_path = 0;
1278 foreach my $existing_path (@INC)
1279 {
1280 if ($existing_path eq $new_path)
1281 {
1282 $did_find_path = 1;
1283 last;
1284 }
1285 }
1286 if (!$did_find_path)
1287 {
1288 unshift(@INC, $new_path);
1289 $did_add_path = 1;
1290 }
1291 }
1292 return $did_add_path;
1293}
1294# /** augmentINC($new_path) **/
1295
12961;
Note: See TracBrowser for help on using the repository browser.