Context Navigation

source: main/trunk/greenstone2/perllib/util.pm@ 26973

Last change on this file since 26973 was 26973, checked in by kjdon, 11 years ago
don't lowercase the suffix in rename_file - if the original was uppercase, eg .JPG then it won't match .jpg when GLI reads through archive files to find metadata. David can't remember why he did it in the first place. I think there must have been a reason, so maybe it will show up again one day...
Property svn:keywords set to `Author Date Id Revision`
File size: 54.9 KB

Line
1	###########################################################################
2	#
3	# util.pm -- various useful utilities
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package util;
27
28	use strict;
29
30	use Encode;
31	use File::Copy;
32	use File::Basename;
33	# Config for getting the perlpath in the recommended way, though it uses paths that are
34	# hard-coded into the Config file that's generated upon configuring and compiling perl.
35	# $^X works better in some cases to return the path to perl used to launch the script,
36	# but if launched with plain "perl" (no full-path), that will be just what it returns.
37	use Config;
38
39	# removes files (but not directories)
40	sub rm {
41	my (@files) = @_;
42
43	my @filefiles = ();
44
45	# make sure the files we want to delete exist
46	# and are regular files
47	foreach my $file (@files) {
48	if (!-e $file) {
49	print STDERR "util::rm $file does not exist\n";
50	} elsif ((!-f $file) && (!-l $file)) {
51	print STDERR "util::rm $file is not a regular (or symbolic) file\n";
52	} else {
53	push (@filefiles, $file);
54	}
55	}
56
57	# remove the files
58	my $numremoved = unlink @filefiles;
59
60	# check to make sure all of them were removed
61	if ($numremoved != scalar(@filefiles)) {
62	print STDERR "util::rm Not all files were removed\n";
63	}
64	}
65
66	# removes files (but not directories) - can rename this to the default
67	# "rm" subroutine when debugging the deletion of individual files.
68	sub rm_debug {
69	my (@files) = @_;
70	my @filefiles = ();
71
72	# make sure the files we want to delete exist
73	# and are regular files
74	foreach my $file (@files) {
75	if (!-e $file) {
76	print STDERR "util::rm $file does not exist\n";
77	} elsif ((!-f $file) && (!-l $file)) {
78	print STDERR "util::rm $file is not a regular (or symbolic) file\n";
79	} else { # debug message
80	unlink($file) or warn "Could not delete file $file: $!\n";
81	}
82	}
83	}
84
85
86	# recursive removal
87	sub filtered_rm_r {
88	my ($files,$file_accept_re,$file_reject_re) = @_;
89
90	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(2);
91	# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
92	# print STDERR "** Calling method (2): $lcfilename:$cline $cpackage->$csubr\n";
93
94	my @files_array = (ref $files eq "ARRAY") ? @$files : ($files);
95
96	# recursively remove the files
97	foreach my $file (@files_array) {
98	$file =~ s/[\/\\]+$//; # remove trailing slashes
99
100	if (!-e $file) {
101	print STDERR "util::filtered_rm_r $file does not exist\n";
102
103	} elsif ((-d $file) && (!-l $file)) { # don't recurse down symbolic link
104	# get the contents of this directory
105	if (!opendir (INDIR, $file)) {
106	print STDERR "util::filtered_rm_r could not open directory $file\n";
107	} else {
108	my @filedir = grep (!/^\.\.?$/, readdir (INDIR));
109	closedir (INDIR);
110
111	# remove all the files in this directory
112	map {$_="$file/$_";} @filedir;
113	&filtered_rm_r (\@filedir,$file_accept_re,$file_reject_re);
114
115	if (!defined $file_accept_re && !defined $file_reject_re) {
116	# remove this directory
117	if (!rmdir $file) {
118	print STDERR "util::filtered_rm_r couldn't remove directory $file\n";
119	}
120	}
121	}
122	} else {
123	next if (defined $file_reject_re && ($file =~ m/$file_reject_re/));
124
125	if ((!defined $file_accept_re) \|\| ($file =~ m/$file_accept_re/)) {
126	# remove this file
127	&rm ($file);
128	}
129	}
130	}
131	}
132
133
134	# recursive removal
135	sub rm_r {
136	my (@files) = @_;
137
138	# use the more general (but reterospectively written function
139	# filtered_rm_r function()
140
141	filtered_rm_r(\@files,undef,undef); # no accept or reject expressions
142	}
143
144
145
146
147	# moves a file or a group of files
148	sub mv {
149	my $dest = pop (@_);
150	my (@srcfiles) = @_;
151
152	# remove trailing slashes from source and destination files
153	$dest =~ s/[\\\/]+$//;
154	map {$_ =~ s/[\\\/]+$//;} @srcfiles;
155
156	# a few sanity checks
157	if (scalar (@srcfiles) == 0) {
158	print STDERR "util::mv no destination directory given\n";
159	return;
160	} elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) {
161	print STDERR "util::mv if multiple source files are given the ".
162	"destination must be a directory\n";
163	return;
164	}
165
166	# move the files
167	foreach my $file (@srcfiles) {
168	my $tempdest = $dest;
169	if (-d $tempdest) {
170	my ($filename) = $file =~ /([^\\\/]+)$/;
171	$tempdest .= "/$filename";
172	}
173	if (!-e $file) {
174	print STDERR "util::mv $file does not exist\n";
175	} else {
176	if(!rename ($file, $tempdest)) {
177	print STDERR "**** Failed to rename $file to $tempdest\n";
178	&File::Copy::copy($file, $tempdest);
179	&rm($file);
180	}
181	elsif(-e $file) { # rename (partially) succeeded) but srcfile still exists after rename
182	#print STDERR "*** srcfile $file still exists after rename to $tempdest\n";
183	if(!-e $tempdest) {
184	print STDERR "@@@@ ERROR: $tempdest does not exist\n";
185	}
186	# Sometimes the rename operation fails (as does File::Copy::move).
187	# This turns out to be because the files are hardlinked.
188	# Need to do a copy-delete in this case, however, the copy step is not necessary:
189	# the srcfile got renamed into tempdest, but srcfile itself still exists, delete it.
190	#&File::Copy::copy($file, $tempdest);
191
192	&rm($file);
193	}
194	}
195	}
196	}
197
198	# Move the contents of source directory into target directory
199	# (as opposed to merely replacing target dir with the src dir)
200	# This can overwrite any files with duplicate names in the target
201	# but other files and folders in the target will continue to exist
202	sub mv_dir_contents {
203	my ($src_dir, $dest_dir) = @_;
204
205	# Obtain listing of all files within src_dir
206	# Note that readdir lists relative paths, as well as . and ..
207	opendir(DIR, "$src_dir");
208	my @files= readdir(DIR);
209	close(DIR);
210
211	my @full_path_files = ();
212	foreach my $file (@files) {
213	# process all except . and ..
214	unless($file eq "." \|\| $file eq "..") {
215
216	my $dest_subdir = &filename_cat($dest_dir, $file); # $file is still a relative path
217
218	# construct absolute paths
219	$file = &filename_cat($src_dir, $file); # $file is now an absolute path
220
221	# Recurse on directories which have an equivalent in target dest_dir
222	# If $file is a directory that already exists in target $dest_dir,
223	# then a simple move operation will fail (definitely on Windows).
224	if(-d $file && -d $dest_subdir) {
225	#print STDERR "**** $file is a directory also existing in target, its contents to be copied to $dest_subdir\n";
226	&mv_dir_contents($file, $dest_subdir);
227
228	# now all content is moved across, delete empty dir in source folder
229	if(&is_dir_empty($file)) {
230	if (!rmdir $file) {
231	print STDERR "ERROR. util::mv_dir_contents couldn't remove directory $file\n";
232	}
233	} else { # error
234	print STDERR "ERROR. util::mv_dir_contents: subfolder $file still non-empty after moving contents to $dest_subdir\n";
235	}
236	} else { # process files and any directories that don't already exist with a simple move
237	push(@full_path_files, $file);
238	}
239	}
240	}
241
242	if(!&dir_exists($dest_dir)) { # create target toplevel folder or subfolders if they don't exist
243	&mk_dir($dest_dir);
244	}
245
246	#print STDERR "@@@@@ Copying files \|".join(",", @full_path_files)."\| to: $dest_dir\n";
247
248	if(@full_path_files) { # if non-empty, there's something to copy across
249	&mv(@full_path_files, $dest_dir);
250	}
251	}
252
253
254	# copies a file or a group of files
255	sub cp {
256	my $dest = pop (@_);
257	my (@srcfiles) = @_;
258
259	# remove trailing slashes from source and destination files
260	$dest =~ s/[\\\/]+$//;
261	map {$_ =~ s/[\\\/]+$//;} @srcfiles;
262
263	# a few sanity checks
264	if (scalar (@srcfiles) == 0) {
265	print STDERR "util::cp no destination directory given\n";
266	return;
267	} elsif ((scalar (@srcfiles) > 1) && (!-d $dest)) {
268	print STDERR "util::cp if multiple source files are given the ".
269	"destination must be a directory\n";
270	return;
271	}
272
273	# copy the files
274	foreach my $file (@srcfiles) {
275	my $tempdest = $dest;
276	if (-d $tempdest) {
277	my ($filename) = $file =~ /([^\\\/]+)$/;
278	$tempdest .= "/$filename";
279	}
280	if (!-e $file) {
281	print STDERR "util::cp $file does not exist\n";
282	} elsif (!-f $file) {
283	print STDERR "util::cp $file is not a plain file\n";
284	} else {
285	&File::Copy::copy ($file, $tempdest);
286	}
287	}
288	}
289
290
291
292	# recursively copies a file or group of files
293	# syntax: cp_r (sourcefiles, destination directory)
294	# destination must be a directory - to copy one file to
295	# another use cp instead
296	sub cp_r {
297	my $dest = pop (@_);
298	my (@srcfiles) = @_;
299
300	# a few sanity checks
301	if (scalar (@srcfiles) == 0) {
302	print STDERR "util::cp_r no destination directory given\n";
303	return;
304	} elsif (-f $dest) {
305	print STDERR "util::cp_r destination must be a directory\n";
306	return;
307	}
308
309	# create destination directory if it doesn't exist already
310	if (! -d $dest) {
311	my $store_umask = umask(0002);
312	mkdir ($dest, 0777);
313	umask($store_umask);
314	}
315
316	# copy the files
317	foreach my $file (@srcfiles) {
318
319	if (!-e $file) {
320	print STDERR "util::cp_r $file does not exist\n";
321
322	} elsif (-d $file) {
323	# make the new directory
324	my ($filename) = $file =~ /([^\\\/]*)$/;
325	$dest = &util::filename_cat ($dest, $filename);
326	my $store_umask = umask(0002);
327	mkdir ($dest, 0777);
328	umask($store_umask);
329
330	# get the contents of this directory
331	if (!opendir (INDIR, $file)) {
332	print STDERR "util::cp_r could not open directory $file\n";
333	} else {
334	my @filedir = readdir (INDIR);
335	closedir (INDIR);
336	foreach my $f (@filedir) {
337	next if $f =~ /^\.\.?$/;
338	# copy all the files in this directory
339	my $ff = &util::filename_cat ($file, $f);
340	&cp_r ($ff, $dest);
341	}
342	}
343
344	} else {
345	&cp($file, $dest);
346	}
347	}
348	}
349	# recursively copies a file or group of files
350	# syntax: cp_r (sourcefiles, destination directory)
351	# destination must be a directory - to copy one file to
352	# another use cp instead
353	sub cp_r_nosvn {
354	my $dest = pop (@_);
355	my (@srcfiles) = @_;
356
357	# a few sanity checks
358	if (scalar (@srcfiles) == 0) {
359	print STDERR "util::cp_r no destination directory given\n";
360	return;
361	} elsif (-f $dest) {
362	print STDERR "util::cp_r destination must be a directory\n";
363	return;
364	}
365
366	# create destination directory if it doesn't exist already
367	if (! -d $dest) {
368	my $store_umask = umask(0002);
369	mkdir ($dest, 0777);
370	umask($store_umask);
371	}
372
373	# copy the files
374	foreach my $file (@srcfiles) {
375
376	if (!-e $file) {
377	print STDERR "util::cp_r $file does not exist\n";
378
379	} elsif (-d $file) {
380	# make the new directory
381	my ($filename) = $file =~ /([^\\\/]*)$/;
382	$dest = &util::filename_cat ($dest, $filename);
383	my $store_umask = umask(0002);
384	mkdir ($dest, 0777);
385	umask($store_umask);
386
387	# get the contents of this directory
388	if (!opendir (INDIR, $file)) {
389	print STDERR "util::cp_r could not open directory $file\n";
390	} else {
391	my @filedir = readdir (INDIR);
392	closedir (INDIR);
393	foreach my $f (@filedir) {
394	next if $f =~ /^\.\.?$/;
395	next if $f =~ /^\.svn$/;
396	# copy all the files in this directory
397	my $ff = &util::filename_cat ($file, $f);
398	&cp_r ($ff, $dest);
399	}
400	}
401
402	} else {
403	&cp($file, $dest);
404	}
405	}
406	}
407
408	# copies a directory and its contents, excluding subdirectories, into a new directory
409	sub cp_r_toplevel {
410	my $dest = pop (@_);
411	my (@srcfiles) = @_;
412
413	# a few sanity checks
414	if (scalar (@srcfiles) == 0) {
415	print STDERR "util::cp_r no destination directory given\n";
416	return;
417	} elsif (-f $dest) {
418	print STDERR "util::cp_r destination must be a directory\n";
419	return;
420	}
421
422	# create destination directory if it doesn't exist already
423	if (! -d $dest) {
424	my $store_umask = umask(0002);
425	mkdir ($dest, 0777);
426	umask($store_umask);
427	}
428
429	# copy the files
430	foreach my $file (@srcfiles) {
431
432	if (!-e $file) {
433	print STDERR "util::cp_r $file does not exist\n";
434
435	} elsif (-d $file) {
436	# make the new directory
437	my ($filename) = $file =~ /([^\\\/]*)$/;
438	$dest = &util::filename_cat ($dest, $filename);
439	my $store_umask = umask(0002);
440	mkdir ($dest, 0777);
441	umask($store_umask);
442
443	# get the contents of this directory
444	if (!opendir (INDIR, $file)) {
445	print STDERR "util::cp_r could not open directory $file\n";
446	} else {
447	my @filedir = readdir (INDIR);
448	closedir (INDIR);
449	foreach my $f (@filedir) {
450	next if $f =~ /^\.\.?$/;
451
452	# copy all the files in this directory, but not directories
453	my $ff = &util::filename_cat ($file, $f);
454	if (-f $ff) {
455	&cp($ff, $dest);
456	#&cp_r ($ff, $dest);
457	}
458	}
459	}
460
461	} else {
462	&cp($file, $dest);
463	}
464	}
465	}
466
467	sub mk_dir {
468	my ($dir) = @_;
469
470	my $store_umask = umask(0002);
471	my $mkdir_ok = mkdir ($dir, 0777);
472	umask($store_umask);
473
474	if (!$mkdir_ok)
475	{
476	print STDERR "util::mk_dir could not create directory $dir\n";
477	return;
478	}
479	}
480
481	# in case anyone cares - I did some testing (using perls Benchmark module)
482	# on this subroutine against File::Path::mkpath (). mk_all_dir() is apparently
483	# slightly faster (surprisingly) - Stefan.
484	sub mk_all_dir {
485	my ($dir) = @_;
486
487	# use / for the directory separator, remove duplicate and
488	# trailing slashes
489	$dir=~s/[\\\/]+/\//g;
490	$dir=~s/[\\\/]+$//;
491
492	# make sure the cache directory exists
493	my $dirsofar = "";
494	my $first = 1;
495	foreach my $dirname (split ("/", $dir)) {
496	$dirsofar .= "/" unless $first;
497	$first = 0;
498
499	$dirsofar .= $dirname;
500
501	next if $dirname =~ /^(\|[a-z]:)$/i;
502	if (!-e $dirsofar)
503	{
504	my $store_umask = umask(0002);
505	my $mkdir_ok = mkdir ($dirsofar, 0777);
506	umask($store_umask);
507	if (!$mkdir_ok)
508	{
509	print STDERR "util::mk_all_dir could not create directory $dirsofar\n";
510	return;
511	}
512	}
513	}
514	}
515
516	# make hard link to file if supported by OS, otherwise copy the file
517	sub hard_link {
518	my ($src, $dest, $verbosity) = @_;
519
520	# remove trailing slashes from source and destination files
521	$src =~ s/[\\\/]+$//;
522	$dest =~ s/[\\\/]+$//;
523
524	## print STDERR "**** src = ", unicode::debug_unicode_string($src),"\n";
525	# a few sanity checks
526	if (-e $dest) {
527	# destination file already exists
528	return;
529	}
530	elsif (!-e $src) {
531	print STDERR "util::hard_link source file \"$src\" does not exist\n";
532	return 1;
533	}
534	elsif (-d $src) {
535	print STDERR "util::hard_link source \"$src\" is a directory\n";
536	return 1;
537	}
538
539	my $dest_dir = &File::Basename::dirname($dest);
540	mk_all_dir($dest_dir) if (!-e $dest_dir);
541
542
543	if (!link($src, $dest)) {
544	if ((!defined $verbosity) \|\| ($verbosity>2)) {
545	print STDERR "util::hard_link: unable to create hard link. ";
546	print STDERR " Copying file: $src -> $dest\n";
547	}
548	&File::Copy::copy ($src, $dest);
549	}
550	return 0;
551	}
552
553	# make soft link to file if supported by OS, otherwise copy file
554	sub soft_link {
555	my ($src, $dest, $ensure_paths_absolute) = @_;
556
557	# remove trailing slashes from source and destination files
558	$src =~ s/[\\\/]+$//;
559	$dest =~ s/[\\\/]+$//;
560
561	# Ensure file paths are absolute IF requested to do so
562	# Soft_linking didn't work for relative paths
563	if(defined $ensure_paths_absolute && $ensure_paths_absolute) {
564	# We need to ensure that the src file is the absolute path
565	# See http://perldoc.perl.org/File/Spec.html
566	if(!File::Spec->file_name_is_absolute( $src )) { # it's relative
567	$src = File::Spec->rel2abs($src); # make absolute
568	}
569	# Might as well ensure that the destination file's absolute path is used
570	if(!File::Spec->file_name_is_absolute( $dest )) {
571	$dest = File::Spec->rel2abs($dest); # make absolute
572	}
573	}
574
575	# a few sanity checks
576	if (!-e $src) {
577	print STDERR "util::soft_link source file $src does not exist\n";
578	return 0;
579	}
580
581	my $dest_dir = &File::Basename::dirname($dest);
582	mk_all_dir($dest_dir) if (!-e $dest_dir);
583
584	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
585
586	# symlink not supported on windows
587	&File::Copy::copy ($src, $dest);
588
589	} elsif (!eval {symlink($src, $dest)}) {
590	print STDERR "util::soft_link: unable to create soft link.\n";
591	return 0;
592	}
593
594	return 1;
595	}
596
597	# Primarily for filenames generated by processing
598	# content of HTML files (which are mapped to UTF-8 internally)
599	#
600	# To turn this into an octet string that really exists on the file
601	# system:
602	# 1. don't need to do anything special for Unix-based systems
603	# (as underlying file system is byte-code)
604	# 2. need to map to short DOS filenames for Windows
605
606	sub utf8_to_real_filename
607	{
608	my ($utf8_filename) = @_;
609
610	my $real_filename;
611
612	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
613	require Win32;
614
615	print STDERR "***** utf8 filename = $utf8_filename\n\n\n";
616
617	my $unicode_filename = decode("utf8",$utf8_filename);
618	$real_filename = Win32::GetShortPathName($unicode_filename);
619	}
620	else {
621	$real_filename = $utf8_filename;
622	}
623
624	return $real_filename;
625	}
626
627
628	sub fd_exists
629	{
630	my $filename_full_path = shift @_;
631	my $test_op = shift @_ \|\| "-e";
632
633	# By default tests for existance of file or directory (-e)
634	# Can be made more specific by providing second parameter (e.g. -f or -d)
635
636	my $exists = 0;
637
638	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
639	require Win32;
640	my $filename_short_path = Win32::GetShortPathName($filename_full_path);
641	if (!defined $filename_short_path) {
642	# Was probably still in UTF8 form (not what is needed on Windows)
643	my $unicode_filename_full_path = eval "decode(\"utf8\",\$filename_full_path)";
644	if (defined $unicode_filename_full_path) {
645	$filename_short_path = Win32::GetShortPathName($unicode_filename_full_path);
646	}
647	}
648	$filename_full_path = $filename_short_path;
649	}
650
651	if (defined $filename_full_path) {
652	$exists = eval "($test_op \$filename_full_path)";
653	}
654
655	return $exists;
656	}
657
658	sub file_exists
659	{
660	my ($filename_full_path) = @_;
661
662	return fd_exists($filename_full_path,"-f");
663	}
664
665	sub dir_exists
666	{
667	my ($filename_full_path) = @_;
668
669	return fd_exists($filename_full_path,"-d");
670	}
671
672
673
674	# updates a copy of a directory in some other part of the filesystem
675	# verbosity settings are: 0=low, 1=normal, 2=high
676	# both $fromdir and $todir should be absolute paths
677	sub cachedir {
678	my ($fromdir, $todir, $verbosity) = @_;
679	$verbosity = 1 unless defined $verbosity;
680
681	# use / for the directory separator, remove duplicate and
682	# trailing slashes
683	$fromdir=~s/[\\\/]+/\//g;
684	$fromdir=~s/[\\\/]+$//;
685	$todir=~s/[\\\/]+/\//g;
686	$todir=~s/[\\\/]+$//;
687
688	&mk_all_dir ($todir);
689
690	# get the directories in ascending order
691	if (!opendir (FROMDIR, $fromdir)) {
692	print STDERR "util::cachedir could not read directory $fromdir\n";
693	return;
694	}
695	my @fromdir = grep (!/^\.\.?$/, sort(readdir (FROMDIR)));
696	closedir (FROMDIR);
697
698	if (!opendir (TODIR, $todir)) {
699	print STDERR "util::cacedir could not read directory $todir\n";
700	return;
701	}
702	my @todir = grep (!/^\.\.?$/, sort(readdir (TODIR)));
703	closedir (TODIR);
704
705	my $fromi = 0;
706	my $toi = 0;
707
708	while ($fromi < scalar(@fromdir) \|\| $toi < scalar(@todir)) {
709	# print "fromi: $fromi toi: $toi\n";
710
711	# see if we should delete a file/directory
712	# this should happen if the file/directory
713	# is not in the from list or if its a different
714	# size, or has an older timestamp
715	if ($toi < scalar(@todir)) {
716	if (($fromi >= scalar(@fromdir)) \|\|
717	($todir[$toi] lt $fromdir[$fromi] \|\|
718	($todir[$toi] eq $fromdir[$fromi] &&
719	&differentfiles("$fromdir/$fromdir[$fromi]","$todir/$todir[$toi]",
720	$verbosity)))) {
721
722	# the files are different
723	&rm_r("$todir/$todir[$toi]");
724	splice(@todir, $toi, 1); # $toi stays the same
725
726	} elsif ($todir[$toi] eq $fromdir[$fromi]) {
727	# the files are the same
728	# if it is a directory, check its contents
729	if (-d "$todir/$todir[$toi]") {
730	&cachedir ("$fromdir/$fromdir[$fromi]",
731	"$todir/$todir[$toi]", $verbosity);
732	}
733
734	$toi++;
735	$fromi++;
736	next;
737	}
738	}
739
740	# see if we should insert a file/directory
741	# we should insert a file/directory if there
742	# is no tofiles left or if the tofile does not exist
743	if ($fromi < scalar(@fromdir) && ($toi >= scalar(@todir) \|\|
744	$todir[$toi] gt $fromdir[$fromi])) {
745	&cp_r ("$fromdir/$fromdir[$fromi]", "$todir/$fromdir[$fromi]");
746	splice (@todir, $toi, 0, $fromdir[$fromi]);
747
748	$toi++;
749	$fromi++;
750	}
751	}
752	}
753
754	# this function returns -1 if either file is not found
755	# assumes that $file1 and $file2 are absolute file names or
756	# in the current directory
757	# $file2 is allowed to be newer than $file1
758	sub differentfiles {
759	my ($file1, $file2, $verbosity) = @_;
760	$verbosity = 1 unless defined $verbosity;
761
762	$file1 =~ s/\/+$//;
763	$file2 =~ s/\/+$//;
764
765	my ($file1name) = $file1 =~ /\/([^\/]*)$/;
766	my ($file2name) = $file2 =~ /\/([^\/]*)$/;
767
768	return -1 unless (-e $file1 && -e $file2);
769	if ($file1name ne $file2name) {
770	print STDERR "filenames are not the same\n" if ($verbosity >= 2);
771	return 1;
772	}
773
774	my @file1stat = stat ($file1);
775	my @file2stat = stat ($file2);
776
777	if (-d $file1) {
778	if (! -d $file2) {
779	print STDERR "one file is a directory\n" if ($verbosity >= 2);
780	return 1;
781	}
782	return 0;
783	}
784
785	# both must be regular files
786	unless (-f $file1 && -f $file2) {
787	print STDERR "one file is not a regular file\n" if ($verbosity >= 2);
788	return 1;
789	}
790
791	# the size of the files must be the same
792	if ($file1stat[7] != $file2stat[7]) {
793	print STDERR "different sized files\n" if ($verbosity >= 2);
794	return 1;
795	}
796
797	# the second file cannot be older than the first
798	if ($file1stat[9] > $file2stat[9]) {
799	print STDERR "file is older\n" if ($verbosity >= 2);
800	return 1;
801	}
802
803	return 0;
804	}
805
806
807	sub get_tmp_filename
808	{
809	my $file_ext = shift(@_) \|\| undef;
810
811	my $opt_dot_file_ext = "";
812	if (defined $file_ext) {
813	if ($file_ext !~ m/\./) {
814	# no dot, so needs one added in at start
815	$opt_dot_file_ext = ".$file_ext"
816	}
817	else {
818	# allow for "extensions" such as _metadata.txt to be handled
819	# gracefully
820	$opt_dot_file_ext = $file_ext;
821	}
822	}
823
824	my $tmpdir = filename_cat($ENV{'GSDLHOME'}, "tmp");
825	&mk_all_dir ($tmpdir) unless -e $tmpdir;
826
827	my $count = 1000;
828	my $rand = int(rand $count);
829	my $full_tmp_filename = &filename_cat($tmpdir, "F$rand$opt_dot_file_ext");
830
831	while (-e $full_tmp_filename) {
832	$rand = int(rand $count);
833	$full_tmp_filename = &filename_cat($tmpdir, "F$rand$opt_dot_file_ext");
834	$count++;
835	}
836
837	return $full_tmp_filename;
838	}
839
840	sub get_timestamped_tmp_folder
841	{
842
843	my $tmp_dirname;
844	if(defined $ENV{'GSDLCOLLECTDIR'}) {
845	$tmp_dirname = $ENV{'GSDLCOLLECTDIR'};
846	} elsif(defined $ENV{'GSDLHOME'}) {
847	$tmp_dirname = $ENV{'GSDLHOME'};
848	} else {
849	return undef;
850	}
851
852	$tmp_dirname = &util::filename_cat($tmp_dirname, "tmp");
853	&util::mk_dir($tmp_dirname) if (!-e $tmp_dirname);
854
855	# add the timestamp into the path otherwise we can run into problems
856	# if documents have the same name
857	my $timestamp = time;
858	my $time_tmp_dirname = &util::filename_cat($tmp_dirname, $timestamp);
859	$tmp_dirname = $time_tmp_dirname;
860	my $i = 1;
861	while (-e $tmp_dirname) {
862	$tmp_dirname = "$time_tmp_dirname$i";
863	$i++;
864	}
865	&util::mk_dir($tmp_dirname);
866
867	return $tmp_dirname;
868	}
869
870	sub get_timestamped_tmp_filename_in_collection
871	{
872
873	my ($input_filename, $output_ext) = @_;
874	# derive tmp filename from input filename
875	my ($tailname, $dirname, $suffix)
876	= &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
877
878	# softlink to collection tmp dir
879	my $tmp_dirname = &util::get_timestamped_tmp_folder();
880	$tmp_dirname = $dirname unless defined $tmp_dirname;
881
882	# following two steps copied from ConvertBinaryFile
883	# do we need them?? can't use them as is, as they use plugin methods.
884
885	#$tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname);
886
887	# URLEncode this since htmls with images where the html filename is utf8 don't seem
888	# to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded
889	# files on the filesystem.
890	#$tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");
891	if (defined $output_ext) {
892	$output_ext = ".$output_ext"; # add the dot
893	} else {
894	$output_ext = $suffix;
895	}
896	$output_ext= lc($output_ext);
897	my $tmp_filename = &util::filename_cat($tmp_dirname, "$tailname$output_ext");
898
899	return $tmp_filename;
900	}
901
902	sub get_toplevel_tmp_dir
903	{
904	return filename_cat($ENV{'GSDLHOME'}, "tmp");
905	}
906
907
908	sub filename_to_regex {
909	my $filename = shift (@_);
910
911	# need to make single backslashes double so that regex works
912	$filename =~ s/\\/\\\\/g; # if ($ENV{'GSDLOS'} =~ /^windows$/i);
913
914	# note that the first part of a substitution is a regex, so RE chars need to be escaped,
915	# the second part of a substitution is not a regex, so for e.g. full-stop can be specified literally
916	$filename =~ s/\./\\./g; # in case there are extensions/other full stops, escape them
917	$filename =~ s@\(@\\(@g; # escape brackets
918	$filename =~ s@\)@\\)@g; # escape brackets
919	$filename =~ s@\[@\\[@g; # escape brackets
920	$filename =~ s@\]@\\]@g; # escape brackets
921
922	return $filename;
923	}
924
925	sub unregex_filename {
926	my $filename = shift (@_);
927
928	# need to put doubled backslashes for regex back to single
929	$filename =~ s/\\\./\./g; # remove RE syntax for .
930	$filename =~ s@\\\(@(@g; # remove RE syntax for ( => "\(" turns into "("
931	$filename =~ s@\\\)@)@g; # remove RE syntax for ) => "\)" turns into ")"
932	$filename =~ s@\\\[@[@g; # remove RE syntax for [ => "\[" turns into "["
933	$filename =~ s@\\\]@]@g; # remove RE syntax for ] => "\]" turns into "]"
934
935	# \\ goes to \
936	# This is the last step in reverse mirroring the order of steps in filename_to_regex()
937	$filename =~ s/\\\\/\\/g; # remove RE syntax for \
938	return $filename;
939	}
940
941	sub filename_cat {
942	my $first_file = shift(@_);
943	my (@filenames) = @_;
944
945	# Useful for debugging
946	# -- might make sense to call caller(0) rather than (1)??
947	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
948	# print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n";
949
950	# If first_file is not null or empty, then add it back into the list
951	if (defined $first_file && $first_file =~ /\S/) {
952	unshift(@filenames, $first_file);
953	}
954
955	my $filename = join("/", @filenames);
956
957	# remove duplicate slashes and remove the last slash
958	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
959	$filename =~ s/[\\\/]+/\\/g;
960	} else {
961	$filename =~ s/[\/]+/\//g;
962	# DB: want a filename abc\de.html to remain like this
963	}
964	$filename =~ s/[\\\/]$//;
965
966	return $filename;
967	}
968
969
970	sub pathname_cat {
971	my $first_path = shift(@_);
972	my (@pathnames) = @_;
973
974	# If first_path is not null or empty, then add it back into the list
975	if (defined $first_path && $first_path =~ /\S/) {
976	unshift(@pathnames, $first_path);
977	}
978
979	my $join_char;
980	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
981	$join_char = ";";
982	} else {
983	$join_char = ":";
984	}
985
986	my $pathname = join($join_char, @pathnames);
987
988	# remove duplicate slashes
989	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
990	$pathname =~ s/[\\\/]+/\\/g;
991	} else {
992	$pathname =~ s/[\/]+/\//g;
993	# DB: want a pathname abc\de.html to remain like this
994	}
995
996	return $pathname;
997	}
998
999
1000	sub tidy_up_oid {
1001	my ($OID) = @_;
1002	if ($OID =~ /\./) {
1003	print STDERR "Warning, identifier $OID contains periods (.), removing them\n";
1004	$OID =~ s/\.//g; #remove any periods
1005	}
1006	if ($OID =~ /^\s.*\s$/) {
1007	print STDERR "Warning, identifier $OID starts or ends with whitespace. Removing it\n";
1008	# remove starting and trailing whitespace
1009	$OID =~ s/^\s+//;
1010	$OID =~ s/\s+$//;
1011	}
1012	if ($OID =~ /^[\d]*$/) {
1013	print STDERR "Warning, identifier $OID contains only digits. Prepending 'D'.\n";
1014	$OID = "D" . $OID;
1015	}
1016
1017	return $OID;
1018	}
1019
1020	sub envvar_prepend {
1021	my ($var,$val) = @_;
1022
1023	# 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
1024	my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
1025
1026	# do not prepend any value/path that's already in the environment variable
1027
1028	my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
1029	if (!defined($ENV{$var})) {
1030	$ENV{$var} = "$val";
1031	}
1032	elsif($ENV{$var} !~ m/$escaped_val/) {
1033	$ENV{$var} = "$val".$pathsep.$ENV{$var};
1034	}
1035	}
1036
1037	sub envvar_append {
1038	my ($var,$val) = @_;
1039
1040	# 64 bit linux can't handle ";" as path separator, so make sure to set this to the right one for the OS
1041	my $pathsep = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} !~ m/windows/) ? ":" : ";";
1042
1043	# do not append any value/path that's already in the environment variable
1044
1045	my $escaped_val = &filename_to_regex($val); # escape any backslashes and brackets for upcoming regex
1046	if (!defined($ENV{$var})) {
1047	$ENV{$var} = "$val";
1048	}
1049	elsif($ENV{$var} !~ m/$escaped_val/) {
1050	$ENV{$var} = $ENV{$var}.$pathsep."$val";
1051	}
1052	}
1053
1054
1055	# splits a filename into a prefix and a tail extension using the tail_re, or
1056	# if that fails, splits on the file_extension . (dot)
1057	sub get_prefix_and_tail_by_regex {
1058
1059	my ($filename,$tail_re) = @_;
1060
1061	my ($file_prefix,$file_ext) = ($filename =~ m/^(.*?)($tail_re)$/);
1062	if ((!defined $file_prefix) \|\| (!defined $file_ext)) {
1063	($file_prefix,$file_ext) = ($filename =~ m/^(.)(\..?)$/);
1064	}
1065
1066	return ($file_prefix,$file_ext);
1067	}
1068
1069	# get full path and file only path from a base_dir (which may be empty) and
1070	# file (which may contain directories)
1071	sub get_full_filenames {
1072	my ($base_dir, $file) = @_;
1073
1074	my $filename_full_path = $file;
1075	# add on directory if present
1076	$filename_full_path = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\S/;
1077
1078	my $filename_no_path = $file;
1079
1080	# remove directory if present
1081	$filename_no_path =~ s/^.*[\/\\]//;
1082	return ($filename_full_path, $filename_no_path);
1083	}
1084
1085	# returns the path of a file without the filename -- ie. the directory the file is in
1086	sub filename_head {
1087	my $filename = shift(@_);
1088
1089	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1090	$filename =~ s/[^\\\\]*$//;
1091	}
1092	else {
1093	$filename =~ s/[^\\\/]*$//;
1094	}
1095
1096	return $filename;
1097	}
1098
1099
1100
1101	# returns 1 if filename1 and filename2 point to the same
1102	# file or directory
1103	sub filenames_equal {
1104	my ($filename1, $filename2) = @_;
1105
1106	# use filename_cat to clean up trailing slashes and
1107	# multiple slashes
1108	$filename1 = filename_cat ($filename1);
1109	$filename2 = filename_cat ($filename2);
1110
1111	# filenames not case sensitive on windows
1112	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1113	$filename1 =~ tr/[A-Z]/[a-z]/;
1114	$filename2 =~ tr/[A-Z]/[a-z]/;
1115	}
1116	return 1 if $filename1 eq $filename2;
1117	return 0;
1118	}
1119
1120	# If filename is relative to within_dir, returns the relative path of filename to that directory
1121	# with slashes in the filename returned as they were in the original (absolute) filename.
1122	sub filename_within_directory
1123	{
1124	my ($filename,$within_dir) = @_;
1125
1126	if ($within_dir !~ m/[\/\\]$/) {
1127	my $dirsep = &util::get_dirsep();
1128	$within_dir .= $dirsep;
1129	}
1130
1131	$within_dir = &filename_to_regex($within_dir); # escape DOS style file separator and brackets
1132	if ($filename =~ m/^$within_dir(.*)$/) {
1133	$filename = $1;
1134	}
1135
1136	return $filename;
1137	}
1138
1139	# If filename is relative to within_dir, returns the relative path of filename to that directory in URL format.
1140	# Filename and within_dir can be any type of slashes, but will be compared as URLs (i.e. unix-style slashes).
1141	# The subpath returned will also be a URL type filename.
1142	sub filename_within_directory_url_format
1143	{
1144	my ($filename,$within_dir) = @_;
1145
1146	# convert parameters only to / slashes if Windows
1147
1148	my $filename_urlformat = &filepath_to_url_format($filename);
1149	my $within_dir_urlformat = &filepath_to_url_format($within_dir);
1150
1151	#if ($within_dir_urlformat !~ m/\/$/) {
1152	# make sure directory ends with a slash
1153	#$within_dir_urlformat .= "/";
1154	#}
1155
1156	my $within_dir_urlformat_re = &filename_to_regex($within_dir_urlformat); # escape any special RE characters, such as brackets
1157
1158	#print STDERR "@@@@@ $filename_urlformat =~ $within_dir_urlformat_re\n";
1159
1160	# dir prefix may or may not end with a slash (this is discarded when extracting the sub-filepath)
1161	if ($filename_urlformat =~ m/^$within_dir_urlformat_re(?:\/)(.)$/) {
1162	$filename_urlformat = $1;
1163	}
1164
1165	return $filename_urlformat;
1166	}
1167
1168	# Convert parameter to use / slashes if Windows (if on Linux leave any \ as is,
1169	# since on Linux it doesn't represent a file separator but an escape char).
1170	sub filepath_to_url_format
1171	{
1172	my ($filepath) = @_;
1173	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1174	# Only need to worry about Windows, as Unix style directories already in url-format
1175	# Convert Windows style \ => /
1176	$filepath =~ s@\\@/@g;
1177	}
1178	return $filepath;
1179	}
1180
1181	# regex filepaths on windows may include \\ as path separator. Convert \\ to /
1182	sub filepath_regex_to_url_format
1183	{
1184	my ($filepath) = @_;
1185	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1186	# Only need to worry about Windows, as Unix style directories already in url-format
1187	# Convert Windows style \\ => /
1188	$filepath =~ s@\\\\@/@g;
1189	}
1190	return $filepath;
1191
1192	}
1193
1194	# Like File::Basename::fileparse, but expects filepath in url format (ie only / slash for dirsep)
1195	# and ignores trailing /
1196	# returns (file, dirs) dirs will be empty if no subdirs
1197	sub url_fileparse
1198	{
1199	my ($filepath) = @_;
1200	# remove trailing /
1201	$filepath =~ s@/$@@;
1202	if ($filepath !~ m@/@) {
1203	return ($filepath, "");
1204	}
1205	my ($dirs, $file) = $filepath =~ m@(.+/)([^/]+)@;
1206	return ($file, $dirs);
1207
1208	}
1209
1210
1211	sub filename_within_collection
1212	{
1213	my ($filename) = @_;
1214
1215	my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
1216
1217	if (defined $collect_dir) {
1218
1219	# if from within GSDLCOLLECTDIR, then remove directory prefix
1220	# so source_filename is realative to it. This is done to aid
1221	# portability, i.e. the collection can be moved to somewhere
1222	# else on the file system and the archives directory will still
1223	# work. This is needed, for example in the applet version of
1224	# GLI where GSDLHOME/collect on the server will be different to
1225	# the collect directory of the remove user. Of course,
1226	# GSDLCOLLECTDIR subsequently needs to be put back on to turn
1227	# it back into a full pathname.
1228
1229	$filename = filename_within_directory($filename,$collect_dir);
1230	}
1231
1232	return $filename;
1233	}
1234
1235	sub prettyprint_file
1236	{
1237	my ($base_dir,$file,$gli) = @_;
1238
1239	my $filename_full_path = &util::filename_cat($base_dir,$file);
1240
1241	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1242	require Win32;
1243
1244	# For some reason base_dir in the form c:/a/b/c
1245	# This leads to confusion later on, so turn it back into
1246	# the more usual Windows form
1247	$base_dir =~ s/\//\\/g;
1248	my $long_base_dir = Win32::GetLongPathName($base_dir);
1249	my $long_full_path = Win32::GetLongPathName($filename_full_path);
1250
1251	$file = filename_within_directory($long_full_path,$long_base_dir);
1252	$file = encode("utf8",$file) if ($gli);
1253	}
1254
1255	return $file;
1256	}
1257
1258
1259	sub upgrade_if_dos_filename
1260	{
1261	my ($filename_full_path,$and_encode) = @_;
1262
1263	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1264	# Ensure any DOS-like filename, such as test~1.txt, has been upgraded
1265	# to its long (Windows) version
1266	my $long_filename = Win32::GetLongPathName($filename_full_path);
1267	if (defined $long_filename) {
1268	$filename_full_path = $long_filename;
1269	}
1270	# Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
1271	$filename_full_path =~ s/^(.):/\u$1:/;
1272	if ((defined $and_encode) && ($and_encode)) {
1273	$filename_full_path = encode("utf8",$filename_full_path);
1274	}
1275	}
1276
1277	return $filename_full_path;
1278	}
1279
1280
1281	sub downgrade_if_dos_filename
1282	{
1283	my ($filename_full_path) = @_;
1284
1285	if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1286	require Win32;
1287
1288	# Ensure the given long Windows filename is in a form that can
1289	# be opened by Perl => convert it to a short DOS-like filename
1290
1291	my $short_filename = Win32::GetShortPathName($filename_full_path);
1292	if (defined $short_filename) {
1293	$filename_full_path = $short_filename;
1294	}
1295	# Make sure initial drive letter is lower-case (to fit in
1296	# with rest of Greenstone)
1297	$filename_full_path =~ s/^(.):/\u$1:/;
1298	}
1299
1300	return $filename_full_path;
1301	}
1302
1303	sub block_filename
1304	{
1305	my ($block_hash,$filename) = @_;
1306
1307	if ($ENV{'GSDLOS'} =~ m/^windows$/) {
1308
1309	# lower case the entire thing, eg for cover.jpg when its actually cover.JPG
1310	my $lower_filename = lc($filename);
1311	$block_hash->{'file_blocks'}->{$lower_filename} = 1;
1312	# my $lower_drive = $filename;
1313	# $lower_drive =~ s/^([A-Z]):/\l$1:/i;
1314
1315	# my $upper_drive = $filename;
1316	# $upper_drive =~ s/^([A-Z]):/\u$1:/i;
1317	#
1318	# $block_hash->{'file_blocks'}->{$lower_drive} = 1;
1319	# $block_hash->{'file_blocks'}->{$upper_drive} = 1;
1320	}
1321	else {
1322	$block_hash->{'file_blocks'}->{$filename} = 1;
1323	}
1324	}
1325
1326
1327	sub filename_is_absolute
1328	{
1329	my ($filename) = @_;
1330
1331	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1332	return ($filename =~ m/^(\w:)?\\/);
1333	}
1334	else {
1335	return ($filename =~ m/^\//);
1336	}
1337	}
1338
1339
1340	## @method make_absolute()
1341	#
1342	# Ensure the given file path is absolute in respect to the given base path.
1343	#
1344	# @param $base_dir A string denoting the base path the given dir must be
1345	# absolute to.
1346	# @param $dir The directory to be made absolute as a string. Note that the
1347	# dir may already be absolute, in which case it will remain
1348	# unchanged.
1349	# @return The now absolute form of the directory as a string.
1350	#
1351	# @author John Thompson, DL Consulting Ltd.
1352	# @copy 2006 DL Consulting Ltd.
1353	#
1354	#used in buildcol.pl, doesn't work for all cases --kjdon
1355	sub make_absolute {
1356
1357	my ($base_dir, $dir) = @_;
1358	### print STDERR "dir = $dir\n";
1359	$dir =~ s/[\\\/]+/\//g;
1360	$dir = $base_dir . "/$dir" unless ($dir =~ m\|^(\w:)?/\|);
1361	$dir =~ s\|^/tmp_mnt\|\|;
1362	1 while($dir =~ s\|/[^/]*/\.\./\|/\|g);
1363	$dir =~ s\|/[.][.]?/\|/\|g;
1364	$dir =~ tr\|/\|/\|s;
1365	### print STDERR "dir = $dir\n";
1366
1367	return $dir;
1368	}
1369	## make_absolute() ##
1370
1371	sub get_dirsep {
1372
1373	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1374	return "\\";
1375	} else {
1376	return "\/";
1377	}
1378	}
1379
1380	sub get_os_dirsep {
1381
1382	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
1383	return "\\\\";
1384	} else {
1385	return "\\\/";
1386	}
1387	}
1388
1389	sub get_re_dirsep {
1390
1391	return "\\\\\|\\\/";
1392	}
1393
1394
1395	sub get_dirsep_tail {
1396	my ($filename) = @_;
1397
1398	# returns last part of directory or filename
1399	# On unix e.g. a/b.d => b.d
1400	# a/b/c => c
1401
1402	my $dirsep = get_re_dirsep();
1403	my @dirs = split (/$dirsep/, $filename);
1404	my $tail = pop @dirs;
1405
1406	# - caused problems under windows
1407	#my ($tail) = ($filename =~ m/^(?:.?$dirsep)?(.?)$/);
1408
1409	return $tail;
1410	}
1411
1412
1413	# if this is running on windows we want binaries to end in
1414	# .exe, otherwise they don't have to end in any extension
1415	sub get_os_exe {
1416	return ".exe" if $ENV{'GSDLOS'} =~ /^windows$/i;
1417	return "";
1418	}
1419
1420
1421	# test to see whether this is a big or little endian machine
1422	sub is_little_endian
1423	{
1424	# To determine the name of the operating system, the variable $^O is a cheap alternative to pulling it out of the Config module;
1425	# If it is a Macintosh machine (i.e. the Darwin operating system), regardless if it's running on the IBM power-pc cpu or the x86 Intel-based chip with a power-pc emulator running on top of it, it's big-endian
1426	# Otherwise, it's little endian
1427
1428	#return 0 if $^O =~ /^darwin$/i;
1429	#return 0 if $ENV{'GSDLOS'} =~ /^darwin$/i;
1430
1431	# Going back to stating exactly whether the machine is little endian
1432	# or big endian, without any special case for Macs. Since for rata it comes
1433	# back with little endian and for shuttle with bigendian.
1434	return (ord(substr(pack("s",1), 0, 1)) == 1);
1435	}
1436
1437
1438	# will return the collection name if successful, "" otherwise
1439	sub use_collection {
1440	my ($collection, $collectdir) = @_;
1441
1442	if (!defined $collectdir \|\| $collectdir eq "") {
1443	$collectdir = &filename_cat ($ENV{'GSDLHOME'}, "collect");
1444	}
1445
1446	# get and check the collection
1447	if (!defined($collection) \|\| $collection eq "") {
1448	if (defined $ENV{'GSDLCOLLECTION'}) {
1449	$collection = $ENV{'GSDLCOLLECTION'};
1450	} else {
1451	print STDOUT "No collection specified\n";
1452	return "";
1453	}
1454	}
1455
1456	if ($collection eq "modelcol") {
1457	print STDOUT "You can't use modelcol.\n";
1458	return "";
1459	}
1460
1461	# make sure the environment variables GSDLCOLLECTION and GSDLCOLLECTDIR
1462	# are defined
1463	$ENV{'GSDLCOLLECTION'} = $collection;
1464	$ENV{'GSDLCOLLECTDIR'} = &filename_cat ($collectdir, $collection);
1465
1466	# make sure this collection exists
1467	if (!-e $ENV{'GSDLCOLLECTDIR'}) {
1468	print STDOUT "Invalid collection ($collection).\n";
1469	return "";
1470	}
1471
1472	# everything is ready to go
1473	return $collection;
1474	}
1475
1476	sub get_current_collection_name {
1477	return $ENV{'GSDLCOLLECTION'};
1478	}
1479
1480
1481	# will return the collection name if successful, "" otherwise.
1482	# Like use_collection (above) but for greenstone 3 (taking account of site level)
1483
1484	sub use_site_collection {
1485	my ($site, $collection, $collectdir) = @_;
1486
1487	if (!defined $collectdir \|\| $collectdir eq "") {
1488	die "GSDL3HOME not set.\n" unless defined $ENV{'GSDL3HOME'};
1489	$collectdir = &filename_cat ($ENV{'GSDL3HOME'}, "sites", $site, "collect");
1490	}
1491
1492	# collectdir explicitly set by this point (using $site variable if required).
1493	# Can call "old" gsdl2 use_collection now.
1494
1495	return use_collection($collection,$collectdir);
1496	}
1497
1498
1499
1500	sub locate_config_file
1501	{
1502	my ($file) = @_;
1503
1504	my $locations = locate_config_files($file);
1505
1506	return shift @$locations; # returns undef if 'locations' is empty
1507	}
1508
1509
1510	sub locate_config_files
1511	{
1512	my ($file) = @_;
1513
1514	my @locations = ();
1515
1516	if (-e $file) {
1517	# Clearly specified (most likely full filename)
1518	# No need to hunt in 'etc' directories, return value unchanged
1519	push(@locations,$file);
1520	}
1521	else {
1522	# Check for collection specific one before looking in global GSDL 'etc'
1523	if (defined $ENV{'GSDLCOLLECTDIR'} && $ENV{'GSDLCOLLECTDIR'} ne "") {
1524	my $test_collect_etc_filename
1525	= &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"etc", $file);
1526
1527	if (-e $test_collect_etc_filename) {
1528	push(@locations,$test_collect_etc_filename);
1529	}
1530	}
1531	my $test_main_etc_filename
1532	= &util::filename_cat($ENV{'GSDLHOME'},"etc", $file);
1533	if (-e $test_main_etc_filename) {
1534	push(@locations,$test_main_etc_filename);
1535	}
1536	}
1537
1538	return \@locations;
1539	}
1540
1541
1542	sub hyperlink_text
1543	{
1544	my ($text) = @_;
1545
1546	$text =~ s/(http:\/\/[^\s]+)/<a href=\"$1\">$1<\/a>/mg;
1547	$text =~ s/(^\|\s+)(www\.(\w\|\.)+)/<a href=\"http:\/\/$2\">$2<\/a>/mg;
1548
1549	return $text;
1550	}
1551
1552
1553	# A method to check if a directory is empty (note that an empty directory still has non-zero size!!!)
1554	# Code is from http://episteme.arstechnica.com/eve/forums/a/tpc/f/6330927813/m/436007700831
1555	sub is_dir_empty
1556	{
1557	my ($path) = @_;
1558	opendir DIR, $path;
1559	while(my $entry = readdir DIR) {
1560	next if($entry =~ /^\.\.?$/);
1561	closedir DIR;
1562	return 0;
1563	}
1564	closedir DIR;
1565	return 1;
1566	}
1567
1568	# Returns the given filename converted using either URL encoding or base64
1569	# encoding, as specified by $rename_method. If the given filename has no suffix
1570	# (if it is just the tailname), then $no_suffix should be some defined value.
1571	# rename_method can be url, none, base64
1572	sub rename_file {
1573	my ($filename, $rename_method, $no_suffix) = @_;
1574
1575	if(!$filename) { # undefined or empty string
1576	return $filename;
1577	}
1578
1579	if (!$rename_method) {
1580	print STDERR "WARNING: no file renaming method specified. Defaulting to using URL encoding...\n";
1581	# Debugging information
1582	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1583	# print STDERR "Called from method: $cfilename:$cline $cpackage->$csubr\n";
1584	$rename_method = "url";
1585	} elsif($rename_method eq "none") {
1586	return $filename; # would have already been renamed
1587	}
1588
1589	# No longer replace spaces with underscores, since underscores mess with incremental rebuild
1590	### Replace spaces with underscore. Do this first else it can go wrong below when getting tailname
1591	###$filename =~ s/ /_/g;
1592
1593	my ($tailname,$dirname,$suffix);
1594	if($no_suffix) { # given a tailname, no suffix
1595	($tailname,$dirname) = File::Basename::fileparse($filename);
1596	}
1597	else {
1598	($tailname,$dirname,$suffix) = File::Basename::fileparse($filename, "\\.(?:[^\\.]+?)\$");
1599	}
1600	if (!$suffix) {
1601	$suffix = "";
1602	}
1603	# This breaks GLI matching extracted metadata to files in Enrich panel, as
1604	# original is eg .JPG while gsdlsourcefilename ends up .jpg
1605	# Not sure why it was done in first place...
1606	#else {
1607	#$suffix = lc($suffix);
1608	#}
1609
1610	if ($rename_method eq "url") {
1611	$tailname = &unicode::url_encode($tailname);
1612	}
1613	elsif ($rename_method eq "base64") {
1614	$tailname = &unicode::base64_encode($tailname);
1615	$tailname =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
1616	}
1617
1618	$filename = "$tailname$suffix";
1619	$filename = "$dirname$filename" if ($dirname ne "./" && $dirname ne ".\\");
1620
1621	return $filename;
1622	}
1623
1624
1625	# BACKWARDS COMPATIBILITY: Just in case there are old .ldb/.bdb files
1626	sub rename_ldb_or_bdb_file {
1627	my ($filename_no_ext) = @_;
1628
1629	my $new_filename = "$filename_no_ext.gdb";
1630	return if (-f $new_filename); # if the file has the right extension, don't need to do anything
1631	# try ldb
1632	my $old_filename = "$filename_no_ext.ldb";
1633
1634	if (-f $old_filename) {
1635	print STDERR "Renaming $old_filename to $new_filename\n";
1636	rename ($old_filename, $new_filename)
1637	\|\| print STDERR "Rename failed: $!\n";
1638	return;
1639	}
1640	# try bdb
1641	$old_filename = "$filename_no_ext.bdb";
1642	if (-f $old_filename) {
1643	print STDERR "Renaming $old_filename to $new_filename\n";
1644	rename ($old_filename, $new_filename)
1645	\|\| print STDERR "Rename failed: $!\n";
1646	return;
1647	}
1648	}
1649
1650	sub os_dir() {
1651
1652	my $gsdlarch = "";
1653	if(defined $ENV{'GSDLARCH'}) {
1654	$gsdlarch = $ENV{'GSDLARCH'};
1655	}
1656	return $ENV{'GSDLOS'}.$gsdlarch;
1657	}
1658
1659	# Returns the greenstone URL prefix extracted from the appropriate GS2/GS3 config file.
1660	# By default, /greenstone3 for GS3 or /greenstone for GS2.
1661	sub get_greenstone_url_prefix() {
1662	# if already set on a previous occasion, just return that
1663	# (Don't want to keep repeating this: cost of re-opening and scanning files.)
1664	return $ENV{'GREENSTONE_URL_PREFIX'} if($ENV{'GREENSTONE_URL_PREFIX'});
1665
1666	my ($configfile, $urlprefix, $defaultUrlprefix);
1667	my @propertynames = ();
1668
1669	if($ENV{'GSDL3SRCHOME'}) {
1670	$defaultUrlprefix = "/greenstone3";
1671	$configfile = &util::filename_cat($ENV{'GSDL3SRCHOME'}, "packages", "tomcat", "conf", "Catalina", "localhost", "greenstone3.xml");
1672	push(@propertynames, qw/path\s*\=/);
1673	} else {
1674	$defaultUrlprefix = "/greenstone";
1675	$configfile = &util::filename_cat($ENV{'GSDLHOME'}, "cgi-bin", &os_dir(), "gsdlsite.cfg");
1676	push(@propertynames, (qw/\nhttpprefix/, qw/\ngwcgi/)); # inspect one property then the other
1677	}
1678
1679	$urlprefix = &extract_propvalue_from_file($configfile, \@propertynames);
1680
1681	if(!$urlprefix) { # no values found for URL prefix, use default values
1682	$urlprefix = $defaultUrlprefix;
1683	} else {
1684	#gwcgi can contain more than the wanted prefix, we split on / to get the first "directory" level
1685	$urlprefix =~ s/^\///; # remove the starting slash
1686	my @dirs = split(/(\\\|\/)/, $urlprefix);
1687	$urlprefix = shift(@dirs);
1688
1689	if($urlprefix !~ m/^\//) { # in all cases: ensure the required forward slash is at the front
1690	$urlprefix = "/$urlprefix";
1691	}
1692	}
1693
1694	# set for the future
1695	$ENV{'GREENSTONE_URL_PREFIX'} = $urlprefix;
1696	# print STDERR "*** in get_greenstone_url_prefix(): $urlprefix\n\n";
1697	return $urlprefix;
1698	}
1699
1700
1701	# Given a config file (xml or java properties file) and a list/array of regular expressions
1702	# that represent property names to match on, this function will return the value for the 1st
1703	# matching property name. If the return value is undefined, no matching property was found.
1704	sub extract_propvalue_from_file() {
1705	my ($configfile, $propertynames) = @_;
1706
1707	my $value;
1708	unless(open(FIN, "<$configfile")) {
1709	print STDERR "extract_propvalue_from_file(): Unable to open $configfile. $!\n";
1710	return $value; # not initialised
1711	}
1712
1713	# Read the entire file at once, as one single line, then close it
1714	my $filecontents;
1715	{
1716	local $/ = undef;
1717	$filecontents = <FIN>;
1718	}
1719	close(FIN);
1720
1721	foreach my $regex (@$propertynames) {
1722	($value) = $filecontents=~ m/$regex\s(\S)/s; # read value of the property given by regex up to the 1st space
1723	if($value) {
1724	$value =~ s/^\"//; # remove any startquotes
1725	$value =~ s/\".*$//; # remove the 1st endquotes (if any) followed by any xml
1726	last; # found value for a matching property, break from loop
1727	}
1728	}
1729
1730	return $value;
1731	}
1732
1733	# Subroutine that sources setup.bash, given GSDLHOME and GSDLOS and
1734	# given that perllib is in @INC in order to invoke this subroutine.
1735	# Call as follows -- after setting up INC to include perllib and
1736	# after setting up GSDLHOME and GSDLOS:
1737	#
1738	# require util;
1739	# &util::setup_greenstone_env($ENV{'GSDLHOME'}, $ENV{'GSDLOS'});
1740	#
1741	sub setup_greenstone_env() {
1742	my ($GSDLHOME, $GSDLOS) = @_;
1743
1744	#my %env_map = ();
1745	# Get the localised ENV settings of running a localised source setup.bash
1746	# and put it into the ENV here. Need to clear GSDLHOME before running setup
1747	#my $perl_command = "(cd $GSDLHOME; export GSDLHOME=; . ./setup.bash > /dev/null; env)";
1748	my $perl_command = "(cd $GSDLHOME; /bin/bash -c \"export GSDLHOME=; source setup.bash > /dev/null; env\")";
1749	if($GSDLOS =~ m/windows/i) {
1750	#$perl_command = "cmd /C \"cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set\"";
1751	$perl_command = "(cd $GSDLHOME&& set GSDLHOME=&& setup.bat > nul&& set)";
1752	}
1753	if (!open(PIN, "$perl_command \|")) {
1754	print STDERR ("Unable to execute command: $perl_command. $!\n");
1755	}
1756
1757	while (defined (my $perl_output_line = <PIN>)) {
1758	my($key,$value) = ($perl_output_line =~ m/^([^=])[=](.)$/);
1759	#$env_map{$key}=$value;
1760	$ENV{$key}=$value;
1761	}
1762	close (PIN);
1763
1764	# If any keys in $ENV don't occur in Greenstone's localised env
1765	# (stored in $env_map), delete those entries from $ENV
1766	#foreach $key (keys %ENV) {
1767	# if(!defined $env_map{$key}) {
1768	# print STDOUT "**** DELETING ENV KEY: $key\tVALUE: $ENV{'$key'}\n";
1769	# delete $ENV{$key}; # del $ENV(key, value) pair
1770	# }
1771	#}
1772	#undef %env_map;
1773	}
1774
1775	sub get_perl_exec() {
1776	my $perl_exec = $^X; # may return just "perl"
1777
1778	if($ENV{'PERLPATH'}) {
1779	# OR: # $perl_exec = &util::filename_cat($ENV{'PERLPATH'},"perl");
1780	if($ENV{'GSDLOS'} =~ m/windows/) {
1781	$perl_exec = "$ENV{'PERLPATH'}\\Perl.exe";
1782	} else {
1783	$perl_exec = "$ENV{'PERLPATH'}/perl";
1784	}
1785	} else { # no PERLPATH, use Config{perlpath} else $^X: special variables
1786	# containing the full path to the current perl executable we're using
1787	$perl_exec = $Config{perlpath}; # configured path for perl
1788	if (!-e $perl_exec) { # may not point to location on this machine
1789	$perl_exec = $^X; # may return just "perl"
1790	if($perl_exec =~ m/^perl/i) { # warn if just perl or Perl.exe
1791	print STDERR "**** WARNING: Perl exec found contains no path: $perl_exec";
1792	}
1793	}
1794	}
1795
1796	return $perl_exec;
1797	}
1798
1799	# returns the path to the java command in the JRE included with GS (if any),
1800	# quoted to safeguard any spaces in this path, otherwise a simple java
1801	# command is returned which assumes and will try for a system java.
1802	sub get_java_command {
1803	my $java = "java";
1804	if(defined $ENV{'GSDLHOME'}) { # should be, as this script would be launched from the cmd line
1805	# after running setup.bat or from GLI which also runs setup.bat
1806	my $java_bin = &util::filename_cat($ENV{'GSDLHOME'},"packages","jre","bin");
1807	if(-d $java_bin) {
1808	$java = &util::filename_cat($java_bin,"java");
1809	$java = "\"".$java."\""; # quoted to preserve spaces in path
1810	}
1811	}
1812	return $java;
1813	}
1814
1815
1816	# Given the qualified collection name (colgroup/collection),
1817	# returns the collection and colgroup parts
1818	sub get_collection_parts {
1819	# http://perldoc.perl.org/File/Basename.html
1820	# my($filename, $directories, $suffix) = fileparse($path);
1821	# "$directories contains everything up to and including the last directory separator in the $path
1822	# including the volume (if applicable). The remainder of the $path is the $filename."
1823	#my ($collection, $colgroup) = &File::Basename::fileparse($qualified_collection);
1824
1825	my $qualified_collection = shift(@_);
1826
1827	# Since activate.pl can be launched from the command-line, including by a user,
1828	# best not to assume colgroup uses URL-style slashes as would be the case with GLI
1829	# Also allow for the accidental inclusion of multiple slashes
1830	my ($colgroup, $collection) = split(/[\/\\]+/, $qualified_collection); #split('/', $qualified_collection);
1831
1832	if(!defined $collection) {
1833	$collection = $colgroup;
1834	$colgroup = "";
1835	}
1836	return ($collection, $colgroup);
1837	}
1838
1839	# work out the "collectdir/collection" location
1840	sub resolve_collection_dir {
1841	my ($collect_dir, $qualified_collection, $site) = @_; #, $gs_mode
1842
1843	my ($colgroup, $collection) = &util::get_collection_parts($qualified_collection);
1844
1845	if (defined $collect_dir) {
1846	return &util::filename_cat($collect_dir,$colgroup, $collection);
1847	}
1848	elsif (defined($ENV{'GSDLCOLLECTDIR'})) {
1849	return $ENV{'GSDLCOLLECTDIR'};
1850	}
1851	else {
1852	if (defined $site) {
1853	return &util::filename_cat($ENV{'GSDL3HOME'},"sites",$site,"collect",$colgroup, $collection);
1854	}
1855	else {
1856	return &util::filename_cat($ENV{'GSDLHOME'},"collect",$colgroup, $collection);
1857	}
1858	}
1859	}
1860
1861	# Used by pdfpstoimg.pl and PDFBoxConverter to create a .item file from
1862	# a directory containing sequentially numbered images.
1863	sub create_itemfile
1864	{
1865	my ($output_dir, $convert_basename, $convert_to) = @_;
1866	opendir(DIR, $output_dir) \|\| die "can't opendir $output_dir: $!";
1867
1868	my $page_num = "";
1869	my @dir_files = grep {-f "$output_dir/$_"} readdir(DIR);
1870
1871	# Sort files in the directory by page_num
1872	sub page_number {
1873	my ($dir) = @_;
1874	my ($pagenum) =($dir =~ m/^.*[-\.]?(\d+)(\.(jpg\|gif\|png))?$/i);
1875
1876	$pagenum = 1 unless defined $pagenum;
1877	return $pagenum;
1878	}
1879
1880	# sort the files in the directory in the order of page_num rather than lexically.
1881	@dir_files = sort { page_number($a) <=> page_number($b) } @dir_files;
1882
1883	# work out if the numbering of the now sorted image files starts at 0 or not
1884	# by checking the number of the first _image_ file (skipping item files)
1885	my $starts_at_0 = 0;
1886	my $firstfile = ($dir_files[0] !~ /\.item$/i) ? $dir_files[0] : $dir_files[1];
1887	if(page_number($firstfile) == 0) { # 00 will evaluate to 0 too in this condition
1888	$starts_at_0 = 1;
1889	}
1890
1891	my $item_file = &util::filename_cat($output_dir, $convert_basename.".item");
1892	open(FILE,">$item_file");
1893	print FILE "<PagedDocument>\n";
1894
1895	foreach my $file (@dir_files){
1896	if ($file !~ /\.item/i){
1897	$page_num = page_number($file);
1898	$page_num++ if $starts_at_0; # image numbers start at 0, so add 1
1899	print FILE " <Page pagenum=\"$page_num\" imgfile=\"$file\" txtfile=\"\"/>\n";
1900	}
1901	}
1902
1903	print FILE "</PagedDocument>\n";
1904	close FILE;
1905	closedir DIR;
1906	return $item_file;
1907	}
1908
1909	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: