source: gs2-extensions/parallel-building/trunk/src/perllib/ParallelInexport.pm@ 26072

Last change on this file since 26072 was 26072, checked in by jmt12, 12 years ago

Extended to notice and load mpi.conf file on clusters and to use full paths to executables since they don't always seem to be on environment PATH correctly

File size: 3.6 KB
Line 
1###########################################################################
2#
3# ParallelInexport.pm -- useful class to support parallel_import.pl
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package ParallelInexport;
27
28use strict;
29
30# Randomize the order of files in the filelist
31use List::Util qw( shuffle );
32
33# index the files in parallel using MPI farmer to farm off multiple processes
34# [hs, 1 july 2010]
35sub farm_out_processes
36{
37 my ($jobs, $epoch, $importdir, $block_hash, $collection, $site) = @_;
38
39 my $tmp_dir_path = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, 'tmp');
40 if (!-d $tmp_dir_path)
41 {
42 mkdir($tmp_dir_path, 0777);
43 }
44
45 # create the list of files to import
46 my $overwrite = 1;
47 my $tmp_filelist = &util::filename_cat($tmp_dir_path, "filelist.txt");
48 # - if the file is already there (which is should be during testing) then
49 # don't regenerate. This is especially important for imports of 1 million
50 # documents as just the directory scan can take several hours.
51 if ($overwrite || !-f $tmp_filelist)
52 {
53 open (my $filelist, ">$tmp_filelist");
54 my @filenames = keys %{$block_hash->{'all_files'}};
55 @filenames = shuffle(@filenames);
56 foreach my $filename (@filenames)
57 {
58 my $full_filename = &util::filename_cat($importdir,$filename);
59 if ((! exists $block_hash->{'file_blocks'}->{$full_filename})
60 && ($filename !~ m/metadata\.xml$/))
61 {
62 print $filelist "$filename\n";
63 }
64 }
65 close ($filelist);
66 }
67
68 # Determine if we've been provided a mpi.conf file to indicate the other
69 # machines (slave nodes) this parallizable process should run on
70 my $mpi_conf_path = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, 'mpi.conf');
71 my $mpi_flags = '';
72 if (-f $mpi_conf_path)
73 {
74 print STDERR " ***** CLUSTER MODE *****\n";
75 $mpi_flags .= '-machinefile "' . $mpi_conf_path . '" ';
76 #$mpi_flags .= '-nolocal ';
77 }
78 else
79 {
80 print STDERR " ***** SINGLE COMPUTER MODE *****\n";
81 }
82 $mpi_flags .= ' --show-progress --timestamp-output --verbose --report-bindings --tag-output';
83
84 # invoke the farmer to start processing the files
85 $site = "" if (!defined $site);
86 my $gsdlhome = $ENV{'GSDLHOME'};
87 my $farmer_exe = $gsdlhome . '/ext/parallel-building/linux/bin/mpiimport';
88 my $mpi_cmd = $gsdlhome . '/ext/parallel-building/linux/bin/mpirun ' . $mpi_flags . ' -n ' . $jobs . ' ' . $farmer_exe . ' ' . $tmp_filelist . ' ' . $epoch . ' ' . $gsdlhome . ' ' . $collection . ' ' . $site;
89 print STDERR "MPI Command: \"" . $mpi_cmd . "\"\n";
90# system ($mpi_cmd);
91 open(MPI, $mpi_cmd . " |") or die("Couldn't Execute MPI");
92 while ( defined( my $line = <MPI> ) )
93 {
94 chomp($line);
95 print "$line\n";
96 }
97 close(MPI);
98}
99
1001;
Note: See TracBrowser for help on using the repository browser.