###########################################################################
#
# ParallelInexport.pm -- useful class to support parallel_import.pl
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package ParallelInexport;

use strict;
use warnings;

# Randomize the order of files in the filelist so the work is spread more
# evenly across the parallel import jobs.
use List::Util qw( shuffle );

# farm_out_processes -- index the files in parallel, using an MPI farmer to
# farm off multiple import processes.  [hs, 1 july 2010]
#
# Parameters:
#   $jobs       - number of MPI processes to launch (mpirun's -n argument)
#   $epoch      - epoch value passed straight through to the mpiimport farmer
#   $importdir  - directory containing the documents to import
#   $block_hash - hashref whose 'all_files' keys are candidate files and
#                 whose 'file_blocks' keys are full paths to exclude
#   $collection - name of the collection being imported
#   $site       - Greenstone 3 site name; undef selects Greenstone 2 paths
#
# Side effects: writes $GSDLCOLLECTDIR/tmp/filelist.txt, prints a mode
# banner and the MPI command to STDERR, runs mpirun and echoes its output
# to STDOUT line by line.  Returns nothing meaningful.
sub farm_out_processes
{
    my ($jobs, $epoch, $importdir, $block_hash, $collection, $site) = @_;

    my $tmp_dir_path = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, 'tmp');
    if (!-d $tmp_dir_path) {
        mkdir($tmp_dir_path, 0777)
            or die "Couldn't create temporary directory $tmp_dir_path: $!\n";
    }

    # create the list of files to import
    my $overwrite = 1;
    my $tmp_filelist = &util::filename_cat($tmp_dir_path, "filelist.txt");
    # - if the file is already there (which it should be during testing) then
    #   don't regenerate.  This is especially important for imports of 1
    #   million documents as just the directory scan can take several hours.
    if ($overwrite || !-f $tmp_filelist) {
        # three-arg open with a lexical handle; die rather than silently
        # producing an empty file list if the tmp dir isn't writable
        open(my $filelist, '>', $tmp_filelist)
            or die "Couldn't write file list $tmp_filelist: $!\n";
        my @filenames = keys %{$block_hash->{'all_files'}};
        @filenames = shuffle(@filenames);
        foreach my $filename (@filenames) {
            my $full_filename = &util::filename_cat($importdir, $filename);
            # skip blocked files, and metadata.xml which is handled elsewhere
            if ((!exists $block_hash->{'file_blocks'}->{$full_filename})
                && ($filename !~ m/metadata\.xml$/)) {
                print $filelist "$filename\n";
            }
        }
        # buffered write errors only surface at close, so check it
        close($filelist)
            or die "Couldn't close file list $tmp_filelist: $!\n";
    }

    # Determine if we've been provided a mpi.conf file to indicate the other
    # machines (slave nodes) this parallizable process should run on
    my $mpi_conf_path = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, 'mpi.conf');
    my $mpi_flags = '';
    if (-f $mpi_conf_path) {
        print STDERR " ***** CLUSTER MODE *****\n";
        $mpi_flags .= '-machinefile "' . $mpi_conf_path . '" ';
        #$mpi_flags .= '-nolocal ';
    }
    else {
        print STDERR " ***** SINGLE COMPUTER MODE *****\n";
    }
    $mpi_flags .= ' --show-progress --timestamp-output --verbose';

    # invoke the farmer to start processing the files
    my $gsdlhome;
    if (defined $site) {
        $gsdlhome = $ENV{'GSDL3HOME'};
    }
    else {
        $site = "";
        $gsdlhome = $ENV{'GSDLHOME'};
    }

    # my $farmer_exe = $gsdlhome . '/ext/parallel-building/linux/bin/mpiimport';
    my $farmer_exe = 'mpiimport';
    # my $mpi_cmd = $gsdlhome . '/ext/parallel-building/linux/bin/mpirun ' . $mpi_flags . ' -n ' . $jobs . ' ' . $farmer_exe . ' ' . $tmp_filelist . ' ' . $epoch . ' ' . $gsdlhome . ' ' . $collection . ' ' . $site;
    my $mpi_cmd = 'mpirun ' . $mpi_flags . ' -n ' . $jobs . ' ' . $farmer_exe . ' ' . $tmp_filelist . ' ' . $epoch . ' ' . $gsdlhome . ' ' . $collection . ' ' . $site;
    print STDERR "MPI Command: \"" . $mpi_cmd . "\"\n";

    # system ($mpi_cmd);
    # Run the MPI command through a pipe so its output can be echoed as it
    # arrives.  The original read loop was missing the filehandle in
    # "my $line = <...>" (a syntax error); restored here with a lexical
    # handle in place of the bareword MPI.
    open(my $mpi_out, $mpi_cmd . ' |')
        or die("Couldn't Execute MPI");
    while (defined(my $line = <$mpi_out>)) {
        chomp($line);
        print "$line\n";
    }
    close($mpi_out);
}

1;