source: gs2-extensions/parallel-building/trunk/src/bin/script/parallel_dspace_filtermedia.pl@ 26999

Last change on this file since 26999 was 26999, checked in by jmt12, 11 years ago

Ensuring MPI binds to correct interface, and passing through environment variable (GSDLHOME) to compute nodes

  • Property svn:executable set to *
File size: 6.4 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# Enhance the DSpace media filter component with parallel processing
6# capability. Currently only processes media of dc.type "Video" (which will
7# be manually assigned to the *.TS files produced by ReplayMe!)
8#
9# Copyright (C) 2012 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package parallel_dspace_filtermedia;
28
# Compile-time setup: insist on the required environment variables and put
# the library directories under GSDLHOME (and under each extension listed in
# GSDLEXTS) at the front of the module search path.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'} or die "GSDLOS not set\n";

    # Library subdirectories, in the order they are unshifted onto @INC.
    # Because each one is placed at the front, the last entry listed here
    # ends up being searched first.
    my @lib_subdirs = ('perllib', 'perllib/cpan', 'perllib/plugins', 'perllib/plugouts');

    # The base installation is handled first, followed by every extension
    # named in the colon-separated GSDLEXTS variable (when it is defined).
    my @roots = ($ENV{'GSDLHOME'});
    if (defined $ENV{'GSDLEXTS'}) {
        push(@roots, map { "$ENV{'GSDLHOME'}/ext/$_" } split(/:/, $ENV{'GSDLEXTS'}));
    }

    for my $root (@roots) {
        for my $subdir (@lib_subdirs) {
            unshift(@INC, "$root/$subdir");
        }
    }
}
49
50use strict;
51
# Print an optional error message followed by the usage summary to STDERR,
# then terminate the script.
#
# Parameters:
#   $message - (optional) description of what was wrong with the arguments
#
# Does not return. The exit status is 1 when an error message was supplied,
# so that a calling script can detect the failure, and 0 when only the usage
# text was requested.
sub printUsage
{
    my ($message) = @_;
    my $exit_status = 0;
    if (defined $message)
    {
        print STDERR 'Error! ' . $message . "\n";
        $exit_status = 1;
    }
    print STDERR 'Usage: parallel_dspace_filtermedia.pl -workers <int> -dspacehome <path> [-debug]' . "\n\n";
    exit($exit_status);
}
62
# Run a shell command via backticks (its standard output is captured and
# discarded) and warn on STDERR when it did not finish cleanly.
#
# Parameters:
#   $command - the complete shell command line to execute
#   $debug   - when true, echo the command to STDOUT before running it
#
# Returns the raw wait status ($?) of the command.
sub _runCommand
{
    my ($command, $debug) = @_;
    if ($debug)
    {
        print '[DEBUG: ' . $command . "]\n";
    }
    `$command`;
    my $status = $?;
    if ($status != 0)
    {
        print STDERR 'Warning! Command did not complete successfully (wait status ' . $status . '): ' . $command . "\n";
    }
    return $status;
}

# Script entry point.
#
# Reads the command line (-dspacehome <path>, -workers <int>, [-debug]),
# exports the DSpace metadata to a temporary CSV file, collects the handle
# identifiers of every line that mentions the type "Video", and then runs the
# media filter over those items: through mpirun and mpidspacemediafilter when
# the worker count is greater than zero, or one item at a time through
# "dspace filter-media" otherwise.
#
# Dies when the metadata file cannot be read or the file list cannot be
# written. Invalid arguments are reported through printUsage(), which exits.
sub main
{
    print 'Parallel Media Filter started: ' . `date` . "\n";

    # 1. Initialization
    my $dspace_home = '';
    my $worker_count = 0;
    my $debug = 0;
    # An index loop is used because options taking a value consume the
    # following element of @ARGV as well.
    for (my $i = 0; $i < scalar(@ARGV); $i++)
    {
        my $argument = $ARGV[$i];
        if ('-dspacehome' eq $argument)
        {
            $i++;
            $dspace_home = $ARGV[$i];
            # The defined check covers the option being the last argument
            if (!defined $dspace_home || !-d $dspace_home)
            {
                printUsage('DSpace home path given doesn\'t exist or isn\'t a directory');
            }
        }
        elsif ('-workers' eq $argument)
        {
            $i++;
            $worker_count = $ARGV[$i];
            if (!defined $worker_count || $worker_count !~ /^\d+$/)
            {
                printUsage('Worker count must be an integer');
            }
        }
        elsif ('-debug' eq $argument)
        {
            $debug = 1;
        }
        else
        {
            printUsage('Unrecognized argument: ' . $argument);
        }
    }
    if ('' eq $dspace_home)
    {
        printUsage('Path to dspace home required!');
    }

    # 2. Run metadata export to get the identifiers of video items in the
    #    collection
    print ' * Retrieving the list of video item identifiers...' . "\n";
    my @video_item_identifiers;
    my $metadata_file_path = $dspace_home . '/log/metadata-' . time() . '.csv';
    my $metadata_export_cmd = '"' . $dspace_home . '/bin/dspace" metadata-export -f "' . $metadata_file_path . '"';
    _runCommand($metadata_export_cmd, $debug);
    open(my $metadata_fh, '<:utf8', $metadata_file_path) or die('Error! Failed to open metadata file for reading: ' . $metadata_file_path . ': ' . $!);
    while (my $line = <$metadata_fh>)
    {
        # A line is of interest when it holds a quoted handle URL followed,
        # later on the same line, by the quoted value "Video"
        if ($line =~ m{"http://hdl\.handle\.net/([^"]+)",.*"Video"})
        {
            my $identifier = $1;
            push(@video_item_identifiers, $identifier);
            if ($debug)
            {
                print '[DEBUG: parsed identifier: ' . $identifier . "]\n";
            }
        }
    }
    close($metadata_fh);
    unlink($metadata_file_path);

    if ($worker_count > 0)
    {
        print " - Parallel processing video files...\n";
        # 3. Write the identifiers to the file list
        my $file_list_path = $dspace_home . '/log/filelist' . time() . '.txt';
        open(my $file_list_fh, '>:utf8', $file_list_path) or die('Error! Failed to open file for writing: ' . $file_list_path . ': ' . $!);
        foreach my $video_item_identifier (@video_item_identifiers)
        {
            print {$file_list_fh} $video_item_identifier . "\n";
        }
        # Buffered write errors only surface when the handle is closed
        close($file_list_fh) or die('Error! Failed to finish writing file: ' . $file_list_path . ': ' . $!);

        # 4. Invoke mpidspacemediafilter (via mpirun) with the file list
        #    created above. mpirun is told how many processes to start: one
        #    more than the requested number of workers.

        # Determine if we've been provided an mpi.conf file to indicate the
        # other machines (slave nodes) this parallelizable process should run
        # on
        my $mpi_conf_path = $dspace_home . '/mpi.conf';
        my $mpi_flags = '--show-progress --timestamp-output ';
        if (-f $mpi_conf_path)
        {
            print STDERR " ***** CLUSTER MODE *****\n";
            $mpi_flags .= '-machinefile "' . $mpi_conf_path . '" ';
            #$mpi_flags .= '-nolocal ';
        }
        else
        {
            print STDERR " ***** SINGLE COMPUTER MODE *****\n";
        }
        # Excessive force! Ensure we bind to the correct network interface
        $mpi_flags .= '--mca btl tcp,sm,self --mca btl_tcp_if_include eth0 ';

        my $mpi_cmd = 'mpirun ' . $mpi_flags . ' -np ' . ($worker_count + 1) . ' "' . $dspace_home . '/bin/mpidspacemediafilter" "' . $ENV{'GSDLHOME'} . '" "' . $dspace_home . '" "' . $file_list_path . '"';
        _runCommand($mpi_cmd, $debug);
        unlink($file_list_path);
    }
    # 5. With 0 worker threads we run a serial process calling media filter
    #    directly
    else
    {
        print " - Serial processing video files...\n";
        foreach my $video_item_identifier (@video_item_identifiers)
        {
            print ' - processing item: ' . $video_item_identifier . "\n";
            my $serial_mediafilter_cmd = '"' . $dspace_home . '/bin/dspace" filter-media -f -i "' . $video_item_identifier . '"';
            _runCommand($serial_mediafilter_cmd, $debug);
        }
    }
    # 6. Complete!
    print 'Parallel Media Filter Complete: ' . `date` . "\n";
}

main();
Note: See TracBrowser for help on using the repository browser.