root/gs2-extensions/parallel-building/trunk/src/bin/script/parallel_dspace_filtermedia.pl @ 25943

Revision 25943, 6.2 KB (checked in by jmt12, 8 years ago)

Updated script to support cluster processing

  • Property svn:executable set to *
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# Enhance the DSpace media filter component with parallel processing
6# capability. Currently only processes media of dc.type "Video" (which will
7# be manually assigned to the *.TS files produced by ReplayMe!)
8#
9# Copyright (C) 2012 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package parallel_dspace_filtermedia;
28
# Compile-time setup: verify the Greenstone environment is configured and
# put the Greenstone (and any extension) Perl library directories at the
# front of @INC so later module loads can find them.
BEGIN {
    # Both variables must be present; abort immediately otherwise
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    # Library sub-directories, listed in the order they are unshifted (so
    # the last one listed ends up first in @INC for a given prefix)
    my @lib_subdirs = ('perllib', 'perllib/cpan', 'perllib/plugins', 'perllib/plugouts');

    # The core installation is handled first, then each extension named in
    # the colon separated GSDLEXTS list (if any)
    my @prefixes = ($ENV{'GSDLHOME'});
    if (defined $ENV{'GSDLEXTS'}) {
        push(@prefixes, map { "$ENV{'GSDLHOME'}/ext/$_" } split(/:/, $ENV{'GSDLEXTS'}));
    }

    foreach my $prefix (@prefixes) {
        foreach my $lib_subdir (@lib_subdirs) {
            unshift(@INC, "$prefix/$lib_subdir");
        }
    }
}
49
50use strict;
51
# Print an optional error message followed by the usage line to STDERR and
# then terminate the script.
#
# @param $message  (optional) description of the problem with the command
#                  line; when defined it is printed prefixed by 'Error! '
#
# This function never returns. The exit status is 1 when an error message
# was supplied - so a calling shell or script can detect the failure - and 0
# when only the usage text was requested.
sub printUsage
{
  my ($message) = @_;
  my $exit_status = 0;
  if (defined $message)
  {
    print STDERR 'Error! ' . $message . "\n";
    $exit_status = 1;
  }
  print STDERR 'Usage: parallel_dspace_filtermedia.pl -workers <int> -dspacehome <path> [-debug]' . "\n\n";
  exit($exit_status);
}
62
# Script entry point. Reads its options from @ARGV:
#   -dspacehome <path>  directory of the DSpace installation (required)
#   -workers <int>      number of workers; 0 (the default) selects the
#                       serial code path
#   -debug              echo each external command before it is run
#
# Exports the DSpace metadata to a temporary CSV file, extracts the handle
# of every row that mentions "Video", and then runs the media filter over
# those items - either through mpirun/mpidspacemediafilter (workers > 0) or
# by calling 'dspace filter-media' once per item (workers == 0). Temporary
# files are written to <dspacehome>/log and removed afterwards.
#
# Dies if the metadata export or the file list cannot be opened; failures
# of the external commands themselves are reported as warnings on STDERR.
sub main
{
    print 'Parallel Media Filter started: ' . `date` . "\n";

    # 1. Initialization
    my $dspace_home = '';
    my $worker_count = 0;
    my $debug = 0;
    for (my $i = 0; $i < scalar(@ARGV); $i++)
    {
      my $argument = $ARGV[$i];
      if ('-dspacehome' eq $argument)
      {
        $i++;
        # The flag may be the last thing on the command line
        if (!defined $ARGV[$i])
        {
          printUsage('Missing path after -dspacehome');
        }
        $dspace_home = $ARGV[$i];
        if (!-d $dspace_home)
        {
          printUsage('DSpace home path given doesn\'t exist or isn\'t a directory');
        }
      }
      elsif ('-workers' eq $argument)
      {
        $i++;
        # A missing value is treated the same as a non-numeric one
        if (!defined $ARGV[$i] || $ARGV[$i] !~ /^\d+$/)
        {
          printUsage('Worker count must be an integer');
        }
        $worker_count = $ARGV[$i];
      }
      elsif ('-debug' eq $argument)
      {
        $debug = 1;
      }
      else
      {
        printUsage('Unrecognized argument: ' . $argument);
      }
    }
    if ('' eq $dspace_home)
    {
      printUsage('Path to dspace home required!');
    }

    # 2. Run metadata export to get the identifiers of video items in the
    #    collection
    print ' * Retrieving the list of video item identifiers...' . "\n";
    my @video_item_identifiers;
    my $metadata_file_path = $dspace_home . '/log/metadata-' . time() . '.csv';
    my $metadata_export_cmd = '"' . $dspace_home . '/bin/dspace" metadata-export -f "' . $metadata_file_path . '"';
    if ($debug)
    {
      print '[DEBUG: ' . $metadata_export_cmd . "]\n";
    }
    `$metadata_export_cmd`;
    if ($? != 0)
    {
      print STDERR 'Warning! Metadata export command reported failure ($? = ' . $? . ")\n";
    }
    open(my $metadata_fh, '<:utf8', $metadata_file_path) or die('Error! Failed to open metadata file for reading: ' . $metadata_file_path . ': ' . $!);
    while (my $line = <$metadata_fh>)
    {
      # Capture the handle (the part after hdl.handle.net/) from any row
      # that also contains a quoted "Video" value later on the line
      if ($line =~ m{"http://hdl\.handle\.net/([^"]+)",.*"Video"})
      {
        my $identifier = $1;
        push(@video_item_identifiers, $identifier);
        if ($debug)
        {
          print '[DEBUG: parsed identifier: ' . $identifier . "]\n";
        }
      }
    }
    close($metadata_fh);
    unlink($metadata_file_path);

    if ($worker_count > 0)
    {
      print " - Parallel processing video files...\n";
      # 3. Write the identifiers to the file list
      my $file_list_path = $dspace_home . '/log/filelist' . time() . '.txt';
      open(my $file_list_fh, '>:utf8', $file_list_path) or die('Error! Failed to open file for writing: ' . $file_list_path . ': ' . $!);
      foreach my $video_item_identifier (@video_item_identifiers)
      {
        print $file_list_fh $video_item_identifier . "\n";
      }
      # Buffered write errors only surface when the handle is closed
      close($file_list_fh) or die('Error! Failed to finish writing file: ' . $file_list_path . ': ' . $!);
      # 4. Invoke mpidspacemediafilter (via mpirun) with the filelist
      #    created above. MPIRun takes the number of processes to spawn

      # Determine if we've been provided a mpi.conf file to indicate the other
      # machines (slave nodes) this parallelizable process should run on.
      # The path is built by plain concatenation, like every other path in
      # this function, so no extra module needs to be loaded.
      my $mpi_conf_path = $dspace_home . '/mpi.conf';
      my $mpi_flags = '--show-progress --timestamp-output --verbose --report-bindings --tag-output';
      if (-f $mpi_conf_path)
      {
        print STDERR " ***** CLUSTER MODE *****\n";
        $mpi_flags .= ' -nolocal -machinefile "' . $mpi_conf_path . '"';
      }
      else
      {
        print STDERR " ***** SINGLE COMPUTER MODE *****\n";
      }

      # One process more than the requested worker count is started
      # (presumably for a coordinating master process - confirm against
      # mpidspacemediafilter)
      my $mpi_cmd = 'mpirun ' . $mpi_flags . ' -np ' . ($worker_count + 1) . ' "' . $dspace_home . '/bin/mpidspacemediafilter" "' . $dspace_home . '" "' . $file_list_path . '"';
      if ($debug)
      {
        print '[DEBUG: ' . $mpi_cmd . "]\n";
      }
      `$mpi_cmd`;
      if ($? != 0)
      {
        print STDERR 'Warning! mpirun command reported failure ($? = ' . $? . ")\n";
      }
      unlink($file_list_path);
    }
    # 5. With 0 worker threads we run a serial process calling media filter
    #    directly
    else
    {
      print " - Serial processing video files...\n";
      foreach my $video_item_identifier (@video_item_identifiers)
      {
        print ' - processing item: ' . $video_item_identifier . "\n";
        my $serial_mediafilter_cmd = '"' . $dspace_home . '/bin/dspace" filter-media -f -i "' . $video_item_identifier . '"';
        if ($debug)
        {
          print '[DEBUG: ' . $serial_mediafilter_cmd . "]\n";
        }
        `$serial_mediafilter_cmd`;
        # Report the failure but carry on with the remaining items
        if ($? != 0)
        {
          print STDERR 'Warning! Media filter reported failure for item ' . $video_item_identifier . ' ($? = ' . $? . ")\n";
        }
      }
    }
    # 6. Complete!
    print 'Parallel Media Filter Complete: ' . `date` . "\n";
}
192
# Run the media filter as soon as the script has been compiled.
main();
Note: See TracBrowser for help on using the browser.