#!/usr/bin/perl -w

###########################################################################
#
# Enhance the DSpace media filter component with parallel processing
# capability. Currently only processes media of dc.type "Video" (which will
# be manually assigned to the *.TS files produced by ReplayMe!)
#
# Copyright (C) 2012 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package parallel_dspace_filtermedia;

# Extend @INC with the Greenstone perl library directories (and those of any
# extensions named in GSDLEXTS) before anything else is compiled.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib/plugouts");

    if (defined $ENV{'GSDLEXTS'}) {
        my @extensions = split(/:/, $ENV{'GSDLEXTS'});
        foreach my $e (@extensions) {
            my $ext_prefix = "$ENV{'GSDLHOME'}/ext/$e";

            unshift(@INC, "$ext_prefix/perllib");
            unshift(@INC, "$ext_prefix/perllib/cpan");
            unshift(@INC, "$ext_prefix/perllib/plugins");
            unshift(@INC, "$ext_prefix/perllib/plugouts");
        }
    }
}

use strict;
use File::Spec;

# Print an optional error message followed by the usage line to STDERR, then
# terminate the script.
#
#   $message - (optional) description of what was wrong with the arguments.
#
# Never returns. Exits with status 1 when an error message was supplied and
# with status 0 otherwise, so that callers of the script can detect failure.
sub printUsage {
    my ($message) = @_;
    if (defined $message) {
        print STDERR 'Error! ' . $message . "\n";
    }
    print STDERR 'Usage: parallel_dspace_filtermedia.pl -workers <int> -dspacehome <path> [-debug]' . "\n\n";
    exit(defined $message ? 1 : 0);
}

# Parse the command line arguments.
#
#   @arguments - the raw argument list (normally @ARGV).
#
# Returns the list ($dspace_home, $worker_count, $debug). Any invalid,
# incomplete or unrecognized argument is reported through printUsage(),
# which terminates the script.
sub parseArguments {
    my @arguments = @_;
    my $dspace_home = '';
    my $worker_count = 0;
    my $debug = 0;

    while (@arguments) {
        my $argument = shift(@arguments);
        if ('-dspacehome' eq $argument) {
            # The value may be missing when the flag is the last argument
            my $path = shift(@arguments);
            if (!defined $path || !-d $path) {
                printUsage('DSpace home path given doesn\'t exist or isn\'t a directory');
            }
            $dspace_home = $path;
        }
        elsif ('-workers' eq $argument) {
            my $count = shift(@arguments);
            if (!defined $count || $count !~ /^\d+$/) {
                printUsage('Worker count must be an integer');
            }
            $worker_count = $count;
        }
        elsif ('-debug' eq $argument) {
            $debug = 1;
        }
        else {
            printUsage('Unrecognized argument: ' . $argument);
        }
    }

    if ('' eq $dspace_home) {
        printUsage('Path to dspace home required!');
    }
    return ($dspace_home, $worker_count, $debug);
}

# Run an external command without involving a shell, discarding whatever it
# writes to standard output.
#
#   $debug   - when true, the command is echoed before it is run.
#   @command - the program followed by its arguments, one per element.
#
# Returns the wait status of the command ($?), or -1 when the command could
# not be started. A warning is written to STDERR in either failure case.
sub runCommand {
    my ($debug, @command) = @_;
    if ($debug) {
        print '[DEBUG: ' . join(' ', @command) . "]\n";
    }

    # List-form pipe open: arguments reach the program verbatim, so paths and
    # identifiers containing shell metacharacters cannot alter the command.
    my $command_out;
    if (!open($command_out, '-|', @command)) {
        print STDERR 'Warning! Failed to run ' . $command[0] . ': ' . $! . "\n";
        return -1;
    }
    # Drain the output so the child never blocks on a full pipe
    while (defined(my $discarded = <$command_out>)) {
    }
    close($command_out);
    my $status = $?;
    if (0 != $status) {
        print STDERR 'Warning! Command exited with status ' . ($status >> 8) . ': ' . join(' ', @command) . "\n";
    }
    return $status;
}

# Run the DSpace metadata export and extract the handle identifiers of all
# rows that mention "Video".
#
#   $dspace_home - path to the DSpace installation.
#   $debug       - when true, print the command and each parsed identifier.
#
# Returns the list of identifiers (the part of the handle URL following
# "http://hdl.handle.net/"). Dies when the exported file cannot be read. The
# temporary export file is removed afterwards.
sub exportVideoIdentifiers {
    my ($dspace_home, $debug) = @_;
    my @video_item_identifiers;
    my $metadata_file_path = $dspace_home . '/log/metadata-' . time() . '.csv';

    runCommand($debug, $dspace_home . '/bin/dspace', 'metadata-export', '-f', $metadata_file_path);

    open(my $metadata_in, '<:utf8', $metadata_file_path)
        or die('Error! Failed to open metadata file for reading: ' . $metadata_file_path . ' (' . $! . ')');
    while (my $line = <$metadata_in>) {
        if ($line =~ m{"http://hdl\.handle\.net/([^"]+)",.*"Video"}) {
            my $identifier = $1;
            push(@video_item_identifiers, $identifier);
            if ($debug) {
                print '[DEBUG: parsed identifier: ' . $identifier . "]\n";
            }
        }
    }
    close($metadata_in);
    unlink($metadata_file_path);

    return @video_item_identifiers;
}

# Process the given items in parallel by handing a file list to
# mpidspacemediafilter via mpirun.
#
#   $dspace_home            - path to the DSpace installation.
#   $worker_count           - number of workers; mpirun is started with one
#                             more process than this.
#   $debug                  - when true, print the mpirun command.
#   @video_item_identifiers - the identifiers to process.
#
# Dies when the temporary file list cannot be written. The file list is
# removed once mpirun has finished.
sub filterInParallel {
    my ($dspace_home, $worker_count, $debug, @video_item_identifiers) = @_;

    # 3. Write the identifiers to the file list
    my $file_list_path = $dspace_home . '/log/filelist' . time() . '.txt';
    open(my $file_list_out, '>:utf8', $file_list_path)
        or die('Error! Failed to open file for writing: ' . $file_list_path . ' (' . $! . ')');
    foreach my $video_item_identifier (@video_item_identifiers) {
        print $file_list_out $video_item_identifier . "\n";
    }
    # Buffered write errors only surface when the handle is closed
    close($file_list_out)
        or die('Error! Failed to write file: ' . $file_list_path . ' (' . $! . ')');

    # 4. Invoke mpidspacemediafilter (via mpirun) with the filelist
    #    created above. MPIRun takes the number of worker threads to spawn.
    #    Determine if we've been provided a mpi.conf file to indicate the
    #    other machines (slave nodes) this parallelizable process should run
    #    on
    my $mpi_conf_path = File::Spec->catfile($dspace_home, 'mpi.conf');
    my @mpi_flags = ('--show-progress', '--timestamp-output', '--verbose', '--report-bindings', '--tag-output');
    if (-f $mpi_conf_path) {
        print STDERR " ***** CLUSTER MODE *****\n";
        push(@mpi_flags, '-nolocal', '-machinefile', $mpi_conf_path);
    }
    else {
        print STDERR " ***** SINGLE COMPUTER MODE *****\n";
    }
    runCommand($debug, 'mpirun', @mpi_flags, '-np', $worker_count + 1,
               $dspace_home . '/bin/mpidspacemediafilter', $dspace_home, $file_list_path);
    unlink($file_list_path);
}

# Process the given items one after another by calling the DSpace media
# filter directly for each identifier.
#
#   $dspace_home            - path to the DSpace installation.
#   $debug                  - when true, print each command before running it.
#   @video_item_identifiers - the identifiers to process.
sub filterSerially {
    my ($dspace_home, $debug, @video_item_identifiers) = @_;
    foreach my $video_item_identifier (@video_item_identifiers) {
        print ' - processing item: ' . $video_item_identifier . "\n";
        runCommand($debug, $dspace_home . '/bin/dspace', 'filter-media', '-f', '-i', $video_item_identifier);
    }
}

# Script entry point: parse @ARGV, find the video items in the DSpace
# metadata export and run the media filter over them, in parallel when a
# positive worker count was given and serially otherwise.
sub main {
    print 'Parallel Media Filter started: ' . `date` . "\n";

    # 1. Initialization
    my ($dspace_home, $worker_count, $debug) = parseArguments(@ARGV);

    # 2. Run metadata export to get the identifiers of video items in the
    #    collection
    print ' * Retrieving the list of video item identifiers...' . "\n";
    my @video_item_identifiers = exportVideoIdentifiers($dspace_home, $debug);

    if ($worker_count > 0) {
        print " - Parallel processing video files...\n";
        filterInParallel($dspace_home, $worker_count, $debug, @video_item_identifiers);
    }
    # 5. With 0 worker threads we run a serial process calling media filter
    #    directly
    else {
        print " - Serial processing video files...\n";
        filterSerially($dspace_home, $debug, @video_item_identifiers);
    }

    # 6. Complete!
    print 'Parallel Media Filter Complete: ' . `date` . "\n";
}

main();