root/gs2-extensions/parallel-building/trunk/src/bin/script/parse_task_info_from_hadoop_log.pl @ 27587

Revision 27587, 6.4 KB (checked in by jmt12, 7 years ago)

Allow debug mode to be enabled from the command line

  • Property svn:executable set to *
Line 
#!/usr/bin/perl

# Pragma
use strict;
use warnings;

# Configuration
# Debug tracing flag: off by default, switched on when '-debug' is passed as
# the second command line argument (see the argument handling further down).
my $debug = 0;
# Requires setup.bash to have been sourced
BEGIN
{
  # Fail fast unless the Greenstone and Hadoop environments are configured.
  die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
  die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
  die "HADOOP_PREFIX not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HADOOP_PREFIX'};
  die "HDFS HOST not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSHOST'};
  die "HDFS PORT not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSPORT'};
  # Ensure Greenstone Perl locations are in INC
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib');
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib/cpan');
  # we'll need the perl version number
  # - NOTE(review): backticks in list context capture lines; presumably
  #   perl-version.pl prints a single version token — confirm it has no
  #   trailing newline baked into the paths built below
  my ($version_number) = `perl-version.pl`;
  if (defined $ENV{'GSDLEXTS'})
  {
    # Each colon-separated extension contributes its own perllib (and
    # version-specific compiled lib) directories to the search path.
    my @extensions = split(/:/,$ENV{'GSDLEXTS'});
    foreach my $e (@extensions)
    {
      my $ext_prefix = $ENV{'GSDLHOME'} . '/ext/' . $e;
      unshift (@INC, $ext_prefix . '/perllib');
      unshift (@INC, $ext_prefix . '/perllib/cpan');
      unshift (@INC, $ext_prefix . '/' . $ENV{'GSDLOS'} . '/lib/perl/' . $version_number);
    }
  }
}
35
# Libraries (depends on the @INC unshifts in the BEGIN block above)
37use Sort::Key::Natural qw(natsort);
38
# Begin
print "===== Parse Hadoop Log =====\n";

# 0. Init
# - the results directory (first argument) is required; '-debug' is optional
defined $ARGV[0]
  or die("usage: parse_task_info_from_hadoop_log.pl <results dir> [-debug]\n");
my $results_dir = $ARGV[0];
-d $results_dir
  or die("Error! Can't find results directory: " . $results_dir . "\n");
print " Results directory: $results_dir\n";

# - enable debug tracing when requested on the command line
$debug = 1 if (defined $ARGV[1] && $ARGV[1] eq '-debug');
58
# 1. Determine job ID
# - grep the run's hadoop.log for the "Running job" line that names the job
my $hadoop_log_path = &fileCat($results_dir, 'hadoop.log');
-e $hadoop_log_path
  or die("Error! Hadoop log file cannot be found: " . $hadoop_log_path . "\n");
print " Hadoop log path: $hadoop_log_path\n";
print " * Determine JobID: ";
my $grep_output = `grep "Running job:" "$hadoop_log_path"`;
my ($job_id) = $grep_output =~ /Running job: job_(\d+_\d+)/;
defined $job_id
  or die("Error! Failed to locate JobID\n");
print "$job_id\n";
# - the job ID starts with a YYYYMMDD timestamp; we'll need that date to
#   locate the appropriate (possibly date-rotated) jobtracker log file
my $log_date_suffix = '';
if ($job_id =~ /^(\d{4})(\d{2})(\d{2})/)
{
  $log_date_suffix = '.' . $1 . '-' . $2 . '-' . $3;
}
84
# 2. Determine appropriate job tracker log
# - the jobtracker log filename encodes the current user and host, and may
#   carry a date suffix if the log has been rotated
chomp(my $username = `whoami`);
print " Username: $username\n";
chomp(my $hostname = `hostname`);
print " Hostname: $hostname\n";
my $log_filename = 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log';
my $jobtracker_log_path = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', $log_filename . $log_date_suffix);
if (!-e $jobtracker_log_path)
{
  # - fall back to the un-dated log name before giving up
  $jobtracker_log_path = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', $log_filename);
  -e $jobtracker_log_path
    or die("Error! Hadoop JobTracker log file cannot be found: " . $jobtracker_log_path . "\n");
}
print " Jobtracker log path: $jobtracker_log_path\n";
my $data_locality_report_path = &fileCat($results_dir, 'data_locality.csv');
print " Report path: $data_locality_report_path\n";
104
# 3. Parse log
# Scan the JobTracker log line by line and build two lookup tables:
#  - $tid_2_splits: task ID => list of compute nodes holding one of that
#    task's input splits (a task can have several split locations)
#  - $aid_2_node:   attempt ID => {compute_node, succeeded} for every MAP
#    attempt the JobTracker scheduled
# NOTE(review): $tid_2_node is declared but never populated or read below —
# presumably a leftover; confirm before removing.
print " * Parse JobTracker Log... ";
my $tid_2_splits = {};
my $tid_2_node = {};
my $aid_2_node = {};
if (open(JTLIN, '<', $jobtracker_log_path))
{
  my $line = '';
  while ($line = <JTLIN>)
  {
    # Tips provide a match between task and file splits
    # e.g. "tip:task_<job>_m_000000 has split on node:/default-rack/<host>"
    # (hostname captured up to the first dot, i.e. without domain)
    if ($line =~ /tip:task_${job_id}(_m_\d+) has split on node:\/default-rack\/([^\.\r\n]+)/)
    {
      my $task_id = $job_id . $1;
      my $compute_node = $2;
      &debugPrint('found tip: ' . $task_id . ' => ' . $compute_node);
      # - first split location starts the list; later ones are appended
      if (!defined $tid_2_splits->{$task_id})
      {
        $tid_2_splits->{$task_id} = [$compute_node];
      }
      else
      {
        push(@{$tid_2_splits->{$task_id}}, $compute_node);
      }
    }
    # JobTracker (MAP) entries give us a mapping between task, attempt, and
    # compute node
    # (attempts start marked as failed; flipped to 1 on a success line below)
    if ($line =~ /Adding task \(MAP\) 'attempt_${job_id}(_m_\d+)(_\d+)'.*tracker_([^\.]+).local/)
    {
      my $task_id = $job_id . $1;
      my $attempt_id = $job_id . $1 . $2;
      my $compute_node = $3;
      &debugPrint('found MAP: ' . $attempt_id . ' => ' . $compute_node);
      $aid_2_node->{$attempt_id} = {'compute_node' => $compute_node,
                                    'succeeded' => 0
                                   };
    }
    # Watch for attempt successes (so we can weed out failures)
    if ($line =~ /Task 'attempt_${job_id}(_m_\d+_\d+)' has completed .* successfully/)
    {
      my $attempt_id = $job_id . $1;
      &debugPrint('successful attempt: ' . $attempt_id);
      if (defined $aid_2_node->{$attempt_id})
      {
        $aid_2_node->{$attempt_id}->{'succeeded'} = 1;
      }
    }
  }
  close(JTLIN);
}
else
{
  die("Error! Failed to open JobTracker log for reading: " . $jobtracker_log_path . "\n");
}
print "Done\n";
160
# 4. Write CSV of information
# Combine the attempt => node and task => splits tables into a CSV report
# with one row per MAP attempt, flagging whether the attempt was data local
# (ran on a node that also held one of the task's input splits).
print " * Writing Job Information... ";
# - debug dump of everything parsed from the JobTracker log
&debugPrint("\nAttemptID\tComputeNode\tSucceeded");
foreach my $attempt_id (keys %{$aid_2_node})
{
  &debugPrint($attempt_id . "\t" . $aid_2_node->{$attempt_id}->{'compute_node'} . "\t" . $aid_2_node->{$attempt_id}->{'succeeded'});
}
&debugPrint("TaskID\tComputeNodeSplits");
foreach my $task_id (keys %{$tid_2_splits})
{
  &debugPrint($task_id . "\t" . join(',', natsort(@{$tid_2_splits->{$task_id}})));
}

# - open the CSV file and write out the combined information from above
if (open(my $csv_out, '>:utf8', $data_locality_report_path))
{
  print $csv_out "TaskNo,AttemptNo,Data Local,Compute Node,Splits\n";
  foreach my $attempt_id (natsort(keys %{$aid_2_node}))
  {
    # - named $attempt_job_id so as not to shadow the file-level $job_id
    my ($attempt_job_id, $task_number, $attempt_number) = $attempt_id =~ /^(\d+_\d+)_m_(\d+)_(\d+)/;
    my $task_id = $attempt_job_id . '_m_' . $task_number;
    my $compute_node = $aid_2_node->{$attempt_id}->{'compute_node'};
    # - guard against attempts whose task has no recorded splits: without the
    #   '|| []' fallback, dereferencing undef is fatal under strict refs
    my @splits = @{$tid_2_splits->{$task_id} || []};
    my $data_local = 0;
    if (grep($_ eq $compute_node, @splits))
    {
      $data_local = 1;
    }
    print $csv_out $task_number . "," . $attempt_number . "," . $data_local . ",\"" . $compute_node . "\",\"" . join(',', natsort(@splits)) . "\"\n";
  }
  # - buffered write errors only surface at close, so check it
  close($csv_out)
    or die("Error! Failed to write file: " . $data_locality_report_path . "\n");
}
else
{
  die("Error! Failed to open file for writing: " . $data_locality_report_path);
}
print "Done\n";

# 5. Done
print "===== Complete! =====\n\n";
exit;
203
204# Subs
205
# Print a message to STDOUT prefixed with '[debug] ', but only when the
# file-level $debug flag has been switched on via the -debug argument.
# @param $message the text to emit (a trailing newline is appended)
sub debugPrint
{
  my ($message) = @_;
  return unless ($debug);
  print '[debug] ' . $message . "\n";
}
214
# Join the given path components with '/' separators, collapsing any run of
# repeated slashes down to a single one.
# @param  list of path components
# @return the joined path string
sub fileCat
{
  my $joined = join('/', @_);
  # tr///s ("squeeze") reduces each run of '/' characters to one
  $joined =~ tr{/}{/}s;
  return $joined;
}
Note: See TracBrowser for help on using the browser.