source: gs2-extensions/parallel-building/trunk/src/bin/script/parse_task_info_from_hadoop_log.pl@27587

Last change on this file since 27587 was 27587, checked in by jmt12, 11 years ago

Allow debug mode to be enabled from the command line

  • Property svn:executable set to *
File size: 6.4 KB
#!/usr/bin/perl
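#
# parse_task_info_from_hadoop_log.pl: cross-reference the hadoop.log captured
# in a results directory with the matching JobTracker log to work out, for
# each map attempt, which compute node it ran on and whether that node held
# one of the task's input splits (data locality); the combined details are
# written to data_locality.csv in the results directory.
#
# Usage: parse_task_info_from_hadoop_log.pl <results dir> [-debug]
#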

# Pragma
use strict;
use warnings;

# Configuration
my $debug = 0;

# Requires setup.bash to have been sourced
BEGIN
{
  die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
  die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
  die "HADOOP_PREFIX not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HADOOP_PREFIX'};
  die "HDFSHOST not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSHOST'};
  die "HDFSPORT not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSPORT'};
  # Ensure Greenstone Perl locations are in INC
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib');
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib/cpan');
  # We'll need the Perl version number
  my ($version_number) = `perl-version.pl`;
  if (defined $ENV{'GSDLEXTS'})
  {
    my @extensions = split(/:/, $ENV{'GSDLEXTS'});
    foreach my $e (@extensions)
    {
      my $ext_prefix = $ENV{'GSDLHOME'} . '/ext/' . $e;
      unshift (@INC, $ext_prefix . '/perllib');
      unshift (@INC, $ext_prefix . '/perllib/cpan');
      unshift (@INC, $ext_prefix . '/' . $ENV{'GSDLOS'} . '/lib/perl/' . $version_number);
    }
  }
}
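# Note: @INC has to be extended inside BEGIN (i.e. at compile time) so that
# the 'use Sort::Key::Natural' below can locate the module in the Greenstone
# directories added above.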

# Libraries (depends on the unshift in BEGIN above)
use Sort::Key::Natural qw(natsort);

# Begin
print "===== Parse Hadoop Log =====\n";

# 0. Init
if (!defined $ARGV[0])
{
  die("usage: parse_task_info_from_hadoop_log.pl <results dir> [-debug]\n");
}
my $results_dir = $ARGV[0];
if (!-d $results_dir)
{
  die("Error! Can't find results directory: " . $results_dir . "\n");
}
print " Results directory: " . $results_dir . "\n";

if (defined $ARGV[1] && $ARGV[1] eq '-debug')
{
  $debug = 1;
}

# 1. Determine job ID
my $hadoop_log_path = &fileCat($results_dir, 'hadoop.log');
if (!-e $hadoop_log_path)
{
  die("Error! Hadoop log file cannot be found: " . $hadoop_log_path . "\n");
}
print " Hadoop log path: " . $hadoop_log_path . "\n";
print " * Determine JobID: ";
my $job_id;
my $result = `grep "Running job:" "$hadoop_log_path"`;
if ($result =~ /Running job: job_(\d+_\d+)/)
{
  $job_id = $1;
}
else
{
  die("Error! Failed to locate JobID\n");
}
print $job_id . "\n";
# - we'll need the date to locate the appropriate log file
my $log_date_suffix = '';
if ($job_id =~ /^(\d\d\d\d)(\d\d)(\d\d)/)
{
  $log_date_suffix = '.' . $1 . '-' . $2 . '-' . $3;
}
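# Note: under the Hadoop 1.x JobTracker, job IDs take the form
# job_<YYYYMMDDHHMM>_<NNNN>, where the timestamp records when the JobTracker
# started; the first eight digits matched above thus give the date of the
# rolled log file the job should appear in.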

# 2. Determine the appropriate JobTracker log
my $username = `whoami`;
chomp($username);
print " Username: " . $username . "\n";
my $hostname = `hostname`;
chomp($hostname);
print " Hostname: " . $hostname . "\n";
my $jobtracker_log_path = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log' . $log_date_suffix);
if (!-e $jobtracker_log_path)
{
  # - fall back to the current (unrolled) log file
  $jobtracker_log_path = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log');
  if (!-e $jobtracker_log_path)
  {
    die("Error! Hadoop JobTracker log file cannot be found: " . $jobtracker_log_path . "\n");
  }
}
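# Note: with Hadoop's default daily log rolling, past days' JobTracker logs
# carry a .YYYY-MM-DD suffix while the current day's log keeps the bare .log
# name, which is why the dated path is tried first with an undated fallback.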
print " Jobtracker log path: " . $jobtracker_log_path . "\n";
my $data_locality_report_path = &fileCat($results_dir, 'data_locality.csv');
print " Report path: " . $data_locality_report_path . "\n";

# 3. Parse log
print " * Parse JobTracker Log... ";
my $tid_2_splits = {};
my $aid_2_node = {};
if (open(JTLIN, '<', $jobtracker_log_path))
{
  my $line = '';
  while ($line = <JTLIN>)
  {
    # Tips provide a match between task and file splits
    if ($line =~ /tip:task_${job_id}(_m_\d+) has split on node:\/default-rack\/([^\.\r\n]+)/)
    {
      my $task_id = $job_id . $1;
      my $compute_node = $2;
      &debugPrint('found tip: ' . $task_id . ' => ' . $compute_node);
      if (!defined $tid_2_splits->{$task_id})
      {
        $tid_2_splits->{$task_id} = [$compute_node];
      }
      else
      {
        push(@{$tid_2_splits->{$task_id}}, $compute_node);
      }
    }
    # JobTracker (MAP) entries give us a mapping between task, attempt, and
    # compute node
    if ($line =~ /Adding task \(MAP\) 'attempt_${job_id}(_m_\d+)(_\d+)'.*tracker_([^\.]+)\.local/)
    {
      my $task_id = $job_id . $1;
      my $attempt_id = $job_id . $1 . $2;
      my $compute_node = $3;
      &debugPrint('found MAP: ' . $attempt_id . ' => ' . $compute_node);
      $aid_2_node->{$attempt_id} = {'compute_node' => $compute_node,
                                    'succeeded' => 0
                                   };
    }
    # Watch for attempt successes (so we can weed out failures)
    if ($line =~ /Task 'attempt_${job_id}(_m_\d+_\d+)' has completed .* successfully/)
    {
      my $attempt_id = $job_id . $1;
      &debugPrint('successful attempt: ' . $attempt_id);
      if (defined $aid_2_node->{$attempt_id})
      {
        $aid_2_node->{$attempt_id}->{'succeeded'} = 1;
      }
    }
  }
  close(JTLIN);
}
else
{
  die("Error! Failed to open JobTracker log for reading: " . $jobtracker_log_path . "\n");
}
print "Done\n";
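# For reference, the three patterns above target JobTracker log entries of
# roughly this shape (reconstructed from the regexes rather than quoted from
# real Hadoop output; IDs and hostnames are illustrative):
#   ...tip:task_201312050910_0001_m_000000 has split on node:/default-rack/gsnode1
#   ...Adding task (MAP) 'attempt_201312050910_0001_m_000000_0' ... tracker_gsnode1.local...
#   ...Task 'attempt_201312050910_0001_m_000000_0' has completed ... successfully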


# 4. Write CSV of information
print " * Writing Job Information... ";
&debugPrint("\nAttemptID\tComputeNode\tSucceeded");
foreach my $attempt_id (keys %{$aid_2_node})
{
  &debugPrint($attempt_id . "\t" . $aid_2_node->{$attempt_id}->{'compute_node'} . "\t" . $aid_2_node->{$attempt_id}->{'succeeded'});
}
&debugPrint("TaskID\tComputeNodeSplits");
foreach my $task_id (keys %{$tid_2_splits})
{
  &debugPrint($task_id . "\t" . join(',', natsort(@{$tid_2_splits->{$task_id}})));
}

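# - each CSV row describes one map attempt; a hypothetical row (node names
#   made up) might read: 000000,0,1,"gsnode1","gsnode1,gsnode2"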
# - open the CSV file and write out the combined information from above
if (open(CSVOUT, '>:utf8', $data_locality_report_path))
{
  print CSVOUT "TaskNo,AttemptNo,Data Local,Compute Node,Splits\n";
  foreach my $attempt_id (natsort(keys %{$aid_2_node}))
  {
    my ($attempt_job_id, $task_number, $attempt_number) = $attempt_id =~ /^(\d+_\d+)_m_(\d+)_(\d+)/;
    my $task_id = $attempt_job_id . '_m_' . $task_number;
    my $compute_node = $aid_2_node->{$attempt_id}->{'compute_node'};
    # - guard against attempts whose task never logged a 'tip' entry
    my @splits = defined $tid_2_splits->{$task_id} ? @{$tid_2_splits->{$task_id}} : ();
    # - an attempt is data local if it ran on a node holding one of its splits
    my $data_local = 0;
    if (grep($_ eq $compute_node, @splits))
    {
      $data_local = 1;
    }
    print CSVOUT $task_number . "," . $attempt_number . "," . $data_local . ",\"" . $compute_node . "\",\"" . join(',', natsort(@splits)) . "\"\n";
  }
  close(CSVOUT);
}
else
{
  die("Error! Failed to open file for writing: " . $data_locality_report_path . "\n");
}
print "Done\n";

# 5. Done
print "===== Complete! =====\n\n";
exit;

# Subs

# Print a message, but only when -debug was given on the command line
sub debugPrint
{
  my ($msg) = @_;
  if ($debug)
  {
    print '[debug] ' . $msg . "\n";
  }
}

# Join the given path components with '/' and collapse repeated separators
sub fileCat
{
  my $path = join('/', @_);
  $path =~ s/\/\/+/\//g;
  return $path;
}