source: gs2-extensions/parallel-building/trunk/src/bin/script/parse_task_info_from_hadoop_log.pl@27041

Last change on this file since 27041 was 27041, checked in by jmt12, 11 years ago

INC path now includes the installed extensions perl path (including looking for the correct version number)

  • Property svn:executable set to *
File size: 6.0 KB
#!/usr/bin/perl

# Pragma
use strict;
use warnings;

# Configuration
my $debug = 0;

# Requires setup.bash to have been sourced
BEGIN
{
  die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
  die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
  die "HADOOP_PREFIX not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HADOOP_PREFIX'};
  die "HDFS HOST not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSHOST'};
  die "HDFS PORT not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSPORT'};
  # Ensure Greenstone Perl locations are in INC
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib');
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib/cpan');
  # we'll need the perl version number
  my ($version_number) = $^V =~ /(\d+\.\d+\.\d+)/;
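  # (e.g. a $^V of v5.10.1 yields '5.10.1', used below to locate the
  #  version-specific lib directory of each installed extension)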
  if (defined $ENV{'GSDLEXTS'})
  {
    my @extensions = split(/:/, $ENV{'GSDLEXTS'});
    foreach my $e (@extensions)
    {
      my $ext_prefix = $ENV{'GSDLHOME'} . '/ext/' . $e;
      unshift (@INC, $ext_prefix . '/perllib');
      unshift (@INC, $ext_prefix . '/perllib/cpan');
      unshift (@INC, $ext_prefix . '/' . $ENV{'GSDLOS'} . '/lib/perl/' . $version_number);
    }
  }
}

# Libraries (depends on the unshift calls above)
use Sort::Key::Natural qw(natsort);

# Begin
print "===== Parse Hadoop Log =====\n";

# 0. Init
if (!defined $ARGV[0])
{
  die("usage: parse_task_info_from_hadoop_log.pl <results dir>\n");
}
my $results_dir = $ARGV[0];
if (!-d $results_dir)
{
  die("Error! Can't find results directory: " . $results_dir . "\n");
}
print " Results directory: " . $results_dir . "\n";

my $hadoop_log_path = &fileCat($results_dir, 'hadoop.log');
if (!-e $hadoop_log_path)
{
  die("Error! Hadoop log file cannot be found: " . $hadoop_log_path . "\n");
}
print " Hadoop log path: " . $hadoop_log_path . "\n";

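# The JobTracker log is named hadoop-<username>-jobtracker-<hostname>.log, so
# capture both values in order to locate it under $HADOOP_PREFIX/logs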
my $username = `whoami`;
chomp($username);
print " Username: " . $username . "\n";
my $hostname = `hostname`;
chomp($hostname);
print " Hostname: " . $hostname . "\n";
my $jobtracker_log_path = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log');
if (!-e $jobtracker_log_path)
{
  die("Error! Hadoop JobTracker log file cannot be found: " . $jobtracker_log_path . "\n");
}
print " Jobtracker log path: " . $jobtracker_log_path . "\n";
my $data_locality_report_path = &fileCat($results_dir, 'data_locality.csv');
print " Report path: " . $data_locality_report_path . "\n";

# 1. Determine job ID
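# - scrape the hadoop.log captured in the results directory for the
#   "Running job: job_<id>" line and keep the numeric <id> portion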
print " * Determine JobID: ";
my $job_id;
my $result = `grep "Running job:" "$hadoop_log_path"`;
if ($result =~ /Running job: job_(\d+_\d+)/)
{
  $job_id = $1;
}
else
{
  die("Error! Failed to locate JobID\n");
}
print $job_id . "\n";

# 2. Parse log
print " * Parse JobTracker Log... ";
my $tid_2_splits = {};
my $tid_2_node = {};
my $aid_2_node = {};
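# Lookup tables built from the JobTracker log:
# - tid_2_splits: task id => list of compute nodes holding that task's input splits
# - aid_2_node:   attempt id => { compute_node, succeeded } for each MAP attempt
#   (tid_2_node is declared but not populated below)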
if (open(JTLIN, '<', $jobtracker_log_path))
{
  my $line = '';
  while ($line = <JTLIN>)
  {
    # Tips provide a match between task and file splits
    if ($line =~ /tip:task_${job_id}(_m_\d+) has split on node:\/default-rack\/([^\.]+).local/)
    {
      my $task_id = $job_id . $1;
      my $compute_node = $2;
      &debugPrint('found tip: ' . $task_id . ' => ' . $compute_node);
      if (!defined $tid_2_splits->{$task_id})
      {
        $tid_2_splits->{$task_id} = [$compute_node];
      }
      else
      {
        push(@{$tid_2_splits->{$task_id}}, $compute_node);
      }
    }
    # JobTracker (MAP) entries give us a mapping between task, attempt, and
    # compute node
    if ($line =~ /Adding task \(MAP\) 'attempt_${job_id}(_m_\d+)(_\d+)'.*tracker_([^\.]+).local/)
    {
      my $task_id = $job_id . $1;
      my $attempt_id = $job_id . $1 . $2;
      my $compute_node = $3;
      &debugPrint('found MAP: ' . $attempt_id . ' => ' . $compute_node);
      $aid_2_node->{$attempt_id} = {'compute_node' => $compute_node,
                                    'succeeded' => 0
                                   };
    }
    # Watch for attempt successes (so we can weed out failures)
    if ($line =~ /Task 'attempt_${job_id}(_m_\d+_\d+)' has completed .* successfully/)
    {
      my $attempt_id = $job_id . $1;
      &debugPrint('successful attempt: ' . $attempt_id);
      if (defined $aid_2_node->{$attempt_id})
      {
        $aid_2_node->{$attempt_id}->{'succeeded'} = 1;
      }
    }
  }
  close(JTLIN);
}
else
{
  die("Error! Failed to open JobTracker log for reading: " . $jobtracker_log_path . "\n");
}
print "Done\n";


# 3. Write CSV of information
print " * Writing Job Information... ";
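# - when $debug is set, dump the raw attempt and split mappings first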
&debugPrint("AttemptID\tComputeNode\tSucceeded");
foreach my $attempt_id (keys %{$aid_2_node})
{
  &debugPrint($attempt_id . "\t" . $aid_2_node->{$attempt_id}->{'compute_node'} . "\t" . $aid_2_node->{$attempt_id}->{'succeeded'});
}
&debugPrint("TaskID\tComputeNodeSplits");
foreach my $task_id (keys %{$tid_2_splits})
{
  &debugPrint($task_id . "\t" . join(',', natsort(@{$tid_2_splits->{$task_id}})));
}
# - open the CSV file and write out the combined information from above
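# - an attempt counts as data local ('Y') when the node it ran on appears among
#   the nodes reported as holding splits for its task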
if (open(CSVOUT, '>:utf8', $data_locality_report_path))
{
  print CSVOUT "TaskNo,AttemptNo,Data Local,Compute Node,Splits\n";
  foreach my $attempt_id (natsort(keys %{$aid_2_node}))
  {
    my ($job_id, $task_number, $attempt_number) = $attempt_id =~ /^(\d+_\d+)_m_(\d+)_(\d+)/;
    my $task_id = $job_id . '_m_' . $task_number;
    my $compute_node = $aid_2_node->{$attempt_id}->{'compute_node'};
    my @splits = @{$tid_2_splits->{$task_id}};
    my $data_local = 'N';
    if (grep($_ eq $compute_node, @splits))
    {
      $data_local = 'Y';
    }
    print CSVOUT $task_number . "," . $attempt_number . "," . $data_local . "," . $compute_node . ",\"" . join(',', natsort(@splits)) . "\"\n";
  }
  close(CSVOUT);
}
else
{
  die("Error! Failed to open file for writing: " . $data_locality_report_path);
}
print "Done\n";

# 4. Done
print "===== Complete! =====\n\n";
exit;

# Subs

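# Print a message only when the $debug flag at the top of the script is set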
sub debugPrint
{
  my ($msg) = @_;
  if ($debug)
  {
    print '[debug] ' . $msg . "\n";
  }
}

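# Join the given path components with '/' and collapse any repeated separators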
sub fileCat
{
  my $path = join('/', @_);
  $path =~ s/\/\/+/\//g;
  return $path;
}