source: gs2-extensions/parallel-building/trunk/src/bin/script/parse_task_info_from_hadoop_log.pl@ 27124

Last change on this file since 27124 was 27124, checked in by jmt12, 11 years ago

Use the new perl version script to extract the version number - so as to allow us to load extension specific perl modules

  • Property svn:executable set to *
File size: 6.0 KB
Line 
#!/usr/bin/perl

# Pragma
use strict;
use warnings;

# Configuration
# - set to 1 to enable the &debugPrint trace output used throughout the script
my $debug = 0;

# Requires setup.bash to have been sourced
BEGIN
{
  die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
  die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
  die "HADOOP_PREFIX not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HADOOP_PREFIX'};
  die "HDFS HOST not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSHOST'};
  die "HDFS PORT not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSPORT'};
  # Ensure Greenstone Perl locations are in INC
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib');
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib/cpan');
  # we'll need the perl version number (expected on PATH via setup.bash)
  my ($version_number) = `perl-version.pl`;
  # Fix: backtick output keeps its trailing newline - without this chomp the
  # newline ends up embedded in the @INC paths built below, so extension
  # perl modules would never be found
  chomp($version_number) if defined $version_number;
  if (defined $ENV{'GSDLEXTS'})
  {
    my @extensions = split(/:/,$ENV{'GSDLEXTS'});
    foreach my $e (@extensions)
    {
      my $ext_prefix = $ENV{'GSDLHOME'} . '/ext/' . $e;
      unshift (@INC, $ext_prefix . '/perllib');
      unshift (@INC, $ext_prefix . '/perllib/cpan');
      unshift (@INC, $ext_prefix . '/' . $ENV{'GSDLOS'} . '/lib/perl/' . $version_number);
    }
  }
}
35
# Libraries (depend on the @INC unshift performed in the BEGIN block above)
37use Sort::Key::Natural qw(natsort);
38
# Begin
print "===== Parse Hadoop Log =====\n";

# 0. Init
# - one required argument: the results directory containing hadoop.log
die("usage: parse_task_info_from_hadoop_log.pl <results dir>\n") unless defined $ARGV[0];
my $results_dir = $ARGV[0];
die("Error! Can't find results directory: $results_dir\n") unless -d $results_dir;
print " Results directory: $results_dir\n";

my $hadoop_log_path = &fileCat($results_dir, 'hadoop.log');
die("Error! Hadoop log file cannot be found: $hadoop_log_path\n") unless -e $hadoop_log_path;
print " Hadoop log path: $hadoop_log_path\n";

# The JobTracker log lives under the Hadoop install, named after the current
# user and host
my $username = `whoami`;
chomp($username);
print " Username: $username\n";
my $hostname = `hostname`;
chomp($hostname);
print " Hostname: $hostname\n";
my $jobtracker_log_path = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log');
die("Error! Hadoop JobTracker log file cannot be found: $jobtracker_log_path\n") unless -e $jobtracker_log_path;
print " Jobtracker log path: $jobtracker_log_path\n";
my $data_locality_report_path = &fileCat($results_dir, 'data_locality.csv');
print " Report path: $data_locality_report_path\n";

# 1. Determine job ID
# - grep the client-side hadoop.log for the "Running job" line and pull the
#   numeric portion of the job identifier from it
print " * Determine JobID: ";
my $job_id;
my $grep_output = `grep "Running job:" "$hadoop_log_path"`;
if ($grep_output =~ /Running job: job_(\d+_\d+)/)
{
  $job_id = $1;
}
else
{
  die("Error! Failed to locate JobID\n");
}
print $job_id . "\n";
89
# 2. Parse log
# - build two mappings from the JobTracker log:
#     task id    => list of compute nodes holding that task's file splits
#     attempt id => { compute_node, succeeded } for each MAP attempt
print " * Parse JobTracker Log... ";
my $tid_2_splits = {};
my $aid_2_node = {};
# Use a lexical filehandle rather than the bareword global JTLIN
if (open(my $jtlin, '<', $jobtracker_log_path))
{
  while (my $line = <$jtlin>)
  {
    # Tips provide a match between task and file splits
    # (the dot before "local" is now escaped so it only matches a literal '.')
    if ($line =~ /tip:task_${job_id}(_m_\d+) has split on node:\/default-rack\/([^\.]+)\.local/)
    {
      my $task_id = $job_id . $1;
      my $compute_node = $2;
      &debugPrint('found tip: ' . $task_id . ' => ' . $compute_node);
      if (!defined $tid_2_splits->{$task_id})
      {
        $tid_2_splits->{$task_id} = [$compute_node];
      }
      else
      {
        push(@{$tid_2_splits->{$task_id}}, $compute_node);
      }
    }
    # JobTracker (MAP) entries give us a mapping between task, attempt, and
    # compute node
    if ($line =~ /Adding task \(MAP\) 'attempt_${job_id}(_m_\d+)(_\d+)'.*tracker_([^\.]+)\.local/)
    {
      my $attempt_id = $job_id . $1 . $2;
      my $compute_node = $3;
      &debugPrint('found MAP: ' . $attempt_id . ' => ' . $compute_node);
      # Attempts start marked as failed; flipped to 1 on a success line below
      $aid_2_node->{$attempt_id} = {'compute_node' => $compute_node,
                                    'succeeded' => 0
                                   };
    }
    # Watch for attempt successes (so we can weed out failures)
    if ($line =~ /Task 'attempt_${job_id}(_m_\d+_\d+)' has completed .* successfully/)
    {
      my $attempt_id = $job_id . $1;
      &debugPrint('successful attempt: ' . $attempt_id);
      if (defined $aid_2_node->{$attempt_id})
      {
        $aid_2_node->{$attempt_id}->{'succeeded'} = 1;
      }
    }
  }
  close($jtlin);
}
else
{
  die("Error! Failed to open JobTracker log for reading: " . $jobtracker_log_path . "\n");
}
print "Done\n";
145
146
# 3. Write CSV of information
# - dump debug tables, then emit one CSV row per MAP attempt recording
#   whether the attempt ran on a node that held one of its file splits
print " * Writing Job Information... ";
&debugPrint("AttemptID\tComputeNode\tSucceeded");
foreach my $attempt_id (keys %{$aid_2_node})
{
  &debugPrint($attempt_id . "\t" . $aid_2_node->{$attempt_id}->{'compute_node'} . "\t" . $aid_2_node->{$attempt_id}->{'succeeded'});
}
&debugPrint("TaskID\tComputeNodeSplits");
foreach my $task_id (keys %{$tid_2_splits})
{
  &debugPrint($task_id . "\t" . join(',', natsort(@{$tid_2_splits->{$task_id}})));
}
# - open the CSV file and write out the combined information from above
#   (lexical filehandle instead of the bareword global CSVOUT)
if (open(my $csv_out, '>:utf8', $data_locality_report_path))
{
  print {$csv_out} "TaskNo,AttemptNo,Data Local,Compute Node,Splits\n";
  foreach my $attempt_id (natsort(keys %{$aid_2_node}))
  {
    # Renamed first capture to avoid shadowing the script-level $job_id
    my ($attempt_job_id, $task_number, $attempt_number) = $attempt_id =~ /^(\d+_\d+)_m_(\d+)_(\d+)/;
    my $task_id = $attempt_job_id . '_m_' . $task_number;
    my $compute_node = $aid_2_node->{$attempt_id}->{'compute_node'};
    # Fix: a task may have no recorded splits - dereferencing undef here used
    # to die under strict refs, now it degrades to an empty list ('N' row)
    my @splits = defined $tid_2_splits->{$task_id} ? @{$tid_2_splits->{$task_id}} : ();
    my $data_local = (grep {$_ eq $compute_node} @splits) ? 'Y' : 'N';
    print {$csv_out} $task_number . "," . $attempt_number . "," . $data_local . "," . $compute_node . ",\"" . join(',', natsort(@splits)) . "\"\n";
  }
  # Check close on a write handle - buffered write errors surface here
  close($csv_out) or die("Error! Failed to write file: " . $data_locality_report_path);
}
else
{
  die("Error! Failed to open file for writing: " . $data_locality_report_path);
}
print "Done\n";

# 4. Done
print "===== Complete! =====\n\n";
exit;
187
188# Subs
189
# Print a trace message to STDOUT, but only when the script-level $debug
# flag is enabled; silent no-op otherwise.
sub debugPrint
{
  my ($msg) = @_;
  print "[debug] $msg\n" if $debug;
}
198
# Join all arguments into a single path with '/' separators, squeezing any
# accidental runs of repeated slashes down to a single slash.
sub fileCat
{
  my $joined = join('/', @_);
  $joined =~ tr{/}{/}s;
  return $joined;
}
Note: See TracBrowser for help on using the repository browser.