source: gs2-extensions/parallel-building/trunk/src/bin/script/parse_task_info_from_hadoop_log.pl@27041

Last change on this file since 27041 was 27041, checked in by jmt12, 11 years ago

INC path now includes the installed extensions perl path (including looking for the correct version number)

  • Property svn:executable set to *
File size: 6.0 KB
#!/usr/bin/perl

# Pragma
use strict;
use warnings;

# Configuration
my $debug = 0;

# Requires setup.bash to have been sourced
BEGIN
{
  die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
  die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
  die "HADOOP_PREFIX not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HADOOP_PREFIX'};
  die "HDFS HOST not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSHOST'};
  die "HDFS PORT not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSPORT'};
  # Ensure Greenstone Perl locations are in INC
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib');
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib/cpan');
  # we'll need the perl version number
  my ($version_number) = $^V =~ /(\d+\.\d+\.\d+)/;
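  # (e.g. a $^V of v5.10.1 yields '5.10.1', used below to locate the
  #  version-specific lib directory of each installed extension)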
  if (defined $ENV{'GSDLEXTS'})
  {
    my @extensions = split(/:/, $ENV{'GSDLEXTS'});
    foreach my $e (@extensions)
    {
      my $ext_prefix = $ENV{'GSDLHOME'} . '/ext/' . $e;
      unshift (@INC, $ext_prefix . '/perllib');
      unshift (@INC, $ext_prefix . '/perllib/cpan');
      unshift (@INC, $ext_prefix . '/' . $ENV{'GSDLOS'} . '/lib/perl/' . $version_number);
    }
  }
}

# Libraries (depends on the unshift calls above)
use Sort::Key::Natural qw(natsort);

# Begin
print "===== Parse Hadoop Log =====\n";

# 0. Init
if (!defined $ARGV[0])
{
  die("usage: parse_task_info_from_hadoop_log.pl <results dir>\n");
}
my $results_dir = $ARGV[0];
if (!-d $results_dir)
{
  die("Error! Can't find results directory: " . $results_dir . "\n");
}
print " Results directory: " . $results_dir . "\n";

my $hadoop_log_path = &fileCat($results_dir, 'hadoop.log');
if (!-e $hadoop_log_path)
{
  die("Error! Hadoop log file cannot be found: " . $hadoop_log_path . "\n");
}
print " Hadoop log path: " . $hadoop_log_path . "\n";

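# The JobTracker log is named hadoop-<username>-jobtracker-<hostname>.log, so
# capture both values in order to locate it under $HADOOP_PREFIX/logs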
my $username = `whoami`;
chomp($username);
print " Username: " . $username . "\n";
my $hostname = `hostname`;
chomp($hostname);
print " Hostname: " . $hostname . "\n";
my $jobtracker_log_path = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log');
if (!-e $jobtracker_log_path)
{
  die("Error! Hadoop JobTracker log file cannot be found: " . $jobtracker_log_path . "\n");
}
print " Jobtracker log path: " . $jobtracker_log_path . "\n";
my $data_locality_report_path = &fileCat($results_dir, 'data_locality.csv');
print " Report path: " . $data_locality_report_path . "\n";

# 1. Determine job ID
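# - scrape the hadoop.log captured in the results directory for the
#   "Running job: job_<id>" line and keep the numeric <id> portion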
print " * Determine JobID: ";
my $job_id;
my $result = `grep "Running job:" "$hadoop_log_path"`;
if ($result =~ /Running job: job_(\d+_\d+)/)
{
  $job_id = $1;
}
else
{
  die("Error! Failed to locate JobID\n");
}
print $job_id . "\n";

# 2. Parse log
print " * Parse JobTracker Log... ";
my $tid_2_splits = {};
my $tid_2_node = {};
my $aid_2_node = {};
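# Lookup tables built from the JobTracker log:
# - tid_2_splits: task id => list of compute nodes holding that task's input splits
# - aid_2_node:   attempt id => { compute_node, succeeded } for each MAP attempt
#   (tid_2_node is declared but not populated below)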
if (open(JTLIN, '<', $jobtracker_log_path))
{
  my $line = '';
  while ($line = <JTLIN>)
  {
    # Tips provide a match between task and file splits
    if ($line =~ /tip:task_${job_id}(_m_\d+) has split on node:\/default-rack\/([^\.]+).local/)
    {
      my $task_id = $job_id . $1;
      my $compute_node = $2;
      &debugPrint('found tip: ' . $task_id . ' => ' . $compute_node);
      if (!defined $tid_2_splits->{$task_id})
      {
        $tid_2_splits->{$task_id} = [$compute_node];
      }
      else
      {
        push(@{$tid_2_splits->{$task_id}}, $compute_node);
      }
    }
    # JobTracker (MAP) entries give us a mapping between task, attempt, and
    # compute node
    if ($line =~ /Adding task \(MAP\) 'attempt_${job_id}(_m_\d+)(_\d+)'.*tracker_([^\.]+).local/)
    {
      my $task_id = $job_id . $1;
      my $attempt_id = $job_id . $1 . $2;
      my $compute_node = $3;
      &debugPrint('found MAP: ' . $attempt_id . ' => ' . $compute_node);
      $aid_2_node->{$attempt_id} = {'compute_node' => $compute_node,
                                    'succeeded' => 0
                                   };
    }
    # Watch for attempt successes (so we can weed out failures)
    if ($line =~ /Task 'attempt_${job_id}(_m_\d+_\d+)' has completed .* successfully/)
    {
      my $attempt_id = $job_id . $1;
      &debugPrint('successful attempt: ' . $attempt_id);
      if (defined $aid_2_node->{$attempt_id})
      {
        $aid_2_node->{$attempt_id}->{'succeeded'} = 1;
      }
    }
  }
  close(JTLIN);
}
else
{
  die("Error! Failed to open JobTracker log for reading: " . $jobtracker_log_path . "\n");
}
print "Done\n";


# 3. Write CSV of information
print " * Writing Job Information... ";
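# - when $debug is set, dump the raw attempt and split mappings first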
&debugPrint("AttemptID\tComputeNode\tSucceeded");
foreach my $attempt_id (keys %{$aid_2_node})
{
  &debugPrint($attempt_id . "\t" . $aid_2_node->{$attempt_id}->{'compute_node'} . "\t" . $aid_2_node->{$attempt_id}->{'succeeded'});
}
&debugPrint("TaskID\tComputeNodeSplits");
foreach my $task_id (keys %{$tid_2_splits})
{
  &debugPrint($task_id . "\t" . join(',', natsort(@{$tid_2_splits->{$task_id}})));
}
# - open the CSV file and write out the combined information from above
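# - an attempt counts as data local ('Y') when the node it ran on appears among
#   the nodes reported as holding splits for its task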
if (open(CSVOUT, '>:utf8', $data_locality_report_path))
{
  print CSVOUT "TaskNo,AttemptNo,Data Local,Compute Node,Splits\n";
  foreach my $attempt_id (natsort(keys %{$aid_2_node}))
  {
    my ($job_id, $task_number, $attempt_number) = $attempt_id =~ /^(\d+_\d+)_m_(\d+)_(\d+)/;
    my $task_id = $job_id . '_m_' . $task_number;
    my $compute_node = $aid_2_node->{$attempt_id}->{'compute_node'};
    my @splits = @{$tid_2_splits->{$task_id}};
    my $data_local = 'N';
    if (grep($_ eq $compute_node, @splits))
    {
      $data_local = 'Y';
    }
    print CSVOUT $task_number . "," . $attempt_number . "," . $data_local . "," . $compute_node . ",\"" . join(',', natsort(@splits)) . "\"\n";
  }
  close(CSVOUT);
}
else
{
  die("Error! Failed to open file for writing: " . $data_locality_report_path);
}
print "Done\n";

# 4. Done
print "===== Complete! =====\n\n";
exit;

# Subs

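# Print a message only when the $debug flag at the top of the script is set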
sub debugPrint
{
  my ($msg) = @_;
  if ($debug)
  {
    print '[debug] ' . $msg . "\n";
  }
}

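# Join the given path components with '/' and collapse any repeated separators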
sub fileCat
{
  my $path = join('/', @_);
  $path =~ s/\/\/+/\//g;
  return $path;
}