source: gs2-extensions/parallel-building/trunk/src/bin/script/parse_task_info_from_hadoop_log.pl@ 27124

Last change on this file since 27124 was 27124, checked in by jmt12, 11 years ago

Use the new perl version script to extract the version number - so as to allow us to load extension specific perl modules

  • Property svn:executable set to *
File size: 6.0 KB
Line 
#!/usr/bin/perl

# Pragma
use strict;
use warnings;

# Configuration
# - set to 1 to enable the &debugPrint trace output used throughout the script
my $debug = 0;

# Requires setup.bash to have been sourced
BEGIN
{
  die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
  die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
  die "HADOOP_PREFIX not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HADOOP_PREFIX'};
  die "HDFS HOST not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSHOST'};
  die "HDFS PORT not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSPORT'};
  # Ensure Greenstone Perl locations are in INC
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib');
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib/cpan');
  # we'll need the perl version number (expected on PATH via setup.bash)
  my ($version_number) = `perl-version.pl`;
  # Fix: backtick output keeps its trailing newline - without this chomp the
  # newline ends up embedded in the @INC paths built below, so extension
  # perl modules would never be found
  chomp($version_number) if defined $version_number;
  if (defined $ENV{'GSDLEXTS'})
  {
    my @extensions = split(/:/,$ENV{'GSDLEXTS'});
    foreach my $e (@extensions)
    {
      my $ext_prefix = $ENV{'GSDLHOME'} . '/ext/' . $e;
      unshift (@INC, $ext_prefix . '/perllib');
      unshift (@INC, $ext_prefix . '/perllib/cpan');
      unshift (@INC, $ext_prefix . '/' . $ENV{'GSDLOS'} . '/lib/perl/' . $version_number);
    }
  }
}
35
# Libraries (depend on the @INC unshift performed in the BEGIN block above)
37use Sort::Key::Natural qw(natsort);
38
# Begin
print "===== Parse Hadoop Log =====\n";

# 0. Init
# - one required argument: the results directory containing hadoop.log
die("usage: parse_task_info_from_hadoop_log.pl <results dir>\n") unless defined $ARGV[0];
my $results_dir = $ARGV[0];
die("Error! Can't find results directory: $results_dir\n") unless -d $results_dir;
print " Results directory: $results_dir\n";

my $hadoop_log_path = &fileCat($results_dir, 'hadoop.log');
die("Error! Hadoop log file cannot be found: $hadoop_log_path\n") unless -e $hadoop_log_path;
print " Hadoop log path: $hadoop_log_path\n";

# The JobTracker log lives under the Hadoop install, named after the current
# user and host
my $username = `whoami`;
chomp($username);
print " Username: $username\n";
my $hostname = `hostname`;
chomp($hostname);
print " Hostname: $hostname\n";
my $jobtracker_log_path = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log');
die("Error! Hadoop JobTracker log file cannot be found: $jobtracker_log_path\n") unless -e $jobtracker_log_path;
print " Jobtracker log path: $jobtracker_log_path\n";
my $data_locality_report_path = &fileCat($results_dir, 'data_locality.csv');
print " Report path: $data_locality_report_path\n";

# 1. Determine job ID
# - grep the client-side hadoop.log for the "Running job" line and pull the
#   numeric portion of the job identifier from it
print " * Determine JobID: ";
my $job_id;
my $grep_output = `grep "Running job:" "$hadoop_log_path"`;
if ($grep_output =~ /Running job: job_(\d+_\d+)/)
{
  $job_id = $1;
}
else
{
  die("Error! Failed to locate JobID\n");
}
print $job_id . "\n";
89
# 2. Parse log
# - build two mappings from the JobTracker log:
#     task id    => list of compute nodes holding that task's file splits
#     attempt id => { compute_node, succeeded } for each MAP attempt
print " * Parse JobTracker Log... ";
my $tid_2_splits = {};
my $aid_2_node = {};
# Use a lexical filehandle rather than the bareword global JTLIN
if (open(my $jtlin, '<', $jobtracker_log_path))
{
  while (my $line = <$jtlin>)
  {
    # Tips provide a match between task and file splits
    # (the dot before "local" is now escaped so it only matches a literal '.')
    if ($line =~ /tip:task_${job_id}(_m_\d+) has split on node:\/default-rack\/([^\.]+)\.local/)
    {
      my $task_id = $job_id . $1;
      my $compute_node = $2;
      &debugPrint('found tip: ' . $task_id . ' => ' . $compute_node);
      if (!defined $tid_2_splits->{$task_id})
      {
        $tid_2_splits->{$task_id} = [$compute_node];
      }
      else
      {
        push(@{$tid_2_splits->{$task_id}}, $compute_node);
      }
    }
    # JobTracker (MAP) entries give us a mapping between task, attempt, and
    # compute node
    if ($line =~ /Adding task \(MAP\) 'attempt_${job_id}(_m_\d+)(_\d+)'.*tracker_([^\.]+)\.local/)
    {
      my $attempt_id = $job_id . $1 . $2;
      my $compute_node = $3;
      &debugPrint('found MAP: ' . $attempt_id . ' => ' . $compute_node);
      # Attempts start marked as failed; flipped to 1 on a success line below
      $aid_2_node->{$attempt_id} = {'compute_node' => $compute_node,
                                    'succeeded' => 0
                                   };
    }
    # Watch for attempt successes (so we can weed out failures)
    if ($line =~ /Task 'attempt_${job_id}(_m_\d+_\d+)' has completed .* successfully/)
    {
      my $attempt_id = $job_id . $1;
      &debugPrint('successful attempt: ' . $attempt_id);
      if (defined $aid_2_node->{$attempt_id})
      {
        $aid_2_node->{$attempt_id}->{'succeeded'} = 1;
      }
    }
  }
  close($jtlin);
}
else
{
  die("Error! Failed to open JobTracker log for reading: " . $jobtracker_log_path . "\n");
}
print "Done\n";
145
146
# 3. Write CSV of information
# - dump debug tables, then emit one CSV row per MAP attempt recording
#   whether the attempt ran on a node that held one of its file splits
print " * Writing Job Information... ";
&debugPrint("AttemptID\tComputeNode\tSucceeded");
foreach my $attempt_id (keys %{$aid_2_node})
{
  &debugPrint($attempt_id . "\t" . $aid_2_node->{$attempt_id}->{'compute_node'} . "\t" . $aid_2_node->{$attempt_id}->{'succeeded'});
}
&debugPrint("TaskID\tComputeNodeSplits");
foreach my $task_id (keys %{$tid_2_splits})
{
  &debugPrint($task_id . "\t" . join(',', natsort(@{$tid_2_splits->{$task_id}})));
}
# - open the CSV file and write out the combined information from above
#   (lexical filehandle instead of the bareword global CSVOUT)
if (open(my $csv_out, '>:utf8', $data_locality_report_path))
{
  print {$csv_out} "TaskNo,AttemptNo,Data Local,Compute Node,Splits\n";
  foreach my $attempt_id (natsort(keys %{$aid_2_node}))
  {
    # Renamed first capture to avoid shadowing the script-level $job_id
    my ($attempt_job_id, $task_number, $attempt_number) = $attempt_id =~ /^(\d+_\d+)_m_(\d+)_(\d+)/;
    my $task_id = $attempt_job_id . '_m_' . $task_number;
    my $compute_node = $aid_2_node->{$attempt_id}->{'compute_node'};
    # Fix: a task may have no recorded splits - dereferencing undef here used
    # to die under strict refs, now it degrades to an empty list ('N' row)
    my @splits = defined $tid_2_splits->{$task_id} ? @{$tid_2_splits->{$task_id}} : ();
    my $data_local = (grep {$_ eq $compute_node} @splits) ? 'Y' : 'N';
    print {$csv_out} $task_number . "," . $attempt_number . "," . $data_local . "," . $compute_node . ",\"" . join(',', natsort(@splits)) . "\"\n";
  }
  # Check close on a write handle - buffered write errors surface here
  close($csv_out) or die("Error! Failed to write file: " . $data_locality_report_path);
}
else
{
  die("Error! Failed to open file for writing: " . $data_locality_report_path);
}
print "Done\n";

# 4. Done
print "===== Complete! =====\n\n";
exit;
187
188# Subs
189
# Print a trace message to STDOUT, but only when the script-level $debug
# flag is enabled; silent no-op otherwise.
sub debugPrint
{
  my ($msg) = @_;
  print "[debug] $msg\n" if $debug;
}
198
# Join all arguments into a single path with '/' separators, squeezing any
# accidental runs of repeated slashes down to a single slash.
sub fileCat
{
  my $joined = join('/', @_);
  $joined =~ tr{/}{/}s;
  return $joined;
}
Note: See TracBrowser for help on using the repository browser.