source: gs2-extensions/parallel-building/trunk/src/bin/script/parse_task_info_from_hadoop_log.pl

Last change on this file was 28014, checked in by jmt12, 11 years ago

Remove tasks that have had data locality established from the array of pending tasks

  • Property svn:executable set to *
File size: 9.3 KB
#!/usr/bin/perl

# Pragma
use strict;
use warnings;

# Configuration
my $debug = 0;

# Requires setup.bash to have been sourced
BEGIN
{
  die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
  die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
  die "HADOOP_PREFIX not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HADOOP_PREFIX'};
  die "HDFS HOST not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSHOST'};
  die "HDFS PORT not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSPORT'};
  # Ensure Greenstone Perl locations are in INC
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib');
  unshift (@INC, $ENV{'GSDLHOME'} . '/perllib/cpan');
  # we'll need the perl version number
  my ($version_number) = `perl-version.pl`;
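  # (Assumption: perl-version.pl is a Greenstone helper script expected to be
  #  on the PATH once setup.bash has been sourced; its output - a Perl
  #  version string used below to name per-version extension library
  #  directories - is taken on trust here.)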
  if (defined $ENV{'GSDLEXTS'})
  {
    my @extensions = split(/:/, $ENV{'GSDLEXTS'});
    foreach my $e (@extensions)
    {
      my $ext_prefix = $ENV{'GSDLHOME'} . '/ext/' . $e;
      unshift (@INC, $ext_prefix . '/perllib');
      unshift (@INC, $ext_prefix . '/perllib/cpan');
      unshift (@INC, $ext_prefix . '/' . $ENV{'GSDLOS'} . '/lib/perl/' . $version_number);
    }
  }
}
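# (The @INC manipulation has to happen inside BEGIN because the 'use
#  Sort::Key::Natural' below is resolved at compile time, before any
#  run-time statement could have amended the module search path.)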

# Libraries (depend on the @INC additions made in the BEGIN block above)
use Sort::Key::Natural qw(natsort);

# Begin
print "===== Parse Hadoop Log =====\n";

# 0. Init
if (!defined $ARGV[0])
{
  die("usage: parse_task_info_from_hadoop_log.pl <results dir> [-debug]\n");
}
my $results_dir = $ARGV[0];
if (!-d $results_dir)
{
  die("Error! Can't find results directory: " . $results_dir . "\n");
}
print " Results directory: " . $results_dir . "\n";

if (defined $ARGV[1] && $ARGV[1] eq '-debug')
{
  $debug = 1;
}

# 1. Determine job ID
my $hadoop_log_path = &fileCat($results_dir, 'hadoop.log');
if (!-e $hadoop_log_path)
{
  die("Error! Hadoop log file cannot be found: " . $hadoop_log_path . "\n");
}
print " Hadoop log path: " . $hadoop_log_path . "\n";
print " * Determine JobID: ";
my $job_id;
my $result = `grep "Running job:" "$hadoop_log_path"`;
if ($result =~ /Running job: job_(\d+_\d+)/)
{
  $job_id = $1;
}
else
{
  die("Error! Failed to locate JobID\n");
}
print $job_id . "\n";
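# (For reference, the matched client log line in Hadoop 1.x looks roughly
#  like:
#    INFO mapred.JobClient: Running job: job_201301141611_0001
#  The twelve digits are the JobTracker start time in yyyyMMddHHmm form,
#  which is why the first eight digits can be read as a yyyyMMdd date below.
#  The example ID is illustrative only.)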
# - we'll need the date to locate the appropriate log file
my $log_date_year = 0;
my $log_date_month = 0;
my $log_date_day = 0;
if ($job_id =~ /^(\d\d\d\d)(\d\d)(\d\d)/)
{
  $log_date_year = $1;
  $log_date_month = $2;
  $log_date_day = $3;
}
else
{
  die('Error! Failed to parse date from Job ID: ' . $job_id . "\n");
}

# 2. Determine user and system details
my $username = `whoami`;
chomp($username);
print " Username: " . $username . "\n";
my $hostname = `hostname`;
chomp($hostname);
print " Hostname: " . $hostname . "\n";
my $data_locality_report_path = &fileCat($results_dir, 'data_locality.csv');
print " Report path: " . $data_locality_report_path . "\n";

# 3. Parse log
print " * Parse JobTracker Log(s)...\n";
my $tid_2_splits = {};
my $tid_2_node = {};
my $aid_2_node = {};
my $job_complete = 0;
my $parsed_latest_log = 0;
while (!$job_complete && !$parsed_latest_log)
{
  # - determine appropriate job tracker log
  my $jobtracker_log_path_prefix = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log');
  my $jobtracker_log_path = sprintf('%s.%04d-%02d-%02d', $jobtracker_log_path_prefix, $log_date_year, $log_date_month, $log_date_day);
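  # (With the stock Hadoop 1.x log4j DailyRollingFileAppender settings, a
  #  rolled log is named e.g. hadoop-jmt12-jobtracker-myhost.log.2013-01-14,
  #  while the current day's log keeps the bare .log name - hence the
  #  fallback below. The example name is illustrative.)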
  # - maybe the log hasn't been rolled yet
  if (!-e $jobtracker_log_path)
  {
    $jobtracker_log_path = $jobtracker_log_path_prefix;
    # - nope, no applicable log found
    if (!-e $jobtracker_log_path)
    {
      die('Error! Hadoop JobTracker log file cannot be found: ' . $jobtracker_log_path . "\n");
    }
    else
    {
      $parsed_latest_log = 1;
    }
  }
  print " - parsing JobTracker log: " . $jobtracker_log_path . "\n";

  if (open(JTLIN, '<', $jobtracker_log_path))
  {
    my $line = '';
    while ($line = <JTLIN>)
    {
      # Tips provide a match between task and file splits
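      # (An example of the kind of line this matches - illustrative only,
      #  and assuming the default single-rack topology where every node
      #  sits under /default-rack:
      #    ... tip:task_201301141611_0001_m_000000 has split on
      #        node:/default-rack/compute-node-01)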
      if ($line =~ /tip:task_${job_id}(_m_\d+) has split on node:\/default-rack\/([^\.\r\n]+)/)
      {
        my $task_id = $job_id . $1;
        my $compute_node = $2;
        &debugPrint('found tip: ' . $task_id . ' => ' . $compute_node);
        if (!defined $tid_2_splits->{$task_id})
        {
          $tid_2_splits->{$task_id} = [$compute_node];
        }
        else
        {
          push(@{$tid_2_splits->{$task_id}}, $compute_node);
        }
      }
      # JobTracker (MAP) entries give us a mapping between task, attempt, and
      # compute node
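      # (Illustrative example of a matching line:
      #    ... Adding task (MAP) 'attempt_201301141611_0001_m_000000_0' ...
      #        tracker_compute-node-01.local ...
      #  Note the regex assumes tracker hostnames end in '.local', which is
      #  specific to the cluster this script was written against.)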
      elsif ($line =~ /Adding task \(MAP\) 'attempt_${job_id}(_m_\d+)(_\d+)'.*tracker_([^\.]+)\.local/)
      {
        my $task_id = $job_id . $1;
        my $attempt_id = $job_id . $1 . $2;
        my $compute_node = $3;
        &debugPrint('found MAP: ' . $attempt_id . ' => ' . $compute_node);
        $aid_2_node->{$attempt_id} = {'compute_node' => $compute_node,
                                      'succeeded' => 0
                                     };
      }
      # Watch for attempt successes (so we can weed out failures)
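      # (Illustrative example of a matching line:
      #    ... Task 'attempt_201301141611_0001_m_000000_0' has completed
      #        task_201301141611_0001_m_000000 successfully.)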
      elsif ($line =~ /Task 'attempt_${job_id}(_m_\d+_\d+)' has completed .* successfully/)
      {
        my $attempt_id = $job_id . $1;
        &debugPrint('successful attempt: ' . $attempt_id);
        if (defined $aid_2_node->{$attempt_id})
        {
          $aid_2_node->{$attempt_id}->{'succeeded'} = 1;
        }
      }
      # And job completion... so we can keep parsing other log files as
      # necessary
      elsif ($line =~ /Job job_${job_id} has completed successfully\./)
      {
        $job_complete = 1;
      }
    }
    close(JTLIN);
  }
  else
  {
    die('Error! Failed to open JobTracker log for reading: ' . $jobtracker_log_path . "\n");
  }
  # Increment the day by one - unfortunately this leads to unpleasant date
  # maths to ensure month and year roll over appropriately too
  $log_date_day++;
  # On to a new month?
  # Leap Year Feb > 29 otherwise Feb > 28
  # Apr, Jun, Sep, Nov > 30
  # Rest > 31
  if ($log_date_month == 2 && $log_date_day > (&isLeapYear($log_date_year) ? 29 : 28))
  {
    $log_date_day = 1;
    $log_date_month++;
  }
  elsif (($log_date_month == 4 || $log_date_month == 6 || $log_date_month == 9 || $log_date_month == 11) && $log_date_day > 30)
  {
    $log_date_day = 1;
    $log_date_month++;
  }
  elsif ($log_date_day > 31)
  {
    $log_date_day = 1;
    $log_date_month++;
  }
  # On to a new year?
  if ($log_date_month > 12)
  {
    $log_date_month = 1;
    $log_date_year++;
  }
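  # (A simpler equivalent for the whole day-increment above, assuming the
  #  core Time::Local module is acceptable, would be to take noon on the
  #  current date, add 24 hours, and read the date back:
  #    use Time::Local;
  #    my @next = gmtime(timegm(0, 0, 12, $log_date_day,
  #                             $log_date_month - 1, $log_date_year)
  #                      + 24 * 60 * 60);
  #    ($log_date_day, $log_date_month, $log_date_year)
  #        = ($next[3], $next[4] + 1, $next[5] + 1900);)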
}
print " Done\n";

if (!$job_complete)
{
  print "Warning! No successful job-completion entry was found in the parsed logs.\n";
}

# 4. Write CSV of information
print " * Writing Job Information... ";
&debugPrint("AttemptID \tComputeNode\tSucceeded", 1);
foreach my $attempt_id (keys %{$aid_2_node})
{
  &debugPrint($attempt_id . "\t" . $aid_2_node->{$attempt_id}->{'compute_node'} . "\t" . $aid_2_node->{$attempt_id}->{'succeeded'});
}
&debugPrint("TaskID \tComputeNodeSplits");
my $split_counter = 0;
foreach my $task_id (keys %{$tid_2_splits})
{
  &debugPrint($task_id . "\t" . join(',', natsort(@{$tid_2_splits->{$task_id}})));
  $split_counter++;
}
&debugPrint(' * Number of split records: ' . $split_counter);
&debugPrint('');

# - open the CSV file and write out the combined information from above
if (open(CSVOUT, '>:utf8', $data_locality_report_path))
{
  print CSVOUT "TaskNo,AttemptNo,Data Local,Compute Node,Splits\n";
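  # (A typical data row - values illustrative - looks like:
  #    000000,0,1,"compute-node-01","compute-node-01,compute-node-02"
  #  where 'Data Local' is 1 when the attempt ran on one of the nodes
  #  holding a split of its input; the fallback rows written further below
  #  use -1 to flag splits with no matching successful attempt.)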
  foreach my $attempt_id (natsort(keys %{$aid_2_node}))
  {
    my ($job_id, $task_number, $attempt_number) = $attempt_id =~ /^(\d+_\d+)_m_(\d+)_(\d+)/;
    my $task_id = $job_id . '_m_' . $task_number;
    my $compute_node = $aid_2_node->{$attempt_id}->{'compute_node'};
    if (defined $tid_2_splits->{$task_id})
    {
      my @splits = @{$tid_2_splits->{$task_id}};
      my $data_local = 0;
      if (grep($_ eq $compute_node, @splits))
      {
        $data_local = 1;
      }
      print CSVOUT $task_number . "," . $attempt_number . "," . $data_local . ",\"" . $compute_node . "\",\"" . join(',', natsort(@splits)) . "\"\n";
      delete($tid_2_splits->{$task_id});
    }
    else
    {
      print "Warning! Missing data location information for task: " . $task_id . "\n";
    }
  }

  # Report on any other splits that were recorded in the log, but for unknown
  # reasons aren't matched with a 'successful' task
  foreach my $task_id (keys %{$tid_2_splits})
  {
    if (defined $tid_2_splits->{$task_id})
    {
      my ($job_id, $task_number) = $task_id =~ /^(\d+_\d+)_m_(\d+)/;
      my @splits = @{$tid_2_splits->{$task_id}};
      print CSVOUT $task_number . ",-1,-1,\"\",\"" . join(',', natsort(@splits)) . "\"\n";
    }
  }

  close(CSVOUT);
}
else
{
  die("Error! Failed to open file for writing: " . $data_locality_report_path . "\n");
}
print "Done\n";

# 5. Done
print "===== Complete! =====\n\n";
exit;

# Subs

sub debugPrint
{
  my ($msg, $force_newline) = @_;
  if ($debug)
  {
    if (defined $force_newline && $force_newline)
    {
      print "\n[debug] " . $msg . "\n";
    }
    else
    {
      print '[debug] ' . $msg . "\n";
    }
  }
}

sub fileCat
{
  my $path = join('/', @_);
  $path =~ s/\/\/+/\//g;
  return $path;
}
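# (Usage example: &fileCat('/tmp/results/', '/hadoop.log') returns
#  '/tmp/results/hadoop.log' - the join may introduce runs of slashes,
#  which the substitution collapses to a single '/'.)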

## @function isLeapYear()
#
sub isLeapYear
{
  my ($year) = @_;
  if ($year % 400 == 0)
  {
    return 1;
  }
  elsif ($year % 100 == 0)
  {
    return 0;
  }
  elsif ($year % 4 == 0)
  {
    return 1;
  }
  return 0;
}
## isLeapYear() ##
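# (Standard Gregorian rule, e.g. 2000 and 2012 are leap years; 1900 is not.)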

1;