Changeset 27588

Show
Ignore:
Timestamp:
10.06.2013 14:12:28 (6 years ago)
Author:
jmt12
Message:

Extend parser to support jobs that are split over several logs. Also ensure that split information is printed to CSV even for those tasks that appear to have failed (or completely lack attempt information).

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/bin/script/parse_task_info_from_hadoop_log.pl

    r27587 r27588  
    7777print $job_id . "\n"; 
    7878# - we'll need the date to locate the appopriate log file 
    79 my $log_date_suffix = ''; 
     79my $log_date_year = 0; 
     80my $log_date_month = 0; 
     81my $log_date_day = 0; 
    8082if ($job_id =~ /^(\d\d\d\d)(\d\d)(\d\d)/) 
    8183{ 
    82   $log_date_suffix = '.' . $1 . '-' . $2 . '-' . $3; 
    83 } 
    84  
    85 # 2. Determine appropriate job tracker log 
     84  $log_date_year = $1; 
     85  $log_date_month = $2; 
     86  $log_date_day = $3; 
     87} 
     88else 
     89{ 
     90  die('Error! Failed to parse date from Job ID: ' . $job_id . "\n"); 
     91} 
     92 
     93# 2. Determine user and system details 
    8694my $username = `whoami`; 
    8795chomp($username); 
     
    9098chomp($hostname); 
    9199print " Hostname: " . $hostname . "\n"; 
    92 my $jobtracker_log_path = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log' . $log_date_suffix); 
    93 if (!-e $jobtracker_log_path) 
    94 { 
    95   $jobtracker_log_path = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log'); 
    96   if (!-e $jobtracker_log_path) 
    97   { 
    98     die("Error! Hadoop JobTracker log file cannot be found: " . $jobtracker_log_path . "\n"); 
    99   } 
    100 } 
    101 print " Jobtracker log path: " . $jobtracker_log_path . "\n"; 
    102100my $data_locality_report_path = &fileCat($results_dir, 'data_locality.csv'); 
    103101print " Report path: " . $data_locality_report_path . "\n"; 
    104102 
    105103# 3. Parse log 
    106 print " * Parse JobTracker Log... "; 
     104print " * Parse JobTracker Log(s)...\n"; 
    107105my $tid_2_splits = {}; 
    108106my $tid_2_node = {}; 
    109107my $aid_2_node = {}; 
    110 if (open(JTLIN, '<', $jobtracker_log_path)) 
    111 { 
    112   my $line = ''; 
    113   while ($line = <JTLIN>) 
    114   { 
    115     # Tips provide a match between task and file splits 
    116     if ($line =~ /tip:task_${job_id}(_m_\d+) has split on node:\/default-rack\/([^\.\r\n]+)/) 
    117     { 
    118       my $task_id = $job_id . $1; 
    119       my $compute_node = $2; 
    120       &debugPrint('found tip: ' . $task_id . ' => ' . $compute_node); 
    121       if (!defined $tid_2_splits->{$task_id}) 
     108my $job_complete = 0; 
     109my $parsed_latest_log = 0; 
     110while (!$job_complete && !$parsed_latest_log) 
     111{ 
     112  # - determine appropriate job tracker log 
     113  my $jobtracker_log_path_prefix = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log'); 
     114  my $jobtracker_log_path = sprintf('%s.%04d-%02d-%02d', $jobtracker_log_path_prefix, $log_date_year, $log_date_month, $log_date_day); 
     115  # - maybe the log hasn't been rolled yet 
     116  if (!-e $jobtracker_log_path) 
     117  { 
     118    $jobtracker_log_path = $jobtracker_log_path_prefix; 
     119    # - nope, no applicable log found 
     120    if (!-e $jobtracker_log_path) 
     121    { 
     122      die('Error! Hadoop JobTracker log file cannot be found: ' . $jobtracker_log_path . "\n"); 
     123    } 
     124    else 
     125    { 
     126      $parsed_latest_log = 1; 
     127    } 
     128  } 
     129  print " - parsing JobTracker log: " . $jobtracker_log_path . "\n"; 
     130 
     131  if (open(JTLIN, '<', $jobtracker_log_path)) 
     132  { 
     133    my $line = ''; 
     134    while ($line = <JTLIN>) 
     135    { 
     136      # Tips provide a match between task and file splits 
     137      if ($line =~ /tip:task_${job_id}(_m_\d+) has split on node:\/default-rack\/([^\.\r\n]+)/) 
    122138      { 
    123         $tid_2_splits->{$task_id} = [$compute_node]; 
     139        my $task_id = $job_id . $1; 
     140        my $compute_node = $2; 
     141        &debugPrint('found tip: ' . $task_id . ' => ' . $compute_node); 
     142        if (!defined $tid_2_splits->{$task_id}) 
     143        { 
     144          $tid_2_splits->{$task_id} = [$compute_node]; 
     145        } 
     146        else 
     147        { 
     148          push(@{$tid_2_splits->{$task_id}}, $compute_node); 
     149        } 
    124150      } 
    125       else 
     151      # JobTracker (MAP) entries give us a mapping between task, attempt, and 
     152      # compute node 
     153      elsif ($line =~ /Adding task \(MAP\) 'attempt_${job_id}(_m_\d+)(_\d+)'.*tracker_([^\.]+).local/) 
    126154      { 
    127         push(@{$tid_2_splits->{$task_id}}, $compute_node); 
     155        my $task_id = $job_id . $1; 
     156        my $attempt_id = $job_id . $1 . $2; 
     157        my $compute_node = $3; 
     158        &debugPrint('found MAP: ' . $attempt_id . ' => ' . $compute_node); 
     159        $aid_2_node->{$attempt_id} = {'compute_node' => $compute_node, 
     160                                      'succeeded' => 0 
     161                                     }; 
    128162      } 
    129     } 
    130     # JobTracker (MAP) entries give us a mapping between task, attempt, and 
    131     # compute node 
    132     if ($line =~ /Adding task \(MAP\) 'attempt_${job_id}(_m_\d+)(_\d+)'.*tracker_([^\.]+).local/) 
    133     { 
    134       my $task_id = $job_id . $1; 
    135       my $attempt_id = $job_id . $1 . $2; 
    136       my $compute_node = $3; 
    137       &debugPrint('found MAP: ' . $attempt_id . ' => ' . $compute_node); 
    138       $aid_2_node->{$attempt_id} = {'compute_node' => $compute_node, 
    139                                     'succeeded' => 0 
    140                                    }; 
    141     } 
    142     # Watch for attempt successes (so we can weed out failures) 
    143     if ($line =~ /Task 'attempt_${job_id}(_m_\d+_\d+)' has completed .* successfully/) 
    144     { 
    145       my $attempt_id = $job_id . $1; 
    146       &debugPrint('successful attempt: ' . $attempt_id); 
    147       if (defined $aid_2_node->{$attempt_id}) 
     163      # Watch for attempt successes (so we can weed out failures) 
     164      elsif ($line =~ /Task 'attempt_${job_id}(_m_\d+_\d+)' has completed .* successfully/) 
    148165      { 
    149         $aid_2_node->{$attempt_id}->{'succeeded'} = 1; 
     166        my $attempt_id = $job_id . $1; 
     167        &debugPrint('successful attempt: ' . $attempt_id); 
     168        if (defined $aid_2_node->{$attempt_id}) 
     169        { 
     170          $aid_2_node->{$attempt_id}->{'succeeded'} = 1; 
     171        } 
    150172      } 
    151     } 
    152   } 
    153   close(JTLIN); 
    154 } 
    155 else 
    156 { 
    157   die("Error! Failed to open JobTracker log for reading: " . $jobtracker_log_path . "\n"); 
     173      # And job completion... so we can keep parsing other log files as 
     174      # necessary 
     175      elsif ($line =~ /Job 'job_${job_id} has completed successfully\./) 
     176      { 
     177        $job_complete = 1; 
     178      } 
     179    } 
     180    close(JTLIN); 
     181  } 
     182  else 
     183  { 
     184    die('Error! Failed to open JobTracker log for reading: ' . $jobtracker_log_path . "\n"); 
     185  } 
     186  # Increment the day by one - unfortunately this leads to unpleasant date 
     187  # maths to ensure month and year roll over appropriately too 
     188  $log_date_day++; 
     189  # On to a new month? 
     190  # Leap Year Feb > 29 otherwise Feb > 28 
     191  # Apr, Jun, Sep, Nov > 30 
     192  # Rest > 31 
     193  if ($log_date_month == 2 && ((&isLeapYear($log_date_year) && $log_date_day > 29) || $log_date_day > 28)) 
     194  { 
     195    $log_date_day = 1; 
     196    $log_date_month++; 
     197  } 
     198  elsif (($log_date_month == 4 || $log_date_month == 6 || $log_date_month == 9 || $log_date_month == 11) && $log_date_day > 30) 
     199  { 
     200    $log_date_day = 1; 
     201    $log_date_month++; 
     202  } 
     203  elsif ($log_date_day > 31) 
     204  { 
     205    $log_date_day = 1; 
     206    $log_date_month++; 
     207  } 
     208  # On to a new year? 
     209  if ($log_date_month > 12) 
     210  { 
     211    $log_date_month = 1; 
     212    $log_date_year++; 
     213  } 
    158214} 
    159215print "Done\n"; 
    160216 
     217if (!$job_complete) 
     218{ 
     219  print "Warning! Failed to parse in information for a complete job.\n"; 
     220} 
    161221 
    162222# 4. Write CSV of information 
    163223print " * Writing Job Information... "; 
    164 &debugPrint("\nAttemptID\tComputeNode\tSucceeded"); 
     224&debugPrint("AttemptID                   \tComputeNode\tSucceeded", 1); 
    165225foreach my $attempt_id (keys %{$aid_2_node}) 
    166226{ 
    167227  &debugPrint($attempt_id . "\t" . $aid_2_node->{$attempt_id}->{'compute_node'} . "\t" . $aid_2_node->{$attempt_id}->{'succeeded'}); 
    168228} 
    169 &debugPrint("TaskID\tComputeNodeSplits"); 
     229&debugPrint("TaskID                    \tComputeNodeSplits"); 
     230my $split_counter = 0; 
    170231foreach my $task_id (keys %{$tid_2_splits}) 
    171232{ 
    172233  &debugPrint($task_id . "\t" . join(',', natsort(@{$tid_2_splits->{$task_id}}))); 
    173 } 
     234  $split_counter++; 
     235} 
     236&debugPrint(' * Number of split records: ' . $split_counter); 
     237&debugPrint(''); 
    174238 
    175239# - open the CSV file and write out the combined information from above 
     
    189253    } 
    190254    print CSVOUT $task_number . "," . $attempt_number . "," . $data_local . ",\"" . $compute_node . "\",\"" . join(',', natsort(@splits)) . "\"\n"; 
    191   } 
     255    undef($tid_2_splits->{$task_id}); 
     256  } 
     257 
     258  # Report on any other splits that were recorded in the log, but for unknown 
     259  # reasons aren't matched with a 'successful' task 
     260  foreach my $task_id (keys %{$tid_2_splits}) 
     261  { 
     262    my ($job_id, $task_number) = $task_id =~ /^(\d+_\d+)_m_(\d+)/; 
     263    my @splits = @{$tid_2_splits->{$task_id}}; 
     264    print CSVOUT $task_number . ",-1,-1,\"\",\"" . join(',', natsort(@splits)) . "\"\n"; 
     265  } 
     266 
    192267  close(CSVOUT); 
    193268} 
     
    206281sub debugPrint 
    207282{ 
    208   my ($msg) = @_; 
     283  my ($msg, $force_newline) = @_; 
    209284  if ($debug) 
    210285  { 
    211     print '[debug] ' . $msg . "\n"; 
     286    if (defined $force_newline && $force_newline) 
     287    { 
     288      print "\n[debug] " . $msg . "\n"; 
     289    } 
     290    else 
     291    { 
     292      print '[debug] ' . $msg . "\n"; 
     293    } 
    212294  } 
    213295} 
     
    219301  return $path; 
    220302} 
     303 
     304## @function isLeapYear() 
     305# 
     306sub isLeapYear 
     307{ 
     308  my ($year) = @_; 
     309  if ($year % 400 == 0) 
     310  { 
     311    return 1; 
     312  } 
     313  elsif ($year % 100 == 0) 
     314  { 
     315    return 0; 
     316  } 
     317  elsif ($year % 4 == 0) 
     318  { 
     319    return 1; 
     320  } 
     321  return 0; 
     322} 
     323## isLeapYear() ## 
     324 
     3251;