Ignore:
Timestamp:
2013-06-10T14:12:28+12:00 (11 years ago)
Author:
jmt12
Message:

Extend parser to support jobs that are split over several logs. Also ensure that split information is printed to CSV even for those tasks that appear to have failed (or completely lack attempt information).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/bin/script/parse_task_info_from_hadoop_log.pl

    r27587 r27588  
    7777print $job_id . "\n";
    7878# - we'll need the date to locate the appopriate log file
    79 my $log_date_suffix = '';
     79my $log_date_year = 0;
     80my $log_date_month = 0;
     81my $log_date_day = 0;
    8082if ($job_id =~ /^(\d\d\d\d)(\d\d)(\d\d)/)
    8183{
    82   $log_date_suffix = '.' . $1 . '-' . $2 . '-' . $3;
    83 }
    84 
    85 # 2. Determine appropriate job tracker log
     84  $log_date_year = $1;
     85  $log_date_month = $2;
     86  $log_date_day = $3;
     87}
     88else
     89{
     90  die('Error! Failed to parse date from Job ID: ' . $job_id . "\n");
     91}
     92
     93# 2. Determine user and system details
    8694my $username = `whoami`;
    8795chomp($username);
     
    9098chomp($hostname);
    9199print " Hostname: " . $hostname . "\n";
    92 my $jobtracker_log_path = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log' . $log_date_suffix);
    93 if (!-e $jobtracker_log_path)
    94 {
    95   $jobtracker_log_path = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log');
    96   if (!-e $jobtracker_log_path)
    97   {
    98     die("Error! Hadoop JobTracker log file cannot be found: " . $jobtracker_log_path . "\n");
    99   }
    100 }
    101 print " Jobtracker log path: " . $jobtracker_log_path . "\n";
    102100my $data_locality_report_path = &fileCat($results_dir, 'data_locality.csv');
    103101print " Report path: " . $data_locality_report_path . "\n";
    104102
    105103# 3. Parse log
    106 print " * Parse JobTracker Log... ";
     104print " * Parse JobTracker Log(s)...\n";
    107105my $tid_2_splits = {};
    108106my $tid_2_node = {};
    109107my $aid_2_node = {};
    110 if (open(JTLIN, '<', $jobtracker_log_path))
    111 {
    112   my $line = '';
    113   while ($line = <JTLIN>)
    114   {
    115     # Tips provide a match between task and file splits
    116     if ($line =~ /tip:task_${job_id}(_m_\d+) has split on node:\/default-rack\/([^\.\r\n]+)/)
    117     {
    118       my $task_id = $job_id . $1;
    119       my $compute_node = $2;
    120       &debugPrint('found tip: ' . $task_id . ' => ' . $compute_node);
    121       if (!defined $tid_2_splits->{$task_id})
     108my $job_complete = 0;
     109my $parsed_latest_log = 0;
     110while (!$job_complete && !$parsed_latest_log)
     111{
     112  # - determine appropriate job tracker log
     113  my $jobtracker_log_path_prefix = &fileCat($ENV{'HADOOP_PREFIX'}, 'logs', 'hadoop-' . $username . '-jobtracker-' . $hostname . '.log');
     114  my $jobtracker_log_path = sprintf('%s.%04d-%02d-%02d', $jobtracker_log_path_prefix, $log_date_year, $log_date_month, $log_date_day);
     115  # - maybe the log hasn't been rolled yet
     116  if (!-e $jobtracker_log_path)
     117  {
     118    $jobtracker_log_path = $jobtracker_log_path_prefix;
     119    # - nope, no applicable log found
     120    if (!-e $jobtracker_log_path)
     121    {
     122      die('Error! Hadoop JobTracker log file cannot be found: ' . $jobtracker_log_path . "\n");
     123    }
     124    else
     125    {
     126      $parsed_latest_log = 1;
     127    }
     128  }
     129  print " - parsing JobTracker log: " . $jobtracker_log_path . "\n";
     130
     131  if (open(JTLIN, '<', $jobtracker_log_path))
     132  {
     133    my $line = '';
     134    while ($line = <JTLIN>)
     135    {
     136      # Tips provide a match between task and file splits
     137      if ($line =~ /tip:task_${job_id}(_m_\d+) has split on node:\/default-rack\/([^\.\r\n]+)/)
    122138      {
    123         $tid_2_splits->{$task_id} = [$compute_node];
     139        my $task_id = $job_id . $1;
     140        my $compute_node = $2;
     141        &debugPrint('found tip: ' . $task_id . ' => ' . $compute_node);
     142        if (!defined $tid_2_splits->{$task_id})
     143        {
     144          $tid_2_splits->{$task_id} = [$compute_node];
     145        }
     146        else
     147        {
     148          push(@{$tid_2_splits->{$task_id}}, $compute_node);
     149        }
    124150      }
    125       else
     151      # JobTracker (MAP) entries give us a mapping between task, attempt, and
     152      # compute node
     153      elsif ($line =~ /Adding task \(MAP\) 'attempt_${job_id}(_m_\d+)(_\d+)'.*tracker_([^\.]+).local/)
    126154      {
    127         push(@{$tid_2_splits->{$task_id}}, $compute_node);
     155        my $task_id = $job_id . $1;
     156        my $attempt_id = $job_id . $1 . $2;
     157        my $compute_node = $3;
     158        &debugPrint('found MAP: ' . $attempt_id . ' => ' . $compute_node);
     159        $aid_2_node->{$attempt_id} = {'compute_node' => $compute_node,
     160                                      'succeeded' => 0
     161                                     };
    128162      }
    129     }
    130     # JobTracker (MAP) entries give us a mapping between task, attempt, and
    131     # compute node
    132     if ($line =~ /Adding task \(MAP\) 'attempt_${job_id}(_m_\d+)(_\d+)'.*tracker_([^\.]+).local/)
    133     {
    134       my $task_id = $job_id . $1;
    135       my $attempt_id = $job_id . $1 . $2;
    136       my $compute_node = $3;
    137       &debugPrint('found MAP: ' . $attempt_id . ' => ' . $compute_node);
    138       $aid_2_node->{$attempt_id} = {'compute_node' => $compute_node,
    139                                     'succeeded' => 0
    140                                    };
    141     }
    142     # Watch for attempt successes (so we can weed out failures)
    143     if ($line =~ /Task 'attempt_${job_id}(_m_\d+_\d+)' has completed .* successfully/)
    144     {
    145       my $attempt_id = $job_id . $1;
    146       &debugPrint('successful attempt: ' . $attempt_id);
    147       if (defined $aid_2_node->{$attempt_id})
     163      # Watch for attempt successes (so we can weed out failures)
     164      elsif ($line =~ /Task 'attempt_${job_id}(_m_\d+_\d+)' has completed .* successfully/)
    148165      {
    149         $aid_2_node->{$attempt_id}->{'succeeded'} = 1;
     166        my $attempt_id = $job_id . $1;
     167        &debugPrint('successful attempt: ' . $attempt_id);
     168        if (defined $aid_2_node->{$attempt_id})
     169        {
     170          $aid_2_node->{$attempt_id}->{'succeeded'} = 1;
     171        }
    150172      }
    151     }
    152   }
    153   close(JTLIN);
    154 }
    155 else
    156 {
    157   die("Error! Failed to open JobTracker log for reading: " . $jobtracker_log_path . "\n");
     173      # And job completion... so we can keep parsing other log files as
     174      # necessary
     175      elsif ($line =~ /Job 'job_${job_id} has completed successfully\./)
     176      {
     177        $job_complete = 1;
     178      }
     179    }
     180    close(JTLIN);
     181  }
     182  else
     183  {
     184    die('Error! Failed to open JobTracker log for reading: ' . $jobtracker_log_path . "\n");
     185  }
     186  # Increment the day by one - unfortunately this leads to unpleasant date
     187  # maths to ensure month and year roll over appropriately too
     188  $log_date_day++;
     189  # On to a new month?
     190  # Leap Year Feb > 29 otherwise Feb > 28
     191  # Apr, Jun, Sep, Nov > 30
     192  # Rest > 31
     193  if ($log_date_month == 2 && ((&isLeapYear($log_date_year) && $log_date_day > 29) || $log_date_day > 28))
     194  {
     195    $log_date_day = 1;
     196    $log_date_month++;
     197  }
     198  elsif (($log_date_month == 4 || $log_date_month == 6 || $log_date_month == 9 || $log_date_month == 11) && $log_date_day > 30)
     199  {
     200    $log_date_day = 1;
     201    $log_date_month++;
     202  }
     203  elsif ($log_date_day > 31)
     204  {
     205    $log_date_day = 1;
     206    $log_date_month++;
     207  }
     208  # On to a new year?
     209  if ($log_date_month > 12)
     210  {
     211    $log_date_month = 1;
     212    $log_date_year++;
     213  }
    158214}
    159215print "Done\n";
    160216
     217if (!$job_complete)
     218{
     219  print "Warning! Failed to parse in information for a complete job.\n";
     220}
    161221
    162222# 4. Write CSV of information
    163223print " * Writing Job Information... ";
    164 &debugPrint("\nAttemptID\tComputeNode\tSucceeded");
     224&debugPrint("AttemptID                   \tComputeNode\tSucceeded", 1);
    165225foreach my $attempt_id (keys %{$aid_2_node})
    166226{
    167227  &debugPrint($attempt_id . "\t" . $aid_2_node->{$attempt_id}->{'compute_node'} . "\t" . $aid_2_node->{$attempt_id}->{'succeeded'});
    168228}
    169 &debugPrint("TaskID\tComputeNodeSplits");
     229&debugPrint("TaskID                    \tComputeNodeSplits");
     230my $split_counter = 0;
    170231foreach my $task_id (keys %{$tid_2_splits})
    171232{
    172233  &debugPrint($task_id . "\t" . join(',', natsort(@{$tid_2_splits->{$task_id}})));
    173 }
     234  $split_counter++;
     235}
     236&debugPrint(' * Number of split records: ' . $split_counter);
     237&debugPrint('');
    174238
    175239# - open the CSV file and write out the combined information from above
     
    189253    }
    190254    print CSVOUT $task_number . "," . $attempt_number . "," . $data_local . ",\"" . $compute_node . "\",\"" . join(',', natsort(@splits)) . "\"\n";
    191   }
     255    undef($tid_2_splits->{$task_id});
     256  }
     257
     258  # Report on any other splits that were recorded in the log, but for unknown
     259  # reasons aren't matched with a 'successful' task
     260  foreach my $task_id (keys %{$tid_2_splits})
     261  {
     262    my ($job_id, $task_number) = $task_id =~ /^(\d+_\d+)_m_(\d+)/;
     263    my @splits = @{$tid_2_splits->{$task_id}};
     264    print CSVOUT $task_number . ",-1,-1,\"\",\"" . join(',', natsort(@splits)) . "\"\n";
     265  }
     266
    192267  close(CSVOUT);
    193268}
     
    206281sub debugPrint
    207282{
    208   my ($msg) = @_;
     283  my ($msg, $force_newline) = @_;
    209284  if ($debug)
    210285  {
    211     print '[debug] ' . $msg . "\n";
     286    if (defined $force_newline && $force_newline)
     287    {
     288      print "\n[debug] " . $msg . "\n";
     289    }
     290    else
     291    {
     292      print '[debug] ' . $msg . "\n";
     293    }
    212294  }
    213295}
     
    219301  return $path;
    220302}
     303
     304## @function isLeapYear()
     305#
     306sub isLeapYear
     307{
     308  my ($year) = @_;
     309  if ($year % 400 == 0)
     310  {
     311    return 1;
     312  }
     313  elsif ($year % 100 == 0)
     314  {
     315    return 0;
     316  }
     317  elsif ($year % 4 == 0)
     318  {
     319    return 1;
     320  }
     321  return 0;
     322}
     323## isLeapYear() ##
     324
     3251;
Note: See TracChangeset for help on using the changeset viewer.