- Timestamp:
- 2013-11-20T12:57:27+13:00 (10 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_report.pl
###############################################################################
# hadoop_report.pl — post-change (r28652) body reconstructed from a Trac
# changeset diff rendering.
#
# Walks one or more result directories looking for hadoop.log files and, for
# each directory containing one, generates a timing report (timing.csv) by
# parsing hadoop.log, per-task import logs, and — when present —
# data_locality.csv and per-item convert logs.
#
# NOTE(review): the first ~26 lines of the real file (shebang, strict/warnings,
# the Time::Local import, etc.) are outside the visible diff and are assumed
# unchanged; `timelocal` and `natsort` are presumed imported there and via
# Sort::Key::Natural respectively — confirm against the full file.
###############################################################################

# Configuration: every non-option command-line argument naming a directory is
# searched (recursively) for hadoop.log files to report on.
my $offset = 0;
my $dir_count = 0;
while (defined $ARGV[$offset])
{
  my $argument = $ARGV[$offset];
  if ($argument =~ /^-/)
  {
    # No options are currently understood; they are simply skipped.
    # [fixed] the original only incremented $offset in the else branch, so any
    # "-flag" argument caused an infinite loop here.
  }
  else
  {
    # [fixed] absolute paths were previously broken by unconditionally
    # prefixing the current working directory.
    my $path = ($argument =~ m/^\//) ? $argument : getcwd() . '/' . $argument;
    if (-d $path)
    {
      searchForHadoopLog($path);
      $dir_count++;
    }
  }
  $offset++;
}
if ($dir_count == 0)
{
  die("usage: hadoop_report.pl <path to results>\n");
}

print "\n\n================================== Complete ====================================\n\n";
exit;

## @function searchForHadoopLog($dir)
#  Depth-first search of $dir for directories containing a hadoop.log; each
#  such directory has a report generated for it via generateReport(). Entries
#  whose names begin with '.' are ignored (this also skips '.' and '..').
#  Dies if a directory cannot be opened for reading.
sub searchForHadoopLog
{
  my ($dir) = @_;
  if (-d $dir)
  {
    print ' * Searching for hadoop.log: ' . $dir . "\n";
    my $hadoop_log_path = $dir . '/hadoop.log';
    if (-f $hadoop_log_path)
    {
      generateReport($dir);
    }
    # Search recursively. The directory handle is lexical and is closed before
    # recursing, so nested calls cannot clobber it.
    if (opendir(my $dh, $dir))
    {
      my @files = readdir($dh);
      closedir($dh);
      foreach my $file (@files)
      {
        my $path = $dir . '/' . $file;
        # ignore files starting with .
        if ($file =~ /^\./)
        {
        }
        elsif (-d $path)
        {
          searchForHadoopLog($path);
        }
      }
    }
    else
    {
      die('Error! Failed to open directory for reading: ' . $dir);
    }
  }
}

## @function generateReport($results_path)
#  Parses the logs found in $results_path and writes $results_path/timing.csv
#  containing one master (job) row, one row per compute node (host#cpu), and
#  one row per map task. Dies on unreadable/unwritable files or when no
#  start/end timestamps can be parsed from hadoop.log.
sub generateReport
{
  my ($results_path) = @_;

  # --- Read in hadoop.log and parse the top-level job record -----------------
  print ' * Reading and parsing "hadoop.log"... ';
  my $job_record = {'host'=>'', 'job'=>'', 'start'=>0, 'end'=>0, 'cpu_time'=>0};
  my $hadoop_log_path = $results_path . '/hadoop.log';
  if (open(my $hlin, '<:utf8', $hadoop_log_path))
  {
    while (my $line = <$hlin>)
    {
      if ($line =~ /host:(.+)/)
      {
        $job_record->{'host'} = $1;
      }
      elsif ($line =~ /Running job: job_(\d+_\d+)/)
      {
        $job_record->{'job'} = $1;
      }
      elsif ($line =~ /CPU time spent \(ms\)=(\d+)/)
      {
        $job_record->{'cpu_time'} = $1;
      }
      elsif ($job_record->{'start'} == 0 && $line =~ /(\d\d)\/(\d\d)\/(\d\d) (\d\d):(\d\d):(\d\d)/)
      {
        # First timestamp seen marks the job start (log dates are YY/MM/DD).
        $job_record->{'start'} = timelocal($6, $5, $4, $3, ($2 - 1), $1);
      }
      elsif ($line =~ /(\d\d)\/(\d\d)\/(\d\d) (\d\d):(\d\d):(\d\d)/)
      {
        # Latest timestamp seen marks the job end.
        my $end = timelocal($6, $5, $4, $3, ($2 - 1), $1);
        if ($end > $job_record->{'end'})
        {
          $job_record->{'end'} = $end;
        }
      }
    }
    close($hlin);
    if ($job_record->{'start'} == 0 || $job_record->{'end'} == 0)
    {
      die('Error! Failed to parse timing information from log: ' . $hadoop_log_path);
    }
  }
  else
  {
    die('Error! Failed to open file for reading: ' . $hadoop_log_path);
  }
  print "Done!\n";

  # --- Read in data_locality.csv (will be matched to task logs) --------------
  my $data_was_local = {};
  my $data_locality_csv_path = $results_path . '/data_locality.csv';
  if (-f $data_locality_csv_path)
  {
    print ' * Reading and parsing "data_locality.csv"... ';
    if (open(my $dlin, '<:utf8', $data_locality_csv_path))
    {
      while (my $line = <$dlin>)
      {
        # note that the line may begin with a taskid or just a taskno (legacy)
        if ($line =~ /(\d+),\d,(\d)/)
        {
          $data_was_local->{$1} = $2;
        }
      }
      close($dlin);
    }
    else
    {
      die('Error! Failed to open file for reading: ' . $data_locality_csv_path);
    }
    print "Done!\n";
  }
  else
  {
    print " * Data locality not available or not applicable\n";
  }

  # --- Read in all task logs and parse task records ---------------------------
  my $task_records;
  print " * Locating task logs...\n";
  if (opendir(my $dh, $results_path))
  {
    my @files = readdir($dh);
    foreach my $file (sort @files)
    {
      if ($file =~ /import-hadoop-(\d+_\d+)_m_(\d+)_\d+\.log/)
      {
        my $job_no = $1;
        my $task_no = $2;
        my $is_data_local = 0;
        if (defined ($data_was_local->{$task_no}))
        {
          $is_data_local = $data_was_local->{$task_no};
        }
        my $task_record = {'host'=>'', 'cpu'=>0, 'job' => $job_no, 'task' => $task_no, 'start'=>0, 'end'=>0, 'cpu_time'=>0, 'data_locality'=>$is_data_local, 'file'=>'', 'percom'=>'NA'};
        print ' - Reading and parsing "' . $file . '"... ';
        my $task_log_path = $results_path . '/' . $file;
        my $io_time = 0;
        if (open(my $tin, '<:utf8', $task_log_path))
        {
          my $io_start_time = 0;
          while (my $line = <$tin>)
          {
            if ($line =~ /\[Started:(\d+(?:\.\d+)?)\]/)
            {
              $task_record->{'start'} = $1;
            }
            elsif ($line =~ /\[Host:([^\]]+)\]/)
            {
              $task_record->{'host'} = $1;
            }
            elsif ($line =~ /\[CPU:(\d+)\]/)
            {
              $task_record->{'cpu'} = $1;
            }
            elsif ($line =~ /\[Map:([^\>]+)=>/)
            {
              $task_record->{'file'} = $1;
            }
            elsif ($line =~ /\[IOS:(\d+)\]/)
            {
              # Start of an IO span; paired with the next IOE marker.
              $io_start_time = $1;
            }
            elsif ($io_start_time > 0 && $line =~ /\[IOE:(\d+)\]/)
            {
              $io_time += ($1 - $io_start_time);
              $io_start_time = 0;
            }
            elsif ($line =~ /\[Completed:(\d+(?:\.\d+)?)\]/)
            {
              my $end_time = $1;
              $task_record->{'end'} = $end_time;
              # An IO span still open at completion is closed at end time.
              if ($io_start_time > 0)
              {
                $io_time += ($end_time - $io_start_time);
              }
            }
          }
          close($tin);
          # Calculate CPU time (total time - IO time)
          $task_record->{'cpu_time'} = $task_record->{'end'} - $task_record->{'start'} - $io_time;

          # We should now have the filename - use this and try and locate a
          # convert log for this item (assuming it is multimedia, which it may
          # not be)
          if (defined $task_record->{'file'} && $task_record->{'file'} =~ /\/([^\/]+)\.ts/)
          {
            my $filename_sans_extension = $1;
            my $convert_log = $results_path . '/convert-' . $filename_sans_extension . '.log';
            if (-f $convert_log)
            {
              print '[Reading and parsing convert log]... ';
              if (open(my $clin, '<:utf8', $convert_log))
              {
                # Record the furthest encode progress reported in the log.
                my $max_percent = 0.00;
                while (my $line = <$clin>)
                {
                  if ($line =~ /.*Encoding: task 1 of 1, (\d+\.\d\d) \%/)
                  {
                    my $percent = $1;
                    if ($percent > $max_percent)
                    {
                      $max_percent = $percent;
                    }
                  }
                }
                close($clin);
                $task_record->{'percom'} = $max_percent;
              }
              else
              {
                print STDERR "Warning! Failed to open log file for reading: " . $convert_log . "\n";
              }
            }
          }

          # Store this record (keyed by task number; a retried task would
          # overwrite the earlier attempt's record).
          $task_records->{$task_no} = $task_record;
        }
        else
        {
          die('Error! Failed to open file for reading: ' . $task_log_path);
        }
        print "Done!\n";
      }
    }
    # [fixed] the original called close() on a directory handle; directory
    # handles must be released with closedir().
    closedir($dh);
  }
  else
  {
    die('Error! Failed to open directory for reading: ' . $results_path);
  }
  print ' - Processed ' . scalar(keys(%{$task_records})) . " records\n";

  # --- Generate compute-node records (one per host#cpu pair) ------------------
  print ' * Generating compute node records... ';
  my $node_records;
  foreach my $taskno (sort keys %{$task_records})
  {
    my $task_record = $task_records->{$taskno};
    my $node_record = {'host'=>'', 'cpu'=>0, 'job' => '', 'start'=>0, 'end'=>0, 'cpu_time'=>0};
    # - retrieve any existing record
    my $worker_id = $task_record->{'host'} . '#' . $task_record->{'cpu'};
    if (defined $node_records->{$worker_id})
    {
      $node_record = $node_records->{$worker_id};
    }
    if ($node_record->{'host'} eq '')
    {
      $node_record->{'host'} = $task_record->{'host'};
    }
    if ($node_record->{'cpu'} == 0)
    {
      $node_record->{'cpu'} = $task_record->{'cpu'};
    }
    if ($node_record->{'job'} eq '')
    {
      $node_record->{'job'} = $task_record->{'job'};
    }
    # Node span is the union of its tasks' spans; CPU time is their sum.
    if ($node_record->{'start'} == 0 || $task_record->{'start'} < $node_record->{'start'})
    {
      $node_record->{'start'} = $task_record->{'start'};
    }
    if ($node_record->{'end'} == 0 || $task_record->{'end'} > $node_record->{'end'})
    {
      $node_record->{'end'} = $task_record->{'end'};
    }
    $node_record->{'cpu_time'} += $task_record->{'cpu_time'};
    # - store it
    $node_records->{$worker_id} = $node_record;
  }
  print "Done!\n";

  # --- Write out CSV of all information ---------------------------------------
  my $report_csv_path = $results_path . '/timing.csv';
  if (open(my $csvout, '>:utf8', $report_csv_path))
  {
    my $row_counter = 1;
    # Header
    print $csvout "id,name,hostname,start,end,cputime,dl,pid,filename,percom\n";
    # Master Record (job CPU time is logged in ms; CSV uses seconds)
    print $csvout $row_counter . ',M0,' . $job_record->{'host'} . ',' . $job_record->{'start'} . ',' . $job_record->{'end'} . ',' . ($job_record->{'cpu_time'} / 1000) . ",0,0,NA,NA\n";
    $row_counter++;
    # For each compute node record
    my $known_workers = {};
    foreach my $worker_id (natsort(keys(%{$node_records})))
    {
      my $node_record = $node_records->{$worker_id};
      my $node_id = $row_counter;
      $row_counter++;
      my $csv_worker_id = 'W' . $node_record->{'cpu'};
      # Ensure we haven't used this id before - this should never trigger for
      # multicore CPUs, but will for clusters (as nearly all nodes will report
      # themselves as 'W0')
      if (defined $known_workers->{$csv_worker_id})
      {
        # Find a different worker id as this one is already in use
        my $counter = 0;
        $csv_worker_id = 'W' . $counter;
        while (defined $known_workers->{$csv_worker_id})
        {
          $counter++;
          $csv_worker_id = 'W' . $counter;
        }
      }
      $known_workers->{$csv_worker_id} = 1;
      print $csvout $node_id . ',' . $csv_worker_id . ',' . $node_record->{'host'} . ',' . $node_record->{'start'} . ',' . $node_record->{'end'} . ',' . $node_record->{'cpu_time'} . ",0,1,NA,NA\n";
      # List the child task records
      foreach my $taskno (sort keys %{$task_records})
      {
        my $task_record = $task_records->{$taskno};
        if ($task_record->{'host'} . '#' . $task_record->{'cpu'} eq $worker_id)
        {
          print $csvout $row_counter . ',T' . ($task_record->{'task'} + 0) . ',' . $task_record->{'host'} . ',' . $task_record->{'start'} . ',' . $task_record->{'end'} . ',' . $task_record->{'cpu_time'} . ',' . $task_record->{'data_locality'} . ',' . $node_id . ',' . $task_record->{'file'} . ',' . $task_record->{'percom'} . "\n";
          $row_counter++;
        }
      }
    }
    close($csvout);
  }
  else
  {
    die('Error! Failed to open file for writing: ' . $report_csv_path);
  }
}

1;
Note:
See TracChangeset
for help on using the changeset viewer.