Changeset 28665


Ignore:
Timestamp:
11/21/13 13:35:52 (7 years ago)
Author:
jmt12
Message:

Latest changes to workaround resumed syscalls massive duration problem

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/bin/script/import_with_io_metric.pl

    r28646 r28665  
    2424# strace_analyzer.pl would use
    2525print " * Initializing... Done\n";
     26our $strace_flags = '-f -q -s 256 -T -ttt';
    2627our $io_function_list = {
    2728                         'access'=>1,
     
    5859                         'write'=>1
    5960                        };
    60 
    6161my $start_time = [gettimeofday()];
    6262my $collection = '';
    6363my $debug = 0;
    6464my $flush_delay = 3;
     65
    6566GetOptions ('collection=s' => \$collection,
    6667            'debug' => \$debug,
    6768            'flushdelay=i' => \$flush_delay)
    6869or &printUsage('Invalid or missing argument');
     70
    6971if (!defined $collection || $collection eq '')
    7072{
     
    8890print " * Running baseline collection import... ";
    8991my $dummy_dir = tempdir( CLEANUP => 1);
    90 my $command1 = 'time -p strace -f -q -T -ttt -s 4096 import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1 1>/dev/null';
     92my $command1 = 'strace ' . $strace_flags . ' import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1';
    9193my $debug_path1 = '';
    9294if ($debug)
    9395{
    94   $debug_path1 = $collection_path . '/debug-dummy.csv';
    95 }
    96 my ($baseline_duration, $baseline_io, $baseline_cpu) = &parseStrace($command1, $collection_path, $debug_path1);
     96  $debug_path1 = $collection_path . '/debug-baseline.tsv';
     97}
     98my ($baseline_duration, $baseline_io, $baseline_misc) = &parseStrace($command1, $collection_path, $debug_path1);
    9799print "Done\n";
    98100print '   - Duration:      ' . sprintf('%0.6f', $baseline_duration) . " seconds\n";
    99 print '   - I/O Duration:  ' . sprintf('%0.6f', $baseline_io) . " seconds\n";
     101print '   - SysCall Time:  ' . sprintf('%0.6f', $baseline_io + $baseline_misc) . " seconds\n";
     102print '   - SysCall Prct:  ' . sprintf('%d', (($baseline_io + $baseline_misc) / $baseline_duration) * 100) . "%\n";
     103print '   - I/O Time:      ' . sprintf('%0.6f', $baseline_io) . " seconds\n";
    100104print '   - I/O Percent:   ' . sprintf('%d', ($baseline_io / $baseline_duration) * 100) . "%\n";
    101105if ($debug)
    102106{
    103   print "   - See 'debug-dummy.csv' for raw data\n";
     107  print "   - See 'debug-baseline.tsv' for raw data\n";
    104108}
    105109print "\n";
    106 exit;
    107110
    108111# 3. Run normal import command within strace reading output line by line
     
    110113&flushDiskCache($flush_delay);
    111114print " * Running import and tracking I/O metrics... ";
    112 my $command2 = 'strace -f -q -T -ttt -s 256 import.pl -removeold "' . $collection . '" 2>&1';
     115my $command2 = 'strace ' . $strace_flags . ' import.pl -removeold "' . $collection . '" 2>&1';
    113116my $debug_path2 = '';
    114117if ($debug)
    115118{
    116   $debug_path2 = $collection_path . '/debug-import.csv';
    117 }
    118 my ($duration_import, $duration_io, $duration_cpu, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2);
     119  $debug_path2 = $collection_path . '/debug-import.tsv';
     120}
     121my ($duration_import, $duration_io, $duration_misc, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2);
    119122print "Done\n";
    120123print '   - Import?        ' . ( $import_complete ? 'Completed' : 'Failed') . "\n";
     
    122125print '   - Processed:     ' . $docs_processed . ' document' . (($docs_processed > 1) ? 's' : '') . "\n";
    123126print '   - Duration:      ' . sprintf('%0.6f', $duration_import) . " seconds\n";
     127print '   - SysCall Time:  ' . sprintf('%0.6f', $duration_io + $duration_misc) . " seconds\n";
     128print '   - SysCall Prct:  ' . sprintf('%d', (($duration_io + $duration_misc) / $duration_import) * 100) . "%\n";
    124129print '   - I/O Duration:  ' . sprintf('%0.6f', $duration_io) . " seconds\n";
    125130print '   - I/O Percent:   ' . sprintf('%d', ($duration_io / $duration_import) * 100) . "%\n";
     
    128133if ($debug)
    129134{
    130   print "   - See 'debug-import.csv' for raw data\n";
     135  print "   - See 'debug-import.tsv' for raw data\n";
    131136}
    132137print "\n";
     
    237242  my $end_timestamp = 0;
    238243  my $io_duration = 0;
    239   my $cpu_duration = 0;
     244  my $misc_duration = 0;
    240245  my $import_complete = 0;
    241246  my $documents_found = 0;
     
    258263  if ($debug_path)
    259264  {
    260     open(DEBUGOUT, '>:utf8', $debug_path) or die('Error! Failed to open file for writing: '  . $logs_path . '/' . $debug);
    261     print DEBUGOUT "TIMESTAMP,PID,SYSCALL,DURATION,TOTALIO,TOTALCPU,ARGS,RESULT\n";
     265    open(TSVOUT, '>:utf8', $debug_path) or die('Error! Failed to open file for writing: '  . $logs_path . '/' . $debug);
     266    print TSVOUT "TIMESTAMP\tPID\tSYSCALL\tELAPSED\tTOTALIO\tTOTALMISC\tARGS\tRESULT\n";
    262267  }
    263268
     
    265270  {
    266271    my $pid = 0;
    267     my $line;
    268     while ($line = <GSI>)
     272    my $line = <GSI>;
     273    while ($line)
    269274    {
     275      my $next_line = <GSI>;
    270276      if ($logging_enabled)
    271277      {
     
    277283      while ($line !~ /<unfinished \.\.\.>/ && $line !~ /SIG/ && $line =~ /\(/ && $line !~ /\)\s+=/)
    278284      {
    279         my $new_line = '';
    280         if ($new_line = <GSI>)
    281         {
    282           $line .= ' ' . $new_line;
     285        $line .= ' ' . $next_line;
     286        if ($next_line = <GSI>)
     287        {
    283288        }
    284289        else
     
    289294      }
    290295      # detect and remove any PID information to make the rest of parsing easier
    291       if ($line =~ /^\[pid (\d+)\]\s+(.*)$/s)
     296      if ($line =~ /^\[pid\s+(\d+)\]\s+(.*)$/s)
    292297      {
    293298        my $new_pid = $1;
     
    325330          }
    326331        }
    327         $line = $timestamp . ' ' . $syscall . '(' . $args_prefix . ' ' . $args_suffix . ') = ' . $result . ' <' . $duration . '>';
     332
     333        # Time for a hack - duration is highly dubious for suspended
     334        # syscalls. It typically includes the total walltime spent doing
     335        # anything between when the 'unfinished' and the 'resumed'... time that
     336        # is already accounted for in other parsed syscalls.  What I'll do is
     337        # limit the resumed calls duration to, at most, the time difference
     338        # between this syscall and the next. This is highly inaccurate, of
     339        # course, as it includes time spent in userspace but is significantly
     340        # better than a elasped duration several times longer than the syscall
     341        # actually took.
     342        if ($next_line =~ /^.*?(\d+\.\d+)/)
     343        {
     344          my $next_timestamp = $1;
     345          my $timestamp_duration = $next_timestamp - $timestamp;
     346          if ($duration > $timestamp_duration)
     347          {
     348            #rint "[resume: elapsed longer than duration... fudging] ";
     349            $duration = $timestamp_duration;
     350          }
     351        }
     352
     353        $line = $timestamp . ' ' . $syscall . '(' . $args_prefix . ' ' . $args_suffix . ') = ' . $result . ' <' . sprintf('%0.6f', $duration) . '>';
    328354      }
    329355
     
    346372      }
    347373      # - processes announce detachments and resumptions - maybe this explains
    348       # why wait4's keep chaning
     374      # why wait4's keep chaning (-q should shut these up)
    349375      elsif ($line =~ /^Process (\d+) (detached|resumed)/)
    350376      {
     
    353379      }
    354380      # - exit_group never has a duration
    355       elsif ($line =~ /^(\d+\.\d+)\s+exit_group\(0\)\s+=\s+\?$/)
     381      elsif ($line =~ /^(\d+\.\d+)\s+exit_group\((\d+)\)\s+=\s+\?$/)
    356382      {
    357383        my $timestamp = $1;
     384        my $exit_value = $2;
    358385        $end_timestamp = $timestamp;
    359386        if ($debug)
    360387        {
    361           print DEBUGOUT $timestamp . ',' . $pid . ',"exit_group",0,' . $io_duration . ',' . $cpu_duration . ",\"0\",\"\"\n";
     388          print TSVOUT sprintf("%0.6f", ($timestamp - $start_timestamp)) . "\t";
     389          print TSVOUT $pid . "\t";
     390          print TSVOUT "exit_group\t";
     391          print TSVOUT "0.000000\t";
     392          print TSVOUT sprintf("%0.6f", $misc_duration) . "\t";
     393          print TSVOUT sprintf("%0.6f", $io_duration) . "\t";
     394          print TSVOUT $exit_value . "\t";
     395          print TSVOUT "?\n";
    362396        }
    363397      }
     
    389423        else
    390424        {
    391           $cpu_duration += $duration;
     425          $misc_duration += $duration;
    392426          ##print "[$syscall]";
    393427        }
     
    417451        if ($debug)
    418452        {
    419           $args =~ s/\"/&quot;/g;
    420453          $args =~ s/\r?\n//g;
    421           print DEBUGOUT $timestamp . ',' . $pid . ',"' . $syscall . '",' . $duration . ',' . $io_duration . ',' . $cpu_duration . ',"' . $args . '","' . $result . "\"\n";
     454          print TSVOUT sprintf("%0.6f", ($timestamp - $start_timestamp)) . "\t";
     455          print TSVOUT $pid . "\t";
     456          print TSVOUT $syscall . "\t";
     457          print TSVOUT sprintf("%0.6f", $duration) . "\t";
     458          print TSVOUT sprintf("%0.6f", $misc_duration) . "\t";
     459          print TSVOUT sprintf("%0.6f", $io_duration) . "\t";
     460          print TSVOUT $args . "\t";
     461          print TSVOUT $result . "\n";
    422462        }
    423463      }
     
    427467        print "\n\nUnparsed: |" . $line . "|\n\n";
    428468      }
     469      # Move on to the next line... (may be undef)
     470      $line = $next_line;
    429471    }
    430472  }
     
    440482  if ($debug_path)
    441483  {
    442     close(DEBUGOUT);
     484    close(TSVOUT);
    443485  }
    444486  my $cmd_duration = $end_timestamp - $start_timestamp;
    445   return ($cmd_duration, $io_duration, $cpu_duration, $import_complete, $documents_found, $documents_processed);
     487  return ($cmd_duration, $io_duration, $misc_duration, $import_complete, $documents_found, $documents_processed);
    446488}
    447489# parseStrace()
Note: See TracChangeset for help on using the changeset viewer.