Changeset 28665

Show
Ignore:
Timestamp:
21.11.2013 13:35:52 (6 years ago)
Author:
jmt12
Message:

Latest changes to workaround resumed syscalls massive duration problem

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/bin/script/import_with_io_metric.pl

    r28646 r28665  
    2424# strace_analyzer.pl would use 
    2525print " * Initializing... Done\n"; 
     26our $strace_flags = '-f -q -s 256 -T -ttt'; 
    2627our $io_function_list = { 
    2728                         'access'=>1, 
     
    5859                         'write'=>1 
    5960                        }; 
    60  
    6161my $start_time = [gettimeofday()]; 
    6262my $collection = ''; 
    6363my $debug = 0; 
    6464my $flush_delay = 3; 
     65 
    6566GetOptions ('collection=s' => \$collection, 
    6667            'debug' => \$debug, 
    6768            'flushdelay=i' => \$flush_delay) 
    6869or &printUsage('Invalid or missing argument'); 
     70 
    6971if (!defined $collection || $collection eq '') 
    7072{ 
     
    8890print " * Running baseline collection import... "; 
    8991my $dummy_dir = tempdir( CLEANUP => 1); 
    90 my $command1 = 'time -p strace -f -q -T -ttt -s 4096 import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1 1>/dev/null'; 
     92my $command1 = 'strace ' . $strace_flags . ' import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1'; 
    9193my $debug_path1 = ''; 
    9294if ($debug) 
    9395{ 
    94   $debug_path1 = $collection_path . '/debug-dummy.csv'; 
    95 } 
    96 my ($baseline_duration, $baseline_io, $baseline_cpu) = &parseStrace($command1, $collection_path, $debug_path1); 
     96  $debug_path1 = $collection_path . '/debug-baseline.tsv'; 
     97} 
     98my ($baseline_duration, $baseline_io, $baseline_misc) = &parseStrace($command1, $collection_path, $debug_path1); 
    9799print "Done\n"; 
    98100print '   - Duration:      ' . sprintf('%0.6f', $baseline_duration) . " seconds\n"; 
    99 print '   - I/O Duration:  ' . sprintf('%0.6f', $baseline_io) . " seconds\n"; 
     101print '   - SysCall Time:  ' . sprintf('%0.6f', $baseline_io + $baseline_misc) . " seconds\n"; 
     102print '   - SysCall Prct:  ' . sprintf('%d', (($baseline_io + $baseline_misc) / $baseline_duration) * 100) . "%\n"; 
     103print '   - I/O Time:      ' . sprintf('%0.6f', $baseline_io) . " seconds\n"; 
    100104print '   - I/O Percent:   ' . sprintf('%d', ($baseline_io / $baseline_duration) * 100) . "%\n"; 
    101105if ($debug) 
    102106{ 
    103   print "   - See 'debug-dummy.csv' for raw data\n"; 
     107  print "   - See 'debug-baseline.tsv' for raw data\n"; 
    104108} 
    105109print "\n"; 
    106 exit; 
    107110 
    108111# 3. Run normal import command within strace reading output line by line 
     
    110113&flushDiskCache($flush_delay); 
    111114print " * Running import and tracking I/O metrics... "; 
    112 my $command2 = 'strace -f -q -T -ttt -s 256 import.pl -removeold "' . $collection . '" 2>&1'; 
     115my $command2 = 'strace ' . $strace_flags . ' import.pl -removeold "' . $collection . '" 2>&1'; 
    113116my $debug_path2 = ''; 
    114117if ($debug) 
    115118{ 
    116   $debug_path2 = $collection_path . '/debug-import.csv'; 
    117 } 
    118 my ($duration_import, $duration_io, $duration_cpu, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2); 
     119  $debug_path2 = $collection_path . '/debug-import.tsv'; 
     120} 
     121my ($duration_import, $duration_io, $duration_misc, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2); 
    119122print "Done\n"; 
    120123print '   - Import?        ' . ( $import_complete ? 'Completed' : 'Failed') . "\n"; 
     
    122125print '   - Processed:     ' . $docs_processed . ' document' . (($docs_processed > 1) ? 's' : '') . "\n"; 
    123126print '   - Duration:      ' . sprintf('%0.6f', $duration_import) . " seconds\n"; 
     127print '   - SysCall Time:  ' . sprintf('%0.6f', $duration_io + $duration_misc) . " seconds\n"; 
     128print '   - SysCall Prct:  ' . sprintf('%d', (($duration_io + $duration_misc) / $duration_import) * 100) . "%\n"; 
    124129print '   - I/O Duration:  ' . sprintf('%0.6f', $duration_io) . " seconds\n"; 
    125130print '   - I/O Percent:   ' . sprintf('%d', ($duration_io / $duration_import) * 100) . "%\n"; 
     
    128133if ($debug) 
    129134{ 
    130   print "   - See 'debug-import.csv' for raw data\n"; 
     135  print "   - See 'debug-import.tsv' for raw data\n"; 
    131136} 
    132137print "\n"; 
     
    237242  my $end_timestamp = 0; 
    238243  my $io_duration = 0; 
    239   my $cpu_duration = 0; 
     244  my $misc_duration = 0; 
    240245  my $import_complete = 0; 
    241246  my $documents_found = 0; 
     
    258263  if ($debug_path) 
    259264  { 
    260     open(DEBUGOUT, '>:utf8', $debug_path) or die('Error! Failed to open file for writing: '  . $logs_path . '/' . $debug); 
    261     print DEBUGOUT "TIMESTAMP,PID,SYSCALL,DURATION,TOTALIO,TOTALCPU,ARGS,RESULT\n"; 
     265    open(TSVOUT, '>:utf8', $debug_path) or die('Error! Failed to open file for writing: '  . $logs_path . '/' . $debug); 
     266    print TSVOUT "TIMESTAMP\tPID\tSYSCALL\tELAPSED\tTOTALIO\tTOTALMISC\tARGS\tRESULT\n"; 
    262267  } 
    263268 
     
    265270  { 
    266271    my $pid = 0; 
    267     my $line; 
    268     while ($line = <GSI>) 
     272    my $line = <GSI>; 
     273    while ($line) 
    269274    { 
     275      my $next_line = <GSI>; 
    270276      if ($logging_enabled) 
    271277      { 
     
    277283      while ($line !~ /<unfinished \.\.\.>/ && $line !~ /SIG/ && $line =~ /\(/ && $line !~ /\)\s+=/) 
    278284      { 
    279         my $new_line = ''; 
    280         if ($new_line = <GSI>) 
    281         { 
    282           $line .= ' ' . $new_line; 
     285        $line .= ' ' . $next_line; 
     286        if ($next_line = <GSI>) 
     287        { 
    283288        } 
    284289        else 
     
    289294      } 
    290295      # detect and remove any PID information to make the rest of parsing easier 
    291       if ($line =~ /^\[pid (\d+)\]\s+(.*)$/s) 
     296      if ($line =~ /^\[pid\s+(\d+)\]\s+(.*)$/s) 
    292297      { 
    293298        my $new_pid = $1; 
     
    325330          } 
    326331        } 
    327         $line = $timestamp . ' ' . $syscall . '(' . $args_prefix . ' ' . $args_suffix . ') = ' . $result . ' <' . $duration . '>'; 
     332 
     333        # Time for a hack - duration is highly dubious for suspended 
     334        # syscalls. It typically includes the total walltime spent doing 
     335        # anything between when the 'unfinished' and the 'resumed'... time that 
     336        # is already accounted for in other parsed syscalls.  What I'll do is 
     337        # limit the resumed calls duration to, at most, the time difference 
     338        # between this syscall and the next. This is highly inaccurate, of 
     339        # course, as it includes time spent in userspace but is significantly 
     340        # better than a elasped duration several times longer than the syscall 
     341        # actually took. 
     342        if ($next_line =~ /^.*?(\d+\.\d+)/) 
     343        { 
     344          my $next_timestamp = $1; 
     345          my $timestamp_duration = $next_timestamp - $timestamp; 
     346          if ($duration > $timestamp_duration) 
     347          { 
     348            #rint "[resume: elapsed longer than duration... fudging] "; 
     349            $duration = $timestamp_duration; 
     350          } 
     351        } 
     352 
     353        $line = $timestamp . ' ' . $syscall . '(' . $args_prefix . ' ' . $args_suffix . ') = ' . $result . ' <' . sprintf('%0.6f', $duration) . '>'; 
    328354      } 
    329355 
     
    346372      } 
    347373      # - processes announce detachments and resumptions - maybe this explains 
    348       # why wait4's keep chaning 
     374      # why wait4's keep chaning (-q should shut these up) 
    349375      elsif ($line =~ /^Process (\d+) (detached|resumed)/) 
    350376      { 
     
    353379      } 
    354380      # - exit_group never has a duration 
    355       elsif ($line =~ /^(\d+\.\d+)\s+exit_group\(0\)\s+=\s+\?$/) 
     381      elsif ($line =~ /^(\d+\.\d+)\s+exit_group\((\d+)\)\s+=\s+\?$/) 
    356382      { 
    357383        my $timestamp = $1; 
     384        my $exit_value = $2; 
    358385        $end_timestamp = $timestamp; 
    359386        if ($debug) 
    360387        { 
    361           print DEBUGOUT $timestamp . ',' . $pid . ',"exit_group",0,' . $io_duration . ',' . $cpu_duration . ",\"0\",\"\"\n"; 
     388          print TSVOUT sprintf("%0.6f", ($timestamp - $start_timestamp)) . "\t"; 
     389          print TSVOUT $pid . "\t"; 
     390          print TSVOUT "exit_group\t"; 
     391          print TSVOUT "0.000000\t"; 
     392          print TSVOUT sprintf("%0.6f", $misc_duration) . "\t"; 
     393          print TSVOUT sprintf("%0.6f", $io_duration) . "\t"; 
     394          print TSVOUT $exit_value . "\t"; 
     395          print TSVOUT "?\n"; 
    362396        } 
    363397      } 
     
    389423        else 
    390424        { 
    391           $cpu_duration += $duration; 
     425          $misc_duration += $duration; 
    392426          ##print "[$syscall]"; 
    393427        } 
     
    417451        if ($debug) 
    418452        { 
    419           $args =~ s/\"/&quot;/g; 
    420453          $args =~ s/\r?\n//g; 
    421           print DEBUGOUT $timestamp . ',' . $pid . ',"' . $syscall . '",' . $duration . ',' . $io_duration . ',' . $cpu_duration . ',"' . $args . '","' . $result . "\"\n"; 
     454          print TSVOUT sprintf("%0.6f", ($timestamp - $start_timestamp)) . "\t"; 
     455          print TSVOUT $pid . "\t"; 
     456          print TSVOUT $syscall . "\t"; 
     457          print TSVOUT sprintf("%0.6f", $duration) . "\t"; 
     458          print TSVOUT sprintf("%0.6f", $misc_duration) . "\t"; 
     459          print TSVOUT sprintf("%0.6f", $io_duration) . "\t"; 
     460          print TSVOUT $args . "\t"; 
     461          print TSVOUT $result . "\n"; 
    422462        } 
    423463      } 
     
    427467        print "\n\nUnparsed: |" . $line . "|\n\n"; 
    428468      } 
     469      # Move on to the next line... (may be undef) 
     470      $line = $next_line; 
    429471    } 
    430472  } 
     
    440482  if ($debug_path) 
    441483  { 
    442     close(DEBUGOUT); 
     484    close(TSVOUT); 
    443485  } 
    444486  my $cmd_duration = $end_timestamp - $start_timestamp; 
    445   return ($cmd_duration, $io_duration, $cpu_duration, $import_complete, $documents_found, $documents_processed); 
     487  return ($cmd_duration, $io_duration, $misc_duration, $import_complete, $documents_found, $documents_processed); 
    446488} 
    447489# parseStrace()