Changeset 28767 for gs2-extensions

Show
Ignore:
Timestamp:
18.12.2013 10:21:53 (6 years ago)
Author:
jmt12
Message:

Drastically increased the script to allow 1) battery of imports backed by database of tests, 2) printing of reported in dokuwiki format, and 3) calculation of statistical information (STDDEV) and outliers (although I don't do anything with that information... yet)

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/bin/script/import_with_io_metric.pl

    r28665 r28767  
    55use warnings; 
    66# Libraries 
    7 use File::Path qw( remove_tree ); 
    8 use File::Temp qw( tempdir ); 
     7use File::Path   qw( remove_tree ); 
     8use File::Temp   qw( tempdir ); 
    99use Getopt::Long; 
    10 use Time::HiRes qw( gettimeofday tv_interval ); 
     10use POSIX        qw( strftime ); 
     11use Time::HiRes  qw( gettimeofday tv_interval ); 
    1112 
    1213BEGIN 
    1314{ 
    14   print "======================= Greenstone Import + I/O Metrics ======================\n\n"; 
     15  print "\n======================= Greenstone Import + I/O Metrics ======================\n\n"; 
    1516  if (!defined $ENV{'GSDLHOME'}) 
    1617  { 
     
    1920  } 
    2021} 
     22 
     23print "Prepare and run a number of collection imports while recording I/O metrics.\n\n"; 
    2124 
    2225# 1. Parse and sanitize arguments - the listing of syscalls whose duration we 
     
    2427# strace_analyzer.pl would use 
    2528print " * Initializing... Done\n"; 
     29my $machine_name = `hostname -s`; 
     30chomp($machine_name); 
     31$machine_name = ucfirst($machine_name); 
     32my $os_name = `lsb_release -i`; 
     33$os_name =~ s/^Distributor ID:\s+(.*)\r?\n$/$1/i; 
     34my $fs_name = `df -T $ENV{'GSDLHOME'}`; 
     35$fs_name =~ s/^.*(ext2|ext3|ext4|xfs|zfs).*$/$1/is; 
     36$fs_name = uc($fs_name); 
     37my $start_time = [gettimeofday()]; 
     38my @collections; 
     39my $print_report = 0; 
    2640our $strace_flags = '-f -q -s 256 -T -ttt'; 
    2741our $io_function_list = { 
     
    4054                         'ftell'=>1, 
    4155                         'getdents'=>1, 
    42                          #'ioctl'=>1, # John added 
     56                         'ioctl'=>1, # John added 
    4357                         'llseek'=>1, 
    4458                         'lockf'=>1, 
     
    5973                         'write'=>1 
    6074                        }; 
    61 my $start_time = [gettimeofday()]; 
    62 my $collection = ''; 
    63 my $debug = 0; 
    64 my $flush_delay = 3; 
    65  
    66 GetOptions ('collection=s' => \$collection, 
     75our $debug = 0; 
     76our $flush_delay = 3; 
     77our $test_runs = 9; 
     78 
     79GetOptions ('collection=s' => \@collections, 
    6780            'debug' => \$debug, 
    68             'flushdelay=i' => \$flush_delay) 
     81            'flushdelay=i' => \$flush_delay, 
     82            'runs=i' => \$test_runs, 
     83            'report' => \$print_report) 
    6984or &printUsage('Invalid or missing argument'); 
    7085 
    71 if (!defined $collection || $collection eq '') 
    72 { 
    73   &printUsage('Missing collection name'); 
    74 } 
    75 my $collection_path = $ENV{'GSDLHOME'} . '/collect/' . $collection; 
    76 if (!-d $collection_path) 
    77 { 
    78   &printUsage('Collection not found: ' . $collection_path); 
    79 } 
    80 print '   - Greenstone:  ' . $ENV{'GSDLHOME'} . "\n"; 
    81 print '   - Collection:  ' . $collection . "\n"; 
    82 print '   - Flush Delay: ' . $flush_delay . "\n"; 
    83 print '   - Debug?       ' . ($debug ? 'Yes' : 'No') . "\n"; 
     86if ($test_runs < 1) 
     87{ 
     88  &printUsage('Test runs must be non-zeo'); 
     89} 
     90my $db_name = 'strace_' . $machine_name . '_' . $os_name . '_' . $fs_name . '.sqlite3db'; 
     91my $db_path = $ENV{'GSDLHOME'} . '/collect/' . $db_name; 
     92 
     93if (0 < $print_report) 
     94{ 
     95  &printReport($db_path); 
     96  exit; 
     97} 
     98 
     99if (0 == scalar(@collections)) 
     100{ 
     101  &printUsage('Missing collection or collections'); 
     102} 
     103 
     104print '   - Greenstone:   ' . $ENV{'GSDLHOME'} . "\n"; 
     105print '   - Collections:  ' . join(',', @collections) . "\n"; 
     106print '   - Test Runs:    ' . $test_runs . "\n"; 
     107print '   - Flush Delay:  ' . $flush_delay . "\n"; 
     108print '   - Debug?        ' . ($debug ? 'Yes' : 'No') . "\n"; 
    84109print "\n"; 
    85110 
    86 # 2. Run dummy import command (empty import directory) within trace to 
    87 # determine baseline 
    88 &clearArchives(); 
    89 &flushDiskCache($flush_delay); 
    90 print " * Running baseline collection import... "; 
    91 my $dummy_dir = tempdir( CLEANUP => 1); 
    92 my $command1 = 'strace ' . $strace_flags . ' import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1'; 
    93 my $debug_path1 = ''; 
    94 if ($debug) 
    95 { 
    96   $debug_path1 = $collection_path . '/debug-baseline.tsv'; 
    97 } 
    98 my ($baseline_duration, $baseline_io, $baseline_misc) = &parseStrace($command1, $collection_path, $debug_path1); 
    99 print "Done\n"; 
    100 print '   - Duration:      ' . sprintf('%0.6f', $baseline_duration) . " seconds\n"; 
    101 print '   - SysCall Time:  ' . sprintf('%0.6f', $baseline_io + $baseline_misc) . " seconds\n"; 
    102 print '   - SysCall Prct:  ' . sprintf('%d', (($baseline_io + $baseline_misc) / $baseline_duration) * 100) . "%\n"; 
    103 print '   - I/O Time:      ' . sprintf('%0.6f', $baseline_io) . " seconds\n"; 
    104 print '   - I/O Percent:   ' . sprintf('%d', ($baseline_io / $baseline_duration) * 100) . "%\n"; 
    105 if ($debug) 
    106 { 
    107   print "   - See 'debug-baseline.tsv' for raw data\n"; 
    108 } 
     111# 0. Create database a populate with tests (if necessary) 
     112if (!-f $db_path) 
     113{ 
     114  print STDOUT ' * Creating database tables... '; 
     115  execSQL($db_path, 'CREATE TABLE IF NOT EXISTS tests (collection TEXT, test_run INTEGER, test_started INTEGER DEFAULT 0, docs_found INTEGER, docs_processed INTEGER, b_elapsed REAL DEFAULT 0, b_io REAL DEFAULT 0, b_other REAL DEFAULT 0, i_elapsed REAL DEFAULT 0, i_io REAL DEFAULT 0, i_other REAL DEFAULT 0, PRIMARY KEY (collection, test_run))'); 
     116  print STDOUT "Done\n"; 
     117} 
     118print ' * Populating database with test runs as necessary... '; 
     119my $new_test_count = 0; 
     120foreach my $collection (@collections) 
     121{ 
     122  for (my $i = 1; $i <= $test_runs; $i++) 
     123  { 
     124    my $search_sql = "SELECT COUNT(*) FROM tests WHERE collection='" . $collection . "' AND test_run=" . $i; 
     125    my $test_count = &getValueSQL($db_path, $search_sql); 
     126    if (0 == $test_count) 
     127    { 
     128      $new_test_count++; 
     129      my $insert_sql = "INSERT INTO tests (collection, test_run) VALUES ('" . $collection . "'," . $i . ")"; 
     130      &execSQL($db_path, $insert_sql); 
     131    } 
     132  } 
     133} 
     134print $new_test_count . " tests added\n"; 
    109135print "\n"; 
    110136 
    111 # 3. Run normal import command within strace reading output line by line 
    112 &clearArchives(); 
    113 &flushDiskCache($flush_delay); 
    114 print " * Running import and tracking I/O metrics... "; 
    115 my $command2 = 'strace ' . $strace_flags . ' import.pl -removeold "' . $collection . '" 2>&1'; 
    116 my $debug_path2 = ''; 
    117 if ($debug) 
    118 { 
    119   $debug_path2 = $collection_path . '/debug-import.tsv'; 
    120 } 
    121 my ($duration_import, $duration_io, $duration_misc, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2); 
    122 print "Done\n"; 
    123 print '   - Import?        ' . ( $import_complete ? 'Completed' : 'Failed') . "\n"; 
    124 print '   - Found:         ' . $docs_found . ' document' . (($docs_processed > 1) ? 's' : '') . "\n"; 
    125 print '   - Processed:     ' . $docs_processed . ' document' . (($docs_processed > 1) ? 's' : '') . "\n"; 
    126 print '   - Duration:      ' . sprintf('%0.6f', $duration_import) . " seconds\n"; 
    127 print '   - SysCall Time:  ' . sprintf('%0.6f', $duration_io + $duration_misc) . " seconds\n"; 
    128 print '   - SysCall Prct:  ' . sprintf('%d', (($duration_io + $duration_misc) / $duration_import) * 100) . "%\n"; 
    129 print '   - I/O Duration:  ' . sprintf('%0.6f', $duration_io) . " seconds\n"; 
    130 print '   - I/O Percent:   ' . sprintf('%d', ($duration_io / $duration_import) * 100) . "%\n"; 
    131 print "   - See 'import.log' for Greenstone Import details\n"; 
    132 print "   - See 'strace.out' for STrace details\n"; 
    133 if ($debug) 
    134 { 
    135   print "   - See 'debug-import.tsv' for raw data\n"; 
    136 } 
    137 print "\n"; 
    138  
    139 # 4. Results 
    140 print &makeHeader('Import Results', 78) . "\n\n"; 
    141 print " Import Duration:  " . sprintf('%0.6f', ($duration_import - $baseline_duration)) . " seconds\n"; 
    142 print " I/O Duration:     " . sprintf('%0.6f', ($duration_io - $baseline_io)) . " seconds\n"; 
    143 print " I/O Percentage:   " . sprintf('%d', ((($duration_io - $baseline_io)/($duration_import - $baseline_duration)) * 100)) . "%\n"; 
    144 print "\n"; 
    145  
    146 # Complete! 
     137# 1. Continue picking random tests and running them until there are none left 
     138my $total_test_count = &getValueSQL($db_path, 'SELECT COUNT(*) FROM tests'); 
     139my $remaining_test_sql = 'SELECT COUNT(*) FROM tests WHERE test_started=0'; 
     140my $remaining_test_count = getValueSQL($db_path, $remaining_test_sql); 
     141my $exit_file_path = $ENV{'GSDLHOME'} . '/collect/exit.now'; 
     142while ($remaining_test_count ne "0" && !-f $exit_file_path) 
     143{ 
     144  my $test_started = time; 
     145  my $now_string = strftime "%a %b %e %H:%M:%S %Y", localtime($test_started); 
     146  my $random_test_sql = 'SELECT collection, test_run FROM tests WHERE test_started=0 ORDER BY RANDOM() LIMIT 1'; 
     147  my ($collection, $test_run) = &getRecordSQL($db_path, $random_test_sql); 
     148 
     149  print &makeHeader('Running test: ' . $collection . '#' . $test_run, 78) . "\n\n"; 
     150  print " * Note: to gracefully exit create a file: <gsdlhome>/collect/exit.now\n"; 
     151  print ' * Started: ' . $now_string . "\n"; 
     152 
     153  my $found_result = 0; 
     154  do 
     155  { 
     156    my @results = &importWithStrace($collection); 
     157    # We don't allow negative results... just try again 
     158    if ($results[0] != $results[1]) 
     159    { 
     160      print "Warning! Failed to process all the documents found - rerunning test...\n"; 
     161    } 
     162    elsif (0 >= $results[1]) 
     163    { 
     164      print "Warning! Failed to process any documents - rerunning test...\n"; 
     165    } 
     166    # I'm not sure how this happens, but it seems on really fast imports they 
     167    # sometimes happen faster than the baseline import?!? 
     168    elsif ($results[5] < $results[2]) 
     169    { 
     170      print "Warning! Baseline import took longer than actual import - rerunning test...\n"; 
     171    } 
     172    elsif ($results[6] < $results[3]) 
     173    { 
     174      print "Warning! Spent more time on I/O syscalls in baseline than during import - rerunning test...\n"; 
     175    } 
     176    elsif ($results[7] < $results[4]) 
     177    { 
     178      print "Warning! Spent more time on non-I/O syscalls in baseline than during import - rerunning test...\n"; 
     179    } 
     180    # I'm also seeing a lot of negative percentages caused by the total delta 
     181    # time spend on io and other syscalls being more than the delta time 
     182    # between import and baseline durations 
     183    elsif (($results[5] - $results[2]) < (($results[6] - $results[3]) + ($results[7] - $results[4]))) 
     184    { 
     185      print "Warning! Time difference between baseline and import durations less than total time differences between io and misc system calls - rerunning test...\n"; 
     186    } 
     187    else 
     188    { 
     189      my $update_sql = 'UPDATE tests SET test_started=' . $test_started . ', '; 
     190      $update_sql .= 'docs_found=' . $results[0] . ', docs_processed=' . $results[1] . ', '; 
     191      $update_sql .= 'b_elapsed=' . sprintf('%0.6f', $results[2]) . ', '; 
     192      $update_sql .= 'b_io=' . sprintf('%0.6f', $results[3]) . ', '; 
     193      $update_sql .= 'b_other=' . sprintf('%0.6f', $results[4]) . ', '; 
     194      $update_sql .= 'i_elapsed=' . sprintf('%0.6f', $results[5]) . ', '; 
     195      $update_sql .= 'i_io=' . sprintf('%0.6f', $results[6]) . ', '; 
     196      $update_sql .= 'i_other=' . sprintf('%0.6f', $results[7]) . ' '; 
     197      $update_sql .= 'WHERE collection=\'' . $collection . '\' AND test_run=' . $test_run; 
     198      execSQL($db_path, $update_sql); 
     199      $found_result = 1; 
     200    } 
     201  } 
     202  while(0 == $found_result); 
     203 
     204  # Repeat until we have exhausted pending tests 
     205  $remaining_test_count = &getValueSQL($db_path, $remaining_test_sql); 
     206 
     207  my $x = $total_test_count - $remaining_test_count; 
     208  print &makeHeader('Test Complete: ' . $collection . '#' . $test_run, 78) . "\n\n"; 
     209  print ' * ' . sprintf("%.0f",(($x/$total_test_count)*100)) . "% of all tests complete!\n"; 
     210  print ' * ' . $remaining_test_count . " tests remaining\n"; 
     211  print "\n"; 
     212} 
     213 
     214if (-f $exit_file_path) 
     215{ 
     216  unlink($exit_file_path); 
     217} 
     218 
     219# 2. Complete! 
    147220my $end_time = [gettimeofday()]; 
    148221my $duration = tv_interval($start_time, $end_time); 
     
    151224exit; 
    152225 
     226################################################################################ 
     227 
     228## @function importWithStrace() 
     229# 
     230sub importWithStrace 
     231{ 
     232  my ($collection) = @_; 
     233  my $collection_path = $ENV{'GSDLHOME'} . '/collect/' . $collection; 
     234  if (!-d $collection_path) 
     235  { 
     236    &printError('Collection not found: ' . $collection_path); 
     237  } 
     238 
     239  # 2. Run dummy import command (empty import directory) within trace to 
     240  # determine baseline 
     241  &clearExistingDirectory($collection_path, 'archives', 'cached', 'logs', 'tmp'); 
     242  &flushDiskCache($flush_delay); 
     243  print " * Running baseline collection import... "; 
     244  my $dummy_dir = tempdir( CLEANUP => 1); 
     245  my $command1 = 'strace ' . $strace_flags . ' import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1'; 
     246  my $debug_path1 = ''; 
     247  if ($debug) 
     248  { 
     249    $debug_path1 = $collection_path . '/debug-baseline.tsv'; 
     250  } 
     251  my ($baseline_duration, $baseline_io, $baseline_misc) = &parseStrace($command1, $collection_path, $debug_path1); 
     252  print "Done\n"; 
     253  print '   - Duration:      ' . sprintf('%0.6f', $baseline_duration) . " seconds\n"; 
     254  print "   - System Calls Breakdown:\n"; 
     255  print '     - I/O Duration:  ' . sprintf('%0.6f', $baseline_io) . " seconds\n"; 
     256  print '     - I/O Percent:   ' . sprintf('%0.1f', ($baseline_io / $baseline_duration) * 100) . "%\n"; 
     257  print '     - Other Duratn:  ' . sprintf('%0.6f', $baseline_misc) . " seconds\n"; 
     258  print '     - Other Percnt:  ' . sprintf('%0.1f', ($baseline_misc / $baseline_duration) * 100) . "%\n"; 
     259  if ($debug) 
     260  { 
     261    print "   - See 'debug-baseline.tsv' for raw data\n"; 
     262  } 
     263  if (-d $dummy_dir) 
     264  { 
     265    rmdir($dummy_dir); 
     266  } 
     267 
     268  # 3. Run normal import command within strace reading output line by line 
     269  &clearExistingDirectory($collection_path, 'archives', 'cached', 'logs', 'tmp'); 
     270  &flushDiskCache($flush_delay); 
     271  print " * Running import and tracking I/O metrics... "; 
     272  my $command2 = 'strace ' . $strace_flags . ' import.pl -removeold "' . $collection . '" 2>&1'; 
     273  my $debug_path2 = ''; 
     274  if ($debug) 
     275  { 
     276    $debug_path2 = $collection_path . '/debug-import.tsv'; 
     277  } 
     278  my ($import_duration, $import_io, $import_misc, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2); 
     279  print "Done\n"; 
     280  print '   - Import?        ' . ( $import_complete ? 'Completed' : 'Failed') . "\n"; 
     281  print '   - Found:         ' . $docs_found . ' document' . (($docs_processed > 1) ? 's' : '') . "\n"; 
     282  print '   - Processed:     ' . $docs_processed . ' document' . (($docs_processed > 1) ? 's' : '') . "\n"; 
     283  print '   - Duration:      ' . sprintf('%0.6f', $import_duration) . " seconds\n"; 
     284  print "   - System Calls Breakdown:\n"; 
     285  print '     - I/O Duration:  ' . sprintf('%0.6f', $import_io) . " seconds\n"; 
     286  print '     - I/O Percent:   ' . sprintf('%0.1f', ($import_io / $import_duration) * 100) . "%\n"; 
     287  print '     - Other Duratn:  ' . sprintf('%0.6f', $import_misc) . " seconds\n"; 
     288  print '     - Other Percnt:  ' . sprintf('%0.1f', ($import_misc / $import_duration) * 100) . "%\n"; 
     289  print "   - See 'import.log' for Greenstone Import details\n"; 
     290  print "   - See 'strace.out' for STrace details\n"; 
     291  if ($debug) 
     292  { 
     293    print "   - See 'debug-import.tsv' for raw data\n"; 
     294  } 
     295  print "\n"; 
     296 
     297  # 4. Results 
     298  my $delta_duration = $import_duration - $baseline_duration; 
     299  my $delta_io = $import_io - $baseline_io; 
     300  my $delta_io_percent = ( $delta_io / $delta_duration ) * 100; 
     301  my $delta_misc = $import_misc - $baseline_misc; 
     302  my $delta_misc_percent = ( $delta_misc / $delta_duration ) * 100; 
     303  print &makeHeader('Import Results', 78) . "\n\n"; 
     304  print ' Import Duration:  ' . sprintf('%0.6f', $delta_duration) . " seconds\n"; 
     305  print " System Calls Breakdown:\n"; 
     306  print ' - I/O Duration:  ' . sprintf('%0.6f', $delta_io) . " seconds\n"; 
     307  print ' - I/O Percent:   ' . sprintf('%0.1f', $delta_io_percent) . "%\n"; 
     308  print ' - Other Duratn:  ' . sprintf('%0.6f', $delta_misc) . " seconds\n"; 
     309  print ' - Other Percnt:  ' . sprintf('%0.1f', $delta_misc_percent) . "%\n"; 
     310  print "\n"; 
     311 
     312  return ($docs_found, $docs_processed, $baseline_duration, $baseline_io, $baseline_misc, $import_duration, $import_io, $import_misc); 
     313} 
     314## importWithStrace() 
     315 
     316## @function makeHeader($msg, [$length]) 
     317# 
     318# Create a centered header string given a certain message padded with '=' characters. 
     319# 
     320# @param $msg The message to center as a string 
     321# @param $length The desired length of string - defaults to 79 
     322# @return A string centered with '=' as padding 
     323# 
     324sub makeHeader 
     325{ 
     326  my ($msg, $length) = @_; 
     327  if (!defined $length) 
     328  { 
     329    $length = 79; # 80 with newline 
     330  } 
     331  if (length($msg) % 2 == 0) 
     332  { 
     333    $msg .= ' '; 
     334  } 
     335  my $filler_length = ($length - length($msg)) / 2; 
     336  my $filler = '=' x $filler_length; 
     337  $msg = $filler . ' ' . $msg . ' ' . $filler; 
     338  return $msg; 
     339} 
     340## makeHeader() ## 
     341 
    153342 
    154343## @function 
    155 # 
    156 sub makeHeader 
    157 { 
    158   my ($msg, $length) = @_; 
    159   if (length($msg) % 2 == 1) 
    160   { 
    161     $msg = ' ' . $msg . '  '; 
    162   } 
    163   else 
    164   { 
    165     $msg = ' ' . $msg . ' '; 
    166   } 
    167   while (length($msg) < $length) 
    168   { 
    169     $msg = '=' . $msg . '='; 
    170   } 
    171   return $msg; 
    172 } 
    173 ## makeHeader() ## 
    174  
    175  
    176 ## @function 
    177 # 
    178 sub clearArchives 
    179 { 
    180   # 2. Remove any existing archives directory, so it doesn't get factored in the 
    181   # IO costs 
    182   my $archives_dir = $collection_path . '/archives'; 
    183   if (-d $archives_dir) 
    184   { 
    185     print " * Deleting existing archives directory... "; 
    186     remove_tree($archives_dir); 
    187     print "Done\n"; 
    188   } 
    189 } 
    190 ## clearArchives() 
    191  
     344#  Remove named Greenstone directory so it doesn't get factored in I/O costs  
     345# 
     346sub clearExistingDirectory 
     347{ 
     348  my $collection_path = shift(@_); 
     349  for my $dirname (@_) 
     350  { 
     351    my $dir = $collection_path . '/' . $dirname; 
     352    if (-d $dir) 
     353    { 
     354      print ' * Deleting existing ' . $dirname . ' directory... '; 
     355      remove_tree($dir); 
     356      print "Done\n"; 
     357    } 
     358  } 
     359} 
     360## clearExistingDirectory() 
    192361 
    193362## @function 
     
    229398    $flush_delay--; 
    230399  } 
    231   print "Done\n\n"; 
     400  print "Done\n"; 
    232401} 
    233402## flushDiskCache() 
     
    257426    my $gslog_path = $logs_path . '/import.log'; 
    258427    open(GSLOGOUT, '>:utf8', $gslog_path) or die("Error! Failed to open file for writing: " . $gslog_path); 
    259     $logging_enabled = 1; 
     428    #$logging_enabled = 1; 
    260429  } 
    261430 
     
    278447        print STRACEOUT $line; 
    279448      } 
     449 
     450      if ($line =~ /^[^\d\[]+(.*)/) 
     451      { 
     452        $line = $1; 
     453      } 
     454 
    280455      # we may have a line that unfortunately includes newlines in its arguments list 
    281456      # - watch out for unfinished syscalls that will be resumed later 
     
    337512        # limit the resumed calls duration to, at most, the time difference 
    338513        # between this syscall and the next. This is highly inaccurate, of 
    339         # course, as it includes time spent in userspace but is significantly 
    340         # better than a elasped duration several times longer than the syscall 
    341         # actually took. 
     514        # course, as it excludes time spent before the suspension but then 
     515        # includes time spent in userspace but is significantly better than a 
     516        # elasped duration several times longer than the syscall actually took. 
    342517        if ($next_line =~ /^.*?(\d+\.\d+)/) 
    343518        { 
     
    363538        if ($syscall eq 'write' && $args_prefix =~ /^[12],/) 
    364539        { 
    365           print "\n\nPending write pid:$pid line:|" . $line . "|\n"; 
     540          #print "\n\nPending write pid:$pid line:|" . $line . "|\n"; 
    366541          if (!defined $interrupted_stacks->{$pid}) 
    367542          { 
     
    379554      } 
    380555      # - exit_group never has a duration 
    381       elsif ($line =~ /^(\d+\.\d+)\s+exit_group\((\d+)\)\s+=\s+\?$/) 
     556      elsif ($line =~ /^(\d+\.\d+)\s+(_exit|exit_group)\((\d+)\)\s+=\s+\?$/) 
    382557      { 
    383558        my $timestamp = $1; 
    384         my $exit_value = $2; 
     559        my $exit_function = $2; 
     560        my $exit_value = $3; 
    385561        $end_timestamp = $timestamp; 
    386562        if ($debug) 
     
    388564          print TSVOUT sprintf("%0.6f", ($timestamp - $start_timestamp)) . "\t"; 
    389565          print TSVOUT $pid . "\t"; 
    390           print TSVOUT "exit_group\t"; 
     566          print TSVOUT $exit_function . "\t"; 
    391567          print TSVOUT "0.000000\t"; 
    392568          print TSVOUT sprintf("%0.6f", $misc_duration) . "\t"; 
     
    510686} 
    511687 
     688## @function 
     689# 
     690sub execSQL 
     691{ 
     692  my ($db_path, $sql) = @_; 
     693  # call getValueSQL but don't care about result 
     694  getValueSQL($db_path, $sql); 
     695} 
     696# /** execSQL() **/ 
     697 
     698 
     699## @function 
     700# 
     701sub getRecordSQL 
     702{ 
     703  my ($db_path, $sql) = @_; 
     704  if ($sql !~ /LIMIT 1/i) 
     705  { 
     706    $sql .= ' LIMIT 1'; 
     707  } 
     708  my $value = getValueSQL($db_path, $sql); 
     709  return split(/\|/,$value); 
     710} 
     711# /** getRecordSQL() **/ 
     712 
     713 
     714## @function 
     715# 
     716sub getRecordsSQL 
     717{ 
     718  my ($db_path, $sql) = @_; 
     719  my @records; 
     720  my $raw_values = getValueSQL($db_path, $sql); 
     721  foreach my $raw_record (split(/\r?\n/, $raw_values)) 
     722  { 
     723    my @record = split(/\|/, $raw_record); 
     724    push(@records, \@record); 
     725  } 
     726  return @records; 
     727} 
     728## getRecordsSQL() 
     729 
     730 
     731## @function 
     732# 
     733sub getValueSQL 
     734{ 
     735  my ($db_path, $sql) = @_; 
     736  my $result = `sqlite3 "$db_path" "$sql" 2>&1`; 
     737  if ($result =~ /Error:/) 
     738  { 
     739    die("Fatal Error!\nSQL:" . $sql . "\nMsg:" . $result); 
     740  } 
     741  # trim 
     742  $result =~ s/^\s*|\s*$//g; 
     743  return $result; 
     744} 
     745# /** getValueSQL() **/ 
     746 
     747 
     748## @function printReport() 
     749# 
     750sub printReport 
     751{ 
     752  my ($db_path) = @_; 
     753  # get listing of unique collection names sorted alphabetically.  While we are 
     754  # at it, grab the number of documents processed from pretty much any entry 
     755  # for this collection (they should all be the same, otherwise they would've 
     756  # been repeated back in testing) 
     757  my $collection_sql = 'SELECT collection, AVG(docs_processed) FROM tests GROUP BY collection'; 
     758  my @collection_records = &getRecordsSQL($db_path, $collection_sql); 
     759  foreach my $collection_record (@collection_records) 
     760  { 
     761    my ($collection, $docs_processed) = @{$collection_record}; 
     762    # determine the size in bytes of the import directory 
     763    my $collection_import_directory = $ENV{'GSDLHOME'} . '/collect/' . $collection . '/import'; 
     764    my $du_command = 'du -bsL "' . $collection_import_directory . '"'; 
     765    my $du_result = `$du_command`; 
     766    my $size_in_bytes = 0; 
     767    if ($du_result =~ /^(\d+)\s+/) 
     768    { 
     769      $size_in_bytes = $1; 
     770    } 
     771    # pretty print header block for dokuwiki 
     772    &printReportHeader($collection, $docs_processed, $size_in_bytes); 
     773    my @test_records = &getRecordsSQL($db_path, 'SELECT * FROM tests WHERE collection=\'' . $collection . '\' AND test_started > 0 ORDER BY test_started'); 
     774    my $number_of_tests = scalar(@test_records); 
     775    my $total_b_elapsed = 0; 
     776    my @b_elapsed_values; 
     777    my $total_b_io_percent = 0; 
     778    my @b_io_percent_values; 
     779    my $total_b_other_percent = 0; 
     780    my @b_other_percent_values; 
     781    my $total_d_elapsed = 0; 
     782    my @d_elapsed_values; 
     783    my $total_d_io_percent = 0; 
     784    my @d_io_percent_values; 
     785    my $total_d_other_percent = 0; 
     786    my @d_other_percent_values; 
     787    foreach my $test_record (@test_records) 
     788    { 
     789      my ($the_collection, $the_test_run, $test_started, $docs_found, $docs_processed, $b_elapsed, $b_io, $b_other, $i_elapsed, $i_io, $i_other) = @{$test_record}; 
     790      my $b_io_percent = ($b_io / $b_elapsed) * 100; 
     791      my $b_other_percent = ($b_other / $b_elapsed) * 100; 
     792      my $d_elapsed = $i_elapsed - $b_elapsed; 
     793      my $d_io = $i_io - $b_io; 
     794      my $d_io_percent = ($d_io / $d_elapsed) * 100; 
     795      my $d_other = $i_other - $b_other; 
     796      my $d_other_percent = ($d_other / $d_elapsed) * 100; 
     797      # add to running totals 
     798      $total_b_elapsed += $b_elapsed; 
     799      push(@b_elapsed_values, $b_elapsed); 
     800      $total_b_io_percent += $b_io_percent; 
     801      push(@b_io_percent_values, $b_io_percent); 
     802      $total_b_other_percent += $b_other_percent; 
     803      push(@b_other_percent_values, $b_other_percent); 
     804      $total_d_elapsed += $d_elapsed; 
     805      push(@d_elapsed_values, $d_elapsed); 
     806      $total_d_io_percent += $d_io_percent; 
     807      push(@d_io_percent_values, $d_io_percent); 
     808      $total_d_other_percent += $d_other_percent; 
     809      push(@d_other_percent_values, $d_other_percent); 
     810    } 
     811    # Averages 
     812    my $b_elapsed_avg = $total_b_elapsed / $number_of_tests; 
     813    my $b_io_percent_avg = $total_b_io_percent / $number_of_tests; 
     814    my $b_other_percent_avg = $total_b_other_percent / $number_of_tests; 
     815    my $d_elapsed_avg = $total_d_elapsed / $number_of_tests; 
     816    my $d_io_percent_avg = $total_d_io_percent / $number_of_tests; 
     817    my $d_other_percent_avg = $total_d_other_percent / $number_of_tests; 
     818    # Standard Deviations 
     819    my $b_elapsed_stddev = &calculateStandardDeviation($number_of_tests, $b_elapsed_avg, @b_elapsed_values); 
     820    my $b_io_percent_stddev = &calculateStandardDeviation($number_of_tests, $b_io_percent_avg, @b_io_percent_values); 
     821    my $b_other_percent_stddev = &calculateStandardDeviation($number_of_tests, $b_other_percent_avg, @b_other_percent_values); 
     822    my $d_elapsed_stddev = &calculateStandardDeviation($number_of_tests, $d_elapsed_avg, @d_elapsed_values); 
     823    my $d_io_percent_stddev = &calculateStandardDeviation($number_of_tests, $d_io_percent_avg, @d_io_percent_values); 
     824    my $d_other_percent_stddev = &calculateStandardDeviation($number_of_tests, $d_other_percent_avg, @d_other_percent_values); 
     825 
     826    my $counter = 0; 
     827    my $outlier_count = 0; 
     828    foreach my $test_record (@test_records) 
     829    { 
     830      my ($the_collection, $the_test_run, $test_started, $docs_found, $docs_processed, $b_elapsed, $b_io, $b_other, $i_elapsed, $i_io, $i_other) = @{$test_record}; 
     831      $counter++; 
     832      my $b_io_percent = ($b_io / $b_elapsed) * 100; 
     833      my $b_other_percent = ($b_other / $b_elapsed) * 100; 
     834      my $d_elapsed = $i_elapsed - $b_elapsed; 
     835      my $d_io = $i_io - $b_io; 
     836      my $d_io_percent = ($d_io / $d_elapsed) * 100; 
     837      my $d_other = $i_other - $b_other; 
     838      my $d_other_percent = ($d_other / $d_elapsed) * 100; 
     839      print '|  ' . $counter . ' |  ' . $the_collection . '  |  ' . $docs_processed . ' |  ' . $size_in_bytes . ' |  ' . $test_started . ' |  ' . sprintf('%0.6f', $b_elapsed) . ' |  ' . sprintf('%0.1f', $b_io_percent) . '% |  ' . sprintf('%0.1f', $b_other_percent) . '% |  ' . sprintf('%0.6f', $d_elapsed) . ' |  ' . sprintf('%0.1f', $d_io_percent) . '% |  ' . sprintf('%0.1f', $d_other_percent) . '% |  '; 
     840      if (&isOutlier($d_io_percent, $d_io_percent_avg, $d_io_percent_stddev)) 
     841      { 
     842        print 'Yes'; 
     843        $outlier_count++; 
     844      } 
     845      else 
     846      { 
     847        print 'No'; 
     848      } 
     849      print "  |\n"; 
     850    } 
     851    print '^  Average |||||  ' . sprintf('%0.6f', $b_elapsed_avg) . ' |  ' . sprintf('%0.1f', $b_io_percent_avg) . '% |  ' . sprintf('%0.1f', $b_other_percent_avg) . '% |  ' .  sprintf('%0.6f', $d_elapsed_avg) . ' |  ' . sprintf('%0.1f', $d_io_percent_avg) . '% |  ' . sprintf('%0.1f', $d_other_percent_avg) . "% |\n"; 
     852    print '^  Standard Deviation |||||  ' . sprintf('%0.6f', $b_elapsed_stddev) . ' |  ' . sprintf('%0.1f', $b_io_percent_stddev) . '% |  ' . sprintf('%0.1f', $b_other_percent_stddev) . '% |  ' . sprintf('%0.6f', $d_elapsed_stddev) . ' |  ' . sprintf('%0.1f', $d_io_percent_stddev) . '% |  ' . sprintf('%0.1f', $d_other_percent_stddev) . '% |  ' . $outlier_count . " |\n"; 
     853    print "\n"; 
     854  } 
     855} 
     856## printReport() 
     857 
     858 
     859## @function printReportHeader($collection) 
     860# 
     861sub printReportHeader 
     862{ 
     863  my ($collection) = @_; 
     864  print '==== "' . $collection . "\" Collection ====\n\n"; 
     865  print "^ Count  ^ Collection  ^ NumDocs  ^ Size  ^ Timestamp  ^  Base                    ^^^  Import                 ^^^ Outlier?  ^\n"; 
     866  print "^ :::    ^ :::         ^ :::      ^ :::   ^ :::        ^ Elapsed  ^  Syscalls      ^^ Elapsed  ^  Syscalls     ^^ :::       ^\n"; 
     867  print "^ :::    ^ :::         ^ :::      ^ :::   ^ :::        ^ :::      ^ I/O%  ^ Other%  ^ :::      ^ I/O%  ^ Other  ^ :::       ^\n"; 
     868} 
     869## printReportHeader() 
     870 
     871 
     872## @function calculateStandardDeviation($mean, $value1 ... $valueN) 
     873#  Our sample size is the population so we can use the simplier standard deviation equation for an entire population (now, theres some rap lyrics) 
     874#  stddev = squareroot ( sum ( square( value - mean ) ) / count ) 
     875sub calculateStandardDeviation 
     876{ 
     877  my $population = shift(@_); 
     878  my $mean = shift(@_); 
     879  my $count = scalar(@_); 
     880  my $total = 0; 
     881  # sum of squares 
     882  foreach my $value (@_) 
     883  { 
     884    my $subtotal = $value - $mean; 
     885    $total += $subtotal * $subtotal; 
     886  } 
     887  if ($population == $count) 
     888  { 
     889    $total = $total / $count; 
     890  } 
     891  else 
     892  { 
     893    $total = $total / ($count - 1); 
     894  } 
     895  my $standard_deviation = sqrt($total); 
     896  return $standard_deviation; 
     897} 
     898## calculateStandardDeviation() 
     899 
     900 
     901## @function isOutlier($value, $mean, $stddev) 
     902# 
     903sub isOutlier 
     904{ 
     905  my ($value, $mean, $stddev) = @_; 
     906  my $min = $mean - (2 * $stddev); 
     907  my $max = $mean + (2 * $stddev); 
     908  return ($value <= $min || $max <= $value); 
     909} 
     910## isOutlier() 
     911 
    5129121;