Ignore:
Timestamp:
2013-12-18T10:21:53+13:00 (10 years ago)
Author:
jmt12
Message:

Drastically increased the script to allow 1) battery of imports backed by database of tests, 2) printing of reported in dokuwiki format, and 3) calculation of statistical information (STDDEV) and outliers (although I don't do anything with that information... yet)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/bin/script/import_with_io_metric.pl

    r28665 r28767  
    55use warnings;
    66# Libraries
    7 use File::Path qw( remove_tree );
    8 use File::Temp qw( tempdir );
     7use File::Path   qw( remove_tree );
     8use File::Temp   qw( tempdir );
    99use Getopt::Long;
    10 use Time::HiRes qw( gettimeofday tv_interval );
     10use POSIX        qw( strftime );
     11use Time::HiRes  qw( gettimeofday tv_interval );
    1112
    1213BEGIN
    1314{
    14   print "======================= Greenstone Import + I/O Metrics ======================\n\n";
     15  print "\n======================= Greenstone Import + I/O Metrics ======================\n\n";
    1516  if (!defined $ENV{'GSDLHOME'})
    1617  {
     
    1920  }
    2021}
     22
     23print "Prepare and run a number of collection imports while recording I/O metrics.\n\n";
    2124
    2225# 1. Parse and sanitize arguments - the listing of syscalls whose duration we
     
    2427# strace_analyzer.pl would use
    2528print " * Initializing... Done\n";
     29my $machine_name = `hostname -s`;
     30chomp($machine_name);
     31$machine_name = ucfirst($machine_name);
     32my $os_name = `lsb_release -i`;
     33$os_name =~ s/^Distributor ID:\s+(.*)\r?\n$/$1/i;
     34my $fs_name = `df -T $ENV{'GSDLHOME'}`;
     35$fs_name =~ s/^.*(ext2|ext3|ext4|xfs|zfs).*$/$1/is;
     36$fs_name = uc($fs_name);
     37my $start_time = [gettimeofday()];
     38my @collections;
     39my $print_report = 0;
    2640our $strace_flags = '-f -q -s 256 -T -ttt';
    2741our $io_function_list = {
     
    4054                         'ftell'=>1,
    4155                         'getdents'=>1,
    42                          #'ioctl'=>1, # John added
     56                         'ioctl'=>1, # John added
    4357                         'llseek'=>1,
    4458                         'lockf'=>1,
     
    5973                         'write'=>1
    6074                        };
    61 my $start_time = [gettimeofday()];
    62 my $collection = '';
    63 my $debug = 0;
    64 my $flush_delay = 3;
    65 
    66 GetOptions ('collection=s' => \$collection,
     75our $debug = 0;
     76our $flush_delay = 3;
     77our $test_runs = 9;
     78
     79GetOptions ('collection=s' => \@collections,
    6780            'debug' => \$debug,
    68             'flushdelay=i' => \$flush_delay)
     81            'flushdelay=i' => \$flush_delay,
     82            'runs=i' => \$test_runs,
     83            'report' => \$print_report)
    6984or &printUsage('Invalid or missing argument');
    7085
    71 if (!defined $collection || $collection eq '')
    72 {
    73   &printUsage('Missing collection name');
    74 }
    75 my $collection_path = $ENV{'GSDLHOME'} . '/collect/' . $collection;
    76 if (!-d $collection_path)
    77 {
    78   &printUsage('Collection not found: ' . $collection_path);
    79 }
    80 print '   - Greenstone:  ' . $ENV{'GSDLHOME'} . "\n";
    81 print '   - Collection:  ' . $collection . "\n";
    82 print '   - Flush Delay: ' . $flush_delay . "\n";
    83 print '   - Debug?       ' . ($debug ? 'Yes' : 'No') . "\n";
     86if ($test_runs < 1)
     87{
     88  &printUsage('Test runs must be non-zeo');
     89}
     90my $db_name = 'strace_' . $machine_name . '_' . $os_name . '_' . $fs_name . '.sqlite3db';
     91my $db_path = $ENV{'GSDLHOME'} . '/collect/' . $db_name;
     92
     93if (0 < $print_report)
     94{
     95  &printReport($db_path);
     96  exit;
     97}
     98
     99if (0 == scalar(@collections))
     100{
     101  &printUsage('Missing collection or collections');
     102}
     103
     104print '   - Greenstone:   ' . $ENV{'GSDLHOME'} . "\n";
     105print '   - Collections:  ' . join(',', @collections) . "\n";
     106print '   - Test Runs:    ' . $test_runs . "\n";
     107print '   - Flush Delay:  ' . $flush_delay . "\n";
     108print '   - Debug?        ' . ($debug ? 'Yes' : 'No') . "\n";
    84109print "\n";
    85110
    86 # 2. Run dummy import command (empty import directory) within trace to
    87 # determine baseline
    88 &clearArchives();
    89 &flushDiskCache($flush_delay);
    90 print " * Running baseline collection import... ";
    91 my $dummy_dir = tempdir( CLEANUP => 1);
    92 my $command1 = 'strace ' . $strace_flags . ' import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1';
    93 my $debug_path1 = '';
    94 if ($debug)
    95 {
    96   $debug_path1 = $collection_path . '/debug-baseline.tsv';
    97 }
    98 my ($baseline_duration, $baseline_io, $baseline_misc) = &parseStrace($command1, $collection_path, $debug_path1);
    99 print "Done\n";
    100 print '   - Duration:      ' . sprintf('%0.6f', $baseline_duration) . " seconds\n";
    101 print '   - SysCall Time:  ' . sprintf('%0.6f', $baseline_io + $baseline_misc) . " seconds\n";
    102 print '   - SysCall Prct:  ' . sprintf('%d', (($baseline_io + $baseline_misc) / $baseline_duration) * 100) . "%\n";
    103 print '   - I/O Time:      ' . sprintf('%0.6f', $baseline_io) . " seconds\n";
    104 print '   - I/O Percent:   ' . sprintf('%d', ($baseline_io / $baseline_duration) * 100) . "%\n";
    105 if ($debug)
    106 {
    107   print "   - See 'debug-baseline.tsv' for raw data\n";
    108 }
     111# 0. Create database a populate with tests (if necessary)
     112if (!-f $db_path)
     113{
     114  print STDOUT ' * Creating database tables... ';
     115  execSQL($db_path, 'CREATE TABLE IF NOT EXISTS tests (collection TEXT, test_run INTEGER, test_started INTEGER DEFAULT 0, docs_found INTEGER, docs_processed INTEGER, b_elapsed REAL DEFAULT 0, b_io REAL DEFAULT 0, b_other REAL DEFAULT 0, i_elapsed REAL DEFAULT 0, i_io REAL DEFAULT 0, i_other REAL DEFAULT 0, PRIMARY KEY (collection, test_run))');
     116  print STDOUT "Done\n";
     117}
     118print ' * Populating database with test runs as necessary... ';
     119my $new_test_count = 0;
     120foreach my $collection (@collections)
     121{
     122  for (my $i = 1; $i <= $test_runs; $i++)
     123  {
     124    my $search_sql = "SELECT COUNT(*) FROM tests WHERE collection='" . $collection . "' AND test_run=" . $i;
     125    my $test_count = &getValueSQL($db_path, $search_sql);
     126    if (0 == $test_count)
     127    {
     128      $new_test_count++;
     129      my $insert_sql = "INSERT INTO tests (collection, test_run) VALUES ('" . $collection . "'," . $i . ")";
     130      &execSQL($db_path, $insert_sql);
     131    }
     132  }
     133}
     134print $new_test_count . " tests added\n";
    109135print "\n";
    110136
    111 # 3. Run normal import command within strace reading output line by line
    112 &clearArchives();
    113 &flushDiskCache($flush_delay);
    114 print " * Running import and tracking I/O metrics... ";
    115 my $command2 = 'strace ' . $strace_flags . ' import.pl -removeold "' . $collection . '" 2>&1';
    116 my $debug_path2 = '';
    117 if ($debug)
    118 {
    119   $debug_path2 = $collection_path . '/debug-import.tsv';
    120 }
    121 my ($duration_import, $duration_io, $duration_misc, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2);
    122 print "Done\n";
    123 print '   - Import?        ' . ( $import_complete ? 'Completed' : 'Failed') . "\n";
    124 print '   - Found:         ' . $docs_found . ' document' . (($docs_processed > 1) ? 's' : '') . "\n";
    125 print '   - Processed:     ' . $docs_processed . ' document' . (($docs_processed > 1) ? 's' : '') . "\n";
    126 print '   - Duration:      ' . sprintf('%0.6f', $duration_import) . " seconds\n";
    127 print '   - SysCall Time:  ' . sprintf('%0.6f', $duration_io + $duration_misc) . " seconds\n";
    128 print '   - SysCall Prct:  ' . sprintf('%d', (($duration_io + $duration_misc) / $duration_import) * 100) . "%\n";
    129 print '   - I/O Duration:  ' . sprintf('%0.6f', $duration_io) . " seconds\n";
    130 print '   - I/O Percent:   ' . sprintf('%d', ($duration_io / $duration_import) * 100) . "%\n";
    131 print "   - See 'import.log' for Greenstone Import details\n";
    132 print "   - See 'strace.out' for STrace details\n";
    133 if ($debug)
    134 {
    135   print "   - See 'debug-import.tsv' for raw data\n";
    136 }
    137 print "\n";
    138 
    139 # 4. Results
    140 print &makeHeader('Import Results', 78) . "\n\n";
    141 print " Import Duration:  " . sprintf('%0.6f', ($duration_import - $baseline_duration)) . " seconds\n";
    142 print " I/O Duration:     " . sprintf('%0.6f', ($duration_io - $baseline_io)) . " seconds\n";
    143 print " I/O Percentage:   " . sprintf('%d', ((($duration_io - $baseline_io)/($duration_import - $baseline_duration)) * 100)) . "%\n";
    144 print "\n";
    145 
    146 # Complete!
     137# 1. Continue picking random tests and running them until there are none left
     138my $total_test_count = &getValueSQL($db_path, 'SELECT COUNT(*) FROM tests');
     139my $remaining_test_sql = 'SELECT COUNT(*) FROM tests WHERE test_started=0';
     140my $remaining_test_count = getValueSQL($db_path, $remaining_test_sql);
     141my $exit_file_path = $ENV{'GSDLHOME'} . '/collect/exit.now';
     142while ($remaining_test_count ne "0" && !-f $exit_file_path)
     143{
     144  my $test_started = time;
     145  my $now_string = strftime "%a %b %e %H:%M:%S %Y", localtime($test_started);
     146  my $random_test_sql = 'SELECT collection, test_run FROM tests WHERE test_started=0 ORDER BY RANDOM() LIMIT 1';
     147  my ($collection, $test_run) = &getRecordSQL($db_path, $random_test_sql);
     148
     149  print &makeHeader('Running test: ' . $collection . '#' . $test_run, 78) . "\n\n";
     150  print " * Note: to gracefully exit create a file: <gsdlhome>/collect/exit.now\n";
     151  print ' * Started: ' . $now_string . "\n";
     152
     153  my $found_result = 0;
     154  do
     155  {
     156    my @results = &importWithStrace($collection);
     157    # We don't allow negative results... just try again
     158    if ($results[0] != $results[1])
     159    {
     160      print "Warning! Failed to process all the documents found - rerunning test...\n";
     161    }
     162    elsif (0 >= $results[1])
     163    {
     164      print "Warning! Failed to process any documents - rerunning test...\n";
     165    }
     166    # I'm not sure how this happens, but it seems on really fast imports they
     167    # sometimes happen faster than the baseline import?!?
     168    elsif ($results[5] < $results[2])
     169    {
     170      print "Warning! Baseline import took longer than actual import - rerunning test...\n";
     171    }
     172    elsif ($results[6] < $results[3])
     173    {
     174      print "Warning! Spent more time on I/O syscalls in baseline than during import - rerunning test...\n";
     175    }
     176    elsif ($results[7] < $results[4])
     177    {
     178      print "Warning! Spent more time on non-I/O syscalls in baseline than during import - rerunning test...\n";
     179    }
     180    # I'm also seeing a lot of negative percentages caused by the total delta
     181    # time spend on io and other syscalls being more than the delta time
     182    # between import and baseline durations
     183    elsif (($results[5] - $results[2]) < (($results[6] - $results[3]) + ($results[7] - $results[4])))
     184    {
     185      print "Warning! Time difference between baseline and import durations less than total time differences between io and misc system calls - rerunning test...\n";
     186    }
     187    else
     188    {
     189      my $update_sql = 'UPDATE tests SET test_started=' . $test_started . ', ';
     190      $update_sql .= 'docs_found=' . $results[0] . ', docs_processed=' . $results[1] . ', ';
     191      $update_sql .= 'b_elapsed=' . sprintf('%0.6f', $results[2]) . ', ';
     192      $update_sql .= 'b_io=' . sprintf('%0.6f', $results[3]) . ', ';
     193      $update_sql .= 'b_other=' . sprintf('%0.6f', $results[4]) . ', ';
     194      $update_sql .= 'i_elapsed=' . sprintf('%0.6f', $results[5]) . ', ';
     195      $update_sql .= 'i_io=' . sprintf('%0.6f', $results[6]) . ', ';
     196      $update_sql .= 'i_other=' . sprintf('%0.6f', $results[7]) . ' ';
     197      $update_sql .= 'WHERE collection=\'' . $collection . '\' AND test_run=' . $test_run;
     198      execSQL($db_path, $update_sql);
     199      $found_result = 1;
     200    }
     201  }
     202  while(0 == $found_result);
     203
     204  # Repeat until we have exhausted pending tests
     205  $remaining_test_count = &getValueSQL($db_path, $remaining_test_sql);
     206
     207  my $x = $total_test_count - $remaining_test_count;
     208  print &makeHeader('Test Complete: ' . $collection . '#' . $test_run, 78) . "\n\n";
     209  print ' * ' . sprintf("%.0f",(($x/$total_test_count)*100)) . "% of all tests complete!\n";
     210  print ' * ' . $remaining_test_count . " tests remaining\n";
     211  print "\n";
     212}
     213
     214if (-f $exit_file_path)
     215{
     216  unlink($exit_file_path);
     217}
     218
     219# 2. Complete!
    147220my $end_time = [gettimeofday()];
    148221my $duration = tv_interval($start_time, $end_time);
     
    151224exit;
    152225
     226################################################################################
     227
     228## @function importWithStrace()
     229#
     230sub importWithStrace
     231{
     232  my ($collection) = @_;
     233  my $collection_path = $ENV{'GSDLHOME'} . '/collect/' . $collection;
     234  if (!-d $collection_path)
     235  {
     236    &printError('Collection not found: ' . $collection_path);
     237  }
     238
     239  # 2. Run dummy import command (empty import directory) within trace to
     240  # determine baseline
     241  &clearExistingDirectory($collection_path, 'archives', 'cached', 'logs', 'tmp');
     242  &flushDiskCache($flush_delay);
     243  print " * Running baseline collection import... ";
     244  my $dummy_dir = tempdir( CLEANUP => 1);
     245  my $command1 = 'strace ' . $strace_flags . ' import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1';
     246  my $debug_path1 = '';
     247  if ($debug)
     248  {
     249    $debug_path1 = $collection_path . '/debug-baseline.tsv';
     250  }
     251  my ($baseline_duration, $baseline_io, $baseline_misc) = &parseStrace($command1, $collection_path, $debug_path1);
     252  print "Done\n";
     253  print '   - Duration:      ' . sprintf('%0.6f', $baseline_duration) . " seconds\n";
     254  print "   - System Calls Breakdown:\n";
     255  print '     - I/O Duration:  ' . sprintf('%0.6f', $baseline_io) . " seconds\n";
     256  print '     - I/O Percent:   ' . sprintf('%0.1f', ($baseline_io / $baseline_duration) * 100) . "%\n";
     257  print '     - Other Duratn:  ' . sprintf('%0.6f', $baseline_misc) . " seconds\n";
     258  print '     - Other Percnt:  ' . sprintf('%0.1f', ($baseline_misc / $baseline_duration) * 100) . "%\n";
     259  if ($debug)
     260  {
     261    print "   - See 'debug-baseline.tsv' for raw data\n";
     262  }
     263  if (-d $dummy_dir)
     264  {
     265    rmdir($dummy_dir);
     266  }
     267
     268  # 3. Run normal import command within strace reading output line by line
     269  &clearExistingDirectory($collection_path, 'archives', 'cached', 'logs', 'tmp');
     270  &flushDiskCache($flush_delay);
     271  print " * Running import and tracking I/O metrics... ";
     272  my $command2 = 'strace ' . $strace_flags . ' import.pl -removeold "' . $collection . '" 2>&1';
     273  my $debug_path2 = '';
     274  if ($debug)
     275  {
     276    $debug_path2 = $collection_path . '/debug-import.tsv';
     277  }
     278  my ($import_duration, $import_io, $import_misc, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2);
     279  print "Done\n";
     280  print '   - Import?        ' . ( $import_complete ? 'Completed' : 'Failed') . "\n";
     281  print '   - Found:         ' . $docs_found . ' document' . (($docs_processed > 1) ? 's' : '') . "\n";
     282  print '   - Processed:     ' . $docs_processed . ' document' . (($docs_processed > 1) ? 's' : '') . "\n";
     283  print '   - Duration:      ' . sprintf('%0.6f', $import_duration) . " seconds\n";
     284  print "   - System Calls Breakdown:\n";
     285  print '     - I/O Duration:  ' . sprintf('%0.6f', $import_io) . " seconds\n";
     286  print '     - I/O Percent:   ' . sprintf('%0.1f', ($import_io / $import_duration) * 100) . "%\n";
     287  print '     - Other Duratn:  ' . sprintf('%0.6f', $import_misc) . " seconds\n";
     288  print '     - Other Percnt:  ' . sprintf('%0.1f', ($import_misc / $import_duration) * 100) . "%\n";
     289  print "   - See 'import.log' for Greenstone Import details\n";
     290  print "   - See 'strace.out' for STrace details\n";
     291  if ($debug)
     292  {
     293    print "   - See 'debug-import.tsv' for raw data\n";
     294  }
     295  print "\n";
     296
     297  # 4. Results
     298  my $delta_duration = $import_duration - $baseline_duration;
     299  my $delta_io = $import_io - $baseline_io;
     300  my $delta_io_percent = ( $delta_io / $delta_duration ) * 100;
     301  my $delta_misc = $import_misc - $baseline_misc;
     302  my $delta_misc_percent = ( $delta_misc / $delta_duration ) * 100;
     303  print &makeHeader('Import Results', 78) . "\n\n";
     304  print ' Import Duration:  ' . sprintf('%0.6f', $delta_duration) . " seconds\n";
     305  print " System Calls Breakdown:\n";
     306  print ' - I/O Duration:  ' . sprintf('%0.6f', $delta_io) . " seconds\n";
     307  print ' - I/O Percent:   ' . sprintf('%0.1f', $delta_io_percent) . "%\n";
     308  print ' - Other Duratn:  ' . sprintf('%0.6f', $delta_misc) . " seconds\n";
     309  print ' - Other Percnt:  ' . sprintf('%0.1f', $delta_misc_percent) . "%\n";
     310  print "\n";
     311
     312  return ($docs_found, $docs_processed, $baseline_duration, $baseline_io, $baseline_misc, $import_duration, $import_io, $import_misc);
     313}
     314## importWithStrace()
     315
     316## @function makeHeader($msg, [$length])
     317#
     318# Create a centered header string given a certain message padded with '=' characters.
     319#
     320# @param $msg The message to center as a string
     321# @param $length The desired length of string - defaults to 79
     322# @return A string centered with '=' as padding
     323#
     324sub makeHeader
     325{
     326  my ($msg, $length) = @_;
     327  if (!defined $length)
     328  {
     329    $length = 79; # 80 with newline
     330  }
     331  if (length($msg) % 2 == 0)
     332  {
     333    $msg .= ' ';
     334  }
     335  my $filler_length = ($length - length($msg)) / 2;
     336  my $filler = '=' x $filler_length;
     337  $msg = $filler . ' ' . $msg . ' ' . $filler;
     338  return $msg;
     339}
     340## makeHeader() ##
     341
    153342
    154343## @function
    155 #
    156 sub makeHeader
    157 {
    158   my ($msg, $length) = @_;
    159   if (length($msg) % 2 == 1)
    160   {
    161     $msg = ' ' . $msg . '  ';
    162   }
    163   else
    164   {
    165     $msg = ' ' . $msg . ' ';
    166   }
    167   while (length($msg) < $length)
    168   {
    169     $msg = '=' . $msg . '=';
    170   }
    171   return $msg;
    172 }
    173 ## makeHeader() ##
    174 
    175 
    176 ## @function
    177 #
    178 sub clearArchives
    179 {
    180   # 2. Remove any existing archives directory, so it doesn't get factored in the
    181   # IO costs
    182   my $archives_dir = $collection_path . '/archives';
    183   if (-d $archives_dir)
    184   {
    185     print " * Deleting existing archives directory... ";
    186     remove_tree($archives_dir);
    187     print "Done\n";
    188   }
    189 }
    190 ## clearArchives()
    191 
     344#  Remove named Greenstone directory so it doesn't get factored in I/O costs
     345#
     346sub clearExistingDirectory
     347{
     348  my $collection_path = shift(@_);
     349  for my $dirname (@_)
     350  {
     351    my $dir = $collection_path . '/' . $dirname;
     352    if (-d $dir)
     353    {
     354      print ' * Deleting existing ' . $dirname . ' directory... ';
     355      remove_tree($dir);
     356      print "Done\n";
     357    }
     358  }
     359}
     360## clearExistingDirectory()
    192361
    193362## @function
     
    229398    $flush_delay--;
    230399  }
    231   print "Done\n\n";
     400  print "Done\n";
    232401}
    233402## flushDiskCache()
     
    257426    my $gslog_path = $logs_path . '/import.log';
    258427    open(GSLOGOUT, '>:utf8', $gslog_path) or die("Error! Failed to open file for writing: " . $gslog_path);
    259     $logging_enabled = 1;
     428    #$logging_enabled = 1;
    260429  }
    261430
     
    278447        print STRACEOUT $line;
    279448      }
     449
     450      if ($line =~ /^[^\d\[]+(.*)/)
     451      {
     452        $line = $1;
     453      }
     454
    280455      # we may have a line that unfortunately includes newlines in its arguments list
    281456      # - watch out for unfinished syscalls that will be resumed later
     
    337512        # limit the resumed calls duration to, at most, the time difference
    338513        # between this syscall and the next. This is highly inaccurate, of
    339         # course, as it includes time spent in userspace but is significantly
    340         # better than a elasped duration several times longer than the syscall
    341         # actually took.
     514        # course, as it excludes time spent before the suspension but then
     515        # includes time spent in userspace but is significantly better than a
     516        # elasped duration several times longer than the syscall actually took.
    342517        if ($next_line =~ /^.*?(\d+\.\d+)/)
    343518        {
     
    363538        if ($syscall eq 'write' && $args_prefix =~ /^[12],/)
    364539        {
    365           print "\n\nPending write pid:$pid line:|" . $line . "|\n";
     540          #print "\n\nPending write pid:$pid line:|" . $line . "|\n";
    366541          if (!defined $interrupted_stacks->{$pid})
    367542          {
     
    379554      }
    380555      # - exit_group never has a duration
    381       elsif ($line =~ /^(\d+\.\d+)\s+exit_group\((\d+)\)\s+=\s+\?$/)
     556      elsif ($line =~ /^(\d+\.\d+)\s+(_exit|exit_group)\((\d+)\)\s+=\s+\?$/)
    382557      {
    383558        my $timestamp = $1;
    384         my $exit_value = $2;
     559        my $exit_function = $2;
     560        my $exit_value = $3;
    385561        $end_timestamp = $timestamp;
    386562        if ($debug)
     
    388564          print TSVOUT sprintf("%0.6f", ($timestamp - $start_timestamp)) . "\t";
    389565          print TSVOUT $pid . "\t";
    390           print TSVOUT "exit_group\t";
     566          print TSVOUT $exit_function . "\t";
    391567          print TSVOUT "0.000000\t";
    392568          print TSVOUT sprintf("%0.6f", $misc_duration) . "\t";
     
    510686}
    511687
     688## @function
     689#
     690sub execSQL
     691{
     692  my ($db_path, $sql) = @_;
     693  # call getValueSQL but don't care about result
     694  getValueSQL($db_path, $sql);
     695}
     696# /** execSQL() **/
     697
     698
     699## @function
     700#
     701sub getRecordSQL
     702{
     703  my ($db_path, $sql) = @_;
     704  if ($sql !~ /LIMIT 1/i)
     705  {
     706    $sql .= ' LIMIT 1';
     707  }
     708  my $value = getValueSQL($db_path, $sql);
     709  return split(/\|/,$value);
     710}
     711# /** getRecordSQL() **/
     712
     713
     714## @function
     715#
     716sub getRecordsSQL
     717{
     718  my ($db_path, $sql) = @_;
     719  my @records;
     720  my $raw_values = getValueSQL($db_path, $sql);
     721  foreach my $raw_record (split(/\r?\n/, $raw_values))
     722  {
     723    my @record = split(/\|/, $raw_record);
     724    push(@records, \@record);
     725  }
     726  return @records;
     727}
     728## getRecordsSQL()
     729
     730
     731## @function
     732#
     733sub getValueSQL
     734{
     735  my ($db_path, $sql) = @_;
     736  my $result = `sqlite3 "$db_path" "$sql" 2>&1`;
     737  if ($result =~ /Error:/)
     738  {
     739    die("Fatal Error!\nSQL:" . $sql . "\nMsg:" . $result);
     740  }
     741  # trim
     742  $result =~ s/^\s*|\s*$//g;
     743  return $result;
     744}
     745# /** getValueSQL() **/
     746
     747
     748## @function printReport()
     749#
     750sub printReport
     751{
     752  my ($db_path) = @_;
     753  # get listing of unique collection names sorted alphabetically.  While we are
     754  # at it, grab the number of documents processed from pretty much any entry
     755  # for this collection (they should all be the same, otherwise they would've
     756  # been repeated back in testing)
     757  my $collection_sql = 'SELECT collection, AVG(docs_processed) FROM tests GROUP BY collection';
     758  my @collection_records = &getRecordsSQL($db_path, $collection_sql);
     759  foreach my $collection_record (@collection_records)
     760  {
     761    my ($collection, $docs_processed) = @{$collection_record};
     762    # determine the size in bytes of the import directory
     763    my $collection_import_directory = $ENV{'GSDLHOME'} . '/collect/' . $collection . '/import';
     764    my $du_command = 'du -bsL "' . $collection_import_directory . '"';
     765    my $du_result = `$du_command`;
     766    my $size_in_bytes = 0;
     767    if ($du_result =~ /^(\d+)\s+/)
     768    {
     769      $size_in_bytes = $1;
     770    }
     771    # pretty print header block for dokuwiki
     772    &printReportHeader($collection, $docs_processed, $size_in_bytes);
     773    my @test_records = &getRecordsSQL($db_path, 'SELECT * FROM tests WHERE collection=\'' . $collection . '\' AND test_started > 0 ORDER BY test_started');
     774    my $number_of_tests = scalar(@test_records);
     775    my $total_b_elapsed = 0;
     776    my @b_elapsed_values;
     777    my $total_b_io_percent = 0;
     778    my @b_io_percent_values;
     779    my $total_b_other_percent = 0;
     780    my @b_other_percent_values;
     781    my $total_d_elapsed = 0;
     782    my @d_elapsed_values;
     783    my $total_d_io_percent = 0;
     784    my @d_io_percent_values;
     785    my $total_d_other_percent = 0;
     786    my @d_other_percent_values;
     787    foreach my $test_record (@test_records)
     788    {
     789      my ($the_collection, $the_test_run, $test_started, $docs_found, $docs_processed, $b_elapsed, $b_io, $b_other, $i_elapsed, $i_io, $i_other) = @{$test_record};
     790      my $b_io_percent = ($b_io / $b_elapsed) * 100;
     791      my $b_other_percent = ($b_other / $b_elapsed) * 100;
     792      my $d_elapsed = $i_elapsed - $b_elapsed;
     793      my $d_io = $i_io - $b_io;
     794      my $d_io_percent = ($d_io / $d_elapsed) * 100;
     795      my $d_other = $i_other - $b_other;
     796      my $d_other_percent = ($d_other / $d_elapsed) * 100;
     797      # add to running totals
     798      $total_b_elapsed += $b_elapsed;
     799      push(@b_elapsed_values, $b_elapsed);
     800      $total_b_io_percent += $b_io_percent;
     801      push(@b_io_percent_values, $b_io_percent);
     802      $total_b_other_percent += $b_other_percent;
     803      push(@b_other_percent_values, $b_other_percent);
     804      $total_d_elapsed += $d_elapsed;
     805      push(@d_elapsed_values, $d_elapsed);
     806      $total_d_io_percent += $d_io_percent;
     807      push(@d_io_percent_values, $d_io_percent);
     808      $total_d_other_percent += $d_other_percent;
     809      push(@d_other_percent_values, $d_other_percent);
     810    }
     811    # Averages
     812    my $b_elapsed_avg = $total_b_elapsed / $number_of_tests;
     813    my $b_io_percent_avg = $total_b_io_percent / $number_of_tests;
     814    my $b_other_percent_avg = $total_b_other_percent / $number_of_tests;
     815    my $d_elapsed_avg = $total_d_elapsed / $number_of_tests;
     816    my $d_io_percent_avg = $total_d_io_percent / $number_of_tests;
     817    my $d_other_percent_avg = $total_d_other_percent / $number_of_tests;
     818    # Standard Deviations
     819    my $b_elapsed_stddev = &calculateStandardDeviation($number_of_tests, $b_elapsed_avg, @b_elapsed_values);
     820    my $b_io_percent_stddev = &calculateStandardDeviation($number_of_tests, $b_io_percent_avg, @b_io_percent_values);
     821    my $b_other_percent_stddev = &calculateStandardDeviation($number_of_tests, $b_other_percent_avg, @b_other_percent_values);
     822    my $d_elapsed_stddev = &calculateStandardDeviation($number_of_tests, $d_elapsed_avg, @d_elapsed_values);
     823    my $d_io_percent_stddev = &calculateStandardDeviation($number_of_tests, $d_io_percent_avg, @d_io_percent_values);
     824    my $d_other_percent_stddev = &calculateStandardDeviation($number_of_tests, $d_other_percent_avg, @d_other_percent_values);
     825
     826    my $counter = 0;
     827    my $outlier_count = 0;
     828    foreach my $test_record (@test_records)
     829    {
     830      my ($the_collection, $the_test_run, $test_started, $docs_found, $docs_processed, $b_elapsed, $b_io, $b_other, $i_elapsed, $i_io, $i_other) = @{$test_record};
     831      $counter++;
     832      my $b_io_percent = ($b_io / $b_elapsed) * 100;
     833      my $b_other_percent = ($b_other / $b_elapsed) * 100;
     834      my $d_elapsed = $i_elapsed - $b_elapsed;
     835      my $d_io = $i_io - $b_io;
     836      my $d_io_percent = ($d_io / $d_elapsed) * 100;
     837      my $d_other = $i_other - $b_other;
     838      my $d_other_percent = ($d_other / $d_elapsed) * 100;
     839      print '|  ' . $counter . ' |  ' . $the_collection . '  |  ' . $docs_processed . ' |  ' . $size_in_bytes . ' |  ' . $test_started . ' |  ' . sprintf('%0.6f', $b_elapsed) . ' |  ' . sprintf('%0.1f', $b_io_percent) . '% |  ' . sprintf('%0.1f', $b_other_percent) . '% |  ' . sprintf('%0.6f', $d_elapsed) . ' |  ' . sprintf('%0.1f', $d_io_percent) . '% |  ' . sprintf('%0.1f', $d_other_percent) . '% |  ';
     840      if (&isOutlier($d_io_percent, $d_io_percent_avg, $d_io_percent_stddev))
     841      {
     842        print 'Yes';
     843        $outlier_count++;
     844      }
     845      else
     846      {
     847        print 'No';
     848      }
     849      print "  |\n";
     850    }
     851    print '^  Average |||||  ' . sprintf('%0.6f', $b_elapsed_avg) . ' |  ' . sprintf('%0.1f', $b_io_percent_avg) . '% |  ' . sprintf('%0.1f', $b_other_percent_avg) . '% |  ' .  sprintf('%0.6f', $d_elapsed_avg) . ' |  ' . sprintf('%0.1f', $d_io_percent_avg) . '% |  ' . sprintf('%0.1f', $d_other_percent_avg) . "% |\n";
     852    print '^  Standard Deviation |||||  ' . sprintf('%0.6f', $b_elapsed_stddev) . ' |  ' . sprintf('%0.1f', $b_io_percent_stddev) . '% |  ' . sprintf('%0.1f', $b_other_percent_stddev) . '% |  ' . sprintf('%0.6f', $d_elapsed_stddev) . ' |  ' . sprintf('%0.1f', $d_io_percent_stddev) . '% |  ' . sprintf('%0.1f', $d_other_percent_stddev) . '% |  ' . $outlier_count . " |\n";
     853    print "\n";
     854  }
     855}
     856## printReport()
     857
     858
     859## @function printReportHeader($collection)
     860#
     861sub printReportHeader
     862{
     863  my ($collection) = @_;
     864  print '==== "' . $collection . "\" Collection ====\n\n";
     865  print "^ Count  ^ Collection  ^ NumDocs  ^ Size  ^ Timestamp  ^  Base                    ^^^  Import                 ^^^ Outlier?  ^\n";
     866  print "^ :::    ^ :::         ^ :::      ^ :::   ^ :::        ^ Elapsed  ^  Syscalls      ^^ Elapsed  ^  Syscalls     ^^ :::       ^\n";
     867  print "^ :::    ^ :::         ^ :::      ^ :::   ^ :::        ^ :::      ^ I/O%  ^ Other%  ^ :::      ^ I/O%  ^ Other  ^ :::       ^\n";
     868}
     869## printReportHeader()
     870
     871
     872## @function calculateStandardDeviation($mean, $value1 ... $valueN)
     873#  Our sample size is the population so we can use the simplier standard deviation equation for an entire population (now, theres some rap lyrics)
     874#  stddev = squareroot ( sum ( square( value - mean ) ) / count )
     875sub calculateStandardDeviation
     876{
     877  my $population = shift(@_);
     878  my $mean = shift(@_);
     879  my $count = scalar(@_);
     880  my $total = 0;
     881  # sum of squares
     882  foreach my $value (@_)
     883  {
     884    my $subtotal = $value - $mean;
     885    $total += $subtotal * $subtotal;
     886  }
     887  if ($population == $count)
     888  {
     889    $total = $total / $count;
     890  }
     891  else
     892  {
     893    $total = $total / ($count - 1);
     894  }
     895  my $standard_deviation = sqrt($total);
     896  return $standard_deviation;
     897}
     898## calculateStandardDeviation()
     899
     900
     901## @function isOutlier($value, $mean, $stddev)
     902#
     903sub isOutlier
     904{
     905  my ($value, $mean, $stddev) = @_;
     906  my $min = $mean - (2 * $stddev);
     907  my $max = $mean + (2 * $stddev);
     908  return ($value <= $min || $max <= $value);
     909}
     910## isOutlier()
     911
    5129121;
Note: See TracChangeset for help on using the changeset viewer.