Context Navigation

← Previous Change
Next Change →

Changeset 28767 for gs2-extensions/parallel-building

Timestamp:

2013-12-18T10:21:53+13:00 (10 years ago)

Author:

jmt12

Message:

Drastically expanded the script to allow 1) a battery of imports backed by a database of tests, 2) printing of reports in dokuwiki format, and 3) calculation of statistical information (STDDEV) and outliers (although I don't do anything with that information... yet)

File:

: 1 edited

gs2-extensions/parallel-building/trunk/src/bin/script/import_with_io_metric.pl (modified) (14 diffs)

Legend:

: Unmodified
: Added
: Removed

gs2-extensions/parallel-building/trunk/src/bin/script/import_with_io_metric.pl

-              r28665
+              r28767
 use warnings;
 # Libraries
 use File::Path qw( remove_tree );
 use File::Temp qw( tempdir );
+use File::Path   qw( remove_tree );
+use File::Temp   qw( tempdir );
 use Getopt::Long;
+use Time::HiRes qw( gettimeofday tv_interval );
+use POSIX        qw( strftime );
+use Time::HiRes  qw( gettimeofday tv_interval );
 BEGIN
+{
   print "======================= Greenstone Import + I/O Metrics ======================\n\n";
+  print "\n======================= Greenstone Import + I/O Metrics ======================\n\n";
   if (!defined $ENV{'GSDLHOME'})
+  {
 …
+  }
+}
+print "Prepare and run a number of collection imports while recording I/O metrics.\n\n";
 # 1. Parse and sanitize arguments - the listing of syscalls whose duration we
 …
 # strace_analyzer.pl would use
 print " * Initializing... Done\n";
+my $machine_name = `hostname -s`;
+chomp($machine_name);
+$machine_name = ucfirst($machine_name);
+my $os_name = `lsb_release -i`;
+$os_name =~ s/^Distributor ID:\s+(.*)\r?\n$/$1/i;
+my $fs_name = `df -T $ENV{'GSDLHOME'}`;
+$fs_name =~ s/^.*(ext2|ext3|ext4|xfs|zfs).*$/$1/is;
+$fs_name = uc($fs_name);
+my $start_time = [gettimeofday()];
+my @collections;
+my $print_report = 0;
 our $strace_flags = '-f -q -s 256 -T -ttt';
 our $io_function_list = {
 …
                          'ftell'=>1,
                          'getdents'=>1,
                          #'ioctl'=>1, # John added
+                         'ioctl'=>1, # John added
                          'llseek'=>1,
                          'lockf'=>1,
 …
                          'write'=>1
                         };
+my $start_time = [gettimeofday()];
+my $collection = '';
+my $debug = 0;
+my $flush_delay = 3;
+GetOptions ('collection=s' => \$collection,
+our $debug = 0;
+our $flush_delay = 3;
+our $test_runs = 9;
+GetOptions ('collection=s' => \@collections,
             'debug' => \$debug,
+            'flushdelay=i' => \$flush_delay)
+            'flushdelay=i' => \$flush_delay,
+            'runs=i' => \$test_runs,
+            'report' => \$print_report)
 or &printUsage('Invalid or missing argument');
+if (!defined $collection || $collection eq '')
+{
+  &printUsage('Missing collection name');
+}
+my $collection_path = $ENV{'GSDLHOME'} . '/collect/' . $collection;
+if (!-d $collection_path)
+{
+  &printUsage('Collection not found: ' . $collection_path);
+}
+print '   - Greenstone:  ' . $ENV{'GSDLHOME'} . "\n";
+print '   - Collection:  ' . $collection . "\n";
+print '   - Flush Delay: ' . $flush_delay . "\n";
+print '   - Debug?       ' . ($debug ? 'Yes' : 'No') . "\n";
+if ($test_runs < 1)
+{
+  &printUsage('Test runs must be non-zeo');
+}
+my $db_name = 'strace_' . $machine_name . '_' . $os_name . '_' . $fs_name . '.sqlite3db';
+my $db_path = $ENV{'GSDLHOME'} . '/collect/' . $db_name;
+if (0 < $print_report)
+{
+  &printReport($db_path);
+  exit;
+}
+if (0 == scalar(@collections))
+{
+  &printUsage('Missing collection or collections');
+}
+print '   - Greenstone:   ' . $ENV{'GSDLHOME'} . "\n";
+print '   - Collections:  ' . join(',', @collections) . "\n";
+print '   - Test Runs:    ' . $test_runs . "\n";
+print '   - Flush Delay:  ' . $flush_delay . "\n";
+print '   - Debug?        ' . ($debug ? 'Yes' : 'No') . "\n";
 print "\n";
+# 2. Run dummy import command (empty import directory) within trace to
+# determine baseline
+&clearArchives();
+&flushDiskCache($flush_delay);
+print " * Running baseline collection import... ";
+my $dummy_dir = tempdir( CLEANUP => 1);
+my $command1 = 'strace ' . $strace_flags . ' import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1';
+my $debug_path1 = '';
+if ($debug)
+{
+  $debug_path1 = $collection_path . '/debug-baseline.tsv';
+}
+my ($baseline_duration, $baseline_io, $baseline_misc) = &parseStrace($command1, $collection_path, $debug_path1);
+print "Done\n";
+print '   - Duration:      ' . sprintf('%0.6f', $baseline_duration) . " seconds\n";
+print '   - SysCall Time:  ' . sprintf('%0.6f', $baseline_io + $baseline_misc) . " seconds\n";
+print '   - SysCall Prct:  ' . sprintf('%d', (($baseline_io + $baseline_misc) / $baseline_duration) * 100) . "%\n";
+print '   - I/O Time:      ' . sprintf('%0.6f', $baseline_io) . " seconds\n";
+print '   - I/O Percent:   ' . sprintf('%d', ($baseline_io / $baseline_duration) * 100) . "%\n";
+if ($debug)
+{
+  print "   - See 'debug-baseline.tsv' for raw data\n";
+}
+# 0. Create database and populate with tests (if necessary)
+if (!-f $db_path)
+{
+  print STDOUT ' * Creating database tables... ';
+  execSQL($db_path, 'CREATE TABLE IF NOT EXISTS tests (collection TEXT, test_run INTEGER, test_started INTEGER DEFAULT 0, docs_found INTEGER, docs_processed INTEGER, b_elapsed REAL DEFAULT 0, b_io REAL DEFAULT 0, b_other REAL DEFAULT 0, i_elapsed REAL DEFAULT 0, i_io REAL DEFAULT 0, i_other REAL DEFAULT 0, PRIMARY KEY (collection, test_run))');
+  print STDOUT "Done\n";
+}
+print ' * Populating database with test runs as necessary... ';
+my $new_test_count = 0;
+foreach my $collection (@collections)
+{
+  for (my $i = 1; $i <= $test_runs; $i++)
+  {
+    my $search_sql = "SELECT COUNT(*) FROM tests WHERE collection='" . $collection . "' AND test_run=" . $i;
+    my $test_count = &getValueSQL($db_path, $search_sql);
+    if (0 == $test_count)
+    {
+      $new_test_count++;
+      my $insert_sql = "INSERT INTO tests (collection, test_run) VALUES ('" . $collection . "'," . $i . ")";
+      &execSQL($db_path, $insert_sql);
+    }
+  }
+}
+print $new_test_count . " tests added\n";
 print "\n";
+# 3. Run normal import command within strace reading output line by line
+&clearArchives();
+&flushDiskCache($flush_delay);
+print " * Running import and tracking I/O metrics... ";
+my $command2 = 'strace ' . $strace_flags . ' import.pl -removeold "' . $collection . '" 2>&1';
+my $debug_path2 = '';
+if ($debug)
+{
+  $debug_path2 = $collection_path . '/debug-import.tsv';
+}
+my ($duration_import, $duration_io, $duration_misc, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2);
+print "Done\n";
+print '   - Import?        ' . ( $import_complete ? 'Completed' : 'Failed') . "\n";
+print '   - Found:         ' . $docs_found . ' document' . (($docs_processed > 1) ? 's' : '') . "\n";
+print '   - Processed:     ' . $docs_processed . ' document' . (($docs_processed > 1) ? 's' : '') . "\n";
+print '   - Duration:      ' . sprintf('%0.6f', $duration_import) . " seconds\n";
+print '   - SysCall Time:  ' . sprintf('%0.6f', $duration_io + $duration_misc) . " seconds\n";
+print '   - SysCall Prct:  ' . sprintf('%d', (($duration_io + $duration_misc) / $duration_import) * 100) . "%\n";
+print '   - I/O Duration:  ' . sprintf('%0.6f', $duration_io) . " seconds\n";
+print '   - I/O Percent:   ' . sprintf('%d', ($duration_io / $duration_import) * 100) . "%\n";
+print "   - See 'import.log' for Greenstone Import details\n";
+print "   - See 'strace.out' for STrace details\n";
+if ($debug)
+{
+  print "   - See 'debug-import.tsv' for raw data\n";
+}
+print "\n";
+# 4. Results
+print &makeHeader('Import Results', 78) . "\n\n";
+print " Import Duration:  " . sprintf('%0.6f', ($duration_import - $baseline_duration)) . " seconds\n";
+print " I/O Duration:     " . sprintf('%0.6f', ($duration_io - $baseline_io)) . " seconds\n";
+print " I/O Percentage:   " . sprintf('%d', ((($duration_io - $baseline_io)/($duration_import - $baseline_duration)) * 100)) . "%\n";
+print "\n";
+# Complete!
+# 1. Continue picking random tests and running them until there are none left
+my $total_test_count = &getValueSQL($db_path, 'SELECT COUNT(*) FROM tests');
+my $remaining_test_sql = 'SELECT COUNT(*) FROM tests WHERE test_started=0';
+my $remaining_test_count = getValueSQL($db_path, $remaining_test_sql);
+my $exit_file_path = $ENV{'GSDLHOME'} . '/collect/exit.now';
+while ($remaining_test_count ne "0" && !-f $exit_file_path)
+{
+  my $test_started = time;
+  my $now_string = strftime "%a %b %e %H:%M:%S %Y", localtime($test_started);
+  my $random_test_sql = 'SELECT collection, test_run FROM tests WHERE test_started=0 ORDER BY RANDOM() LIMIT 1';
+  my ($collection, $test_run) = &getRecordSQL($db_path, $random_test_sql);
+  print &makeHeader('Running test: ' . $collection . '#' . $test_run, 78) . "\n\n";
+  print " * Note: to gracefully exit create a file: <gsdlhome>/collect/exit.now\n";
+  print ' * Started: ' . $now_string . "\n";
+  my $found_result = 0;
+  do
+  {
+    my @results = &importWithStrace($collection);
+    # We don't allow negative results... just try again
+    if ($results[0] != $results[1])
+    {
+      print "Warning! Failed to process all the documents found - rerunning test...\n";
+    }
+    elsif (0 >= $results[1])
+    {
+      print "Warning! Failed to process any documents - rerunning test...\n";
+    }
+    # I'm not sure how this happens, but it seems on really fast imports they
+    # sometimes happen faster than the baseline import?!?
+    elsif ($results[5] < $results[2])
+    {
+      print "Warning! Baseline import took longer than actual import - rerunning test...\n";
+    }
+    elsif ($results[6] < $results[3])
+    {
+      print "Warning! Spent more time on I/O syscalls in baseline than during import - rerunning test...\n";
+    }
+    elsif ($results[7] < $results[4])
+    {
+      print "Warning! Spent more time on non-I/O syscalls in baseline than during import - rerunning test...\n";
+    }
+    # I'm also seeing a lot of negative percentages caused by the total delta
+    # time spend on io and other syscalls being more than the delta time
+    # between import and baseline durations
+    elsif (($results[5] - $results[2]) < (($results[6] - $results[3]) + ($results[7] - $results[4])))
+    {
+      print "Warning! Time difference between baseline and import durations less than total time differences between io and misc system calls - rerunning test...\n";
+    }
+    else
+    {
+      my $update_sql = 'UPDATE tests SET test_started=' . $test_started . ', ';
+      $update_sql .= 'docs_found=' . $results[0] . ', docs_processed=' . $results[1] . ', ';
+      $update_sql .= 'b_elapsed=' . sprintf('%0.6f', $results[2]) . ', ';
+      $update_sql .= 'b_io=' . sprintf('%0.6f', $results[3]) . ', ';
+      $update_sql .= 'b_other=' . sprintf('%0.6f', $results[4]) . ', ';
+      $update_sql .= 'i_elapsed=' . sprintf('%0.6f', $results[5]) . ', ';
+      $update_sql .= 'i_io=' . sprintf('%0.6f', $results[6]) . ', ';
+      $update_sql .= 'i_other=' . sprintf('%0.6f', $results[7]) . ' ';
+      $update_sql .= 'WHERE collection=\'' . $collection . '\' AND test_run=' . $test_run;
+      execSQL($db_path, $update_sql);
+      $found_result = 1;
+    }
+  }
+  while(0 == $found_result);
+  # Repeat until we have exhausted pending tests
+  $remaining_test_count = &getValueSQL($db_path, $remaining_test_sql);
+  my $x = $total_test_count - $remaining_test_count;
+  print &makeHeader('Test Complete: ' . $collection . '#' . $test_run, 78) . "\n\n";
+  print ' * ' . sprintf("%.0f",(($x/$total_test_count)*100)) . "% of all tests complete!\n";
+  print ' * ' . $remaining_test_count . " tests remaining\n";
+  print "\n";
+}
+if (-f $exit_file_path)
+{
+  unlink($exit_file_path);
+}
+# 2. Complete!
 my $end_time = [gettimeofday()];
 my $duration = tv_interval($start_time, $end_time);
 …
 exit;
+################################################################################
+## @function importWithStrace($collection)
+#
+# Run one complete test of a collection: a baseline import against an empty
+# import directory followed by the real import. Both commands are run with
+# strace and handed to parseStrace(), and the timings of each run and the
+# difference between them are printed.
+#
+# Reads the file level settings $flush_delay, $strace_flags and $debug.
+#
+# @param $collection The name of a collection found in <GSDLHOME>/collect
+# @return A list of eight values: (documents found, documents processed,
+#         baseline duration, baseline I/O time, baseline other syscall time,
+#         import duration, import I/O time, import other syscall time)
+#
+sub importWithStrace
+{
+  my ($collection) = @_;
+  my $collection_path = $ENV{'GSDLHOME'} . '/collect/' . $collection;
+  if (!-d $collection_path)
+  {
+    &printError('Collection not found: ' . $collection_path);
+  }
+  # Percentages are reported as zero, rather than dying with "Illegal division
+  # by zero", when a trace yields no elapsed time or when both runs took
+  # exactly the same time
+  my $percent_of = sub
+  {
+    my ($part, $whole) = @_;
+    return ($whole == 0) ? 0 : ($part / $whole) * 100;
+  };
+  # 2. Run dummy import command (empty import directory) within trace to
+  # determine baseline
+  &clearExistingDirectory($collection_path, 'archives', 'cached', 'logs', 'tmp');
+  &flushDiskCache($flush_delay);
+  print " * Running baseline collection import... ";
+  my $dummy_dir = tempdir( CLEANUP => 1);
+  my $command1 = 'strace ' . $strace_flags . ' import.pl -removeold -importdir "' . $dummy_dir .'" "' . $collection . '" 2>&1';
+  my $debug_path1 = '';
+  if ($debug)
+  {
+    $debug_path1 = $collection_path . '/debug-baseline.tsv';
+  }
+  my ($baseline_duration, $baseline_io, $baseline_misc) = &parseStrace($command1, $collection_path, $debug_path1);
+  print "Done\n";
+  print '   - Duration:      ' . sprintf('%0.6f', $baseline_duration) . " seconds\n";
+  print "   - System Calls Breakdown:\n";
+  print '     - I/O Duration:  ' . sprintf('%0.6f', $baseline_io) . " seconds\n";
+  print '     - I/O Percent:   ' . sprintf('%0.1f', $percent_of->($baseline_io, $baseline_duration)) . "%\n";
+  print '     - Other Duratn:  ' . sprintf('%0.6f', $baseline_misc) . " seconds\n";
+  print '     - Other Percnt:  ' . sprintf('%0.1f', $percent_of->($baseline_misc, $baseline_duration)) . "%\n";
+  if ($debug)
+  {
+    print "   - See 'debug-baseline.tsv' for raw data\n";
+  }
+  # The empty import directory is no longer needed once the baseline is done
+  if (-d $dummy_dir)
+  {
+    rmdir($dummy_dir);
+  }
+  # 3. Run normal import command within strace reading output line by line
+  &clearExistingDirectory($collection_path, 'archives', 'cached', 'logs', 'tmp');
+  &flushDiskCache($flush_delay);
+  print " * Running import and tracking I/O metrics... ";
+  my $command2 = 'strace ' . $strace_flags . ' import.pl -removeold "' . $collection . '" 2>&1';
+  my $debug_path2 = '';
+  if ($debug)
+  {
+    $debug_path2 = $collection_path . '/debug-import.tsv';
+  }
+  my ($import_duration, $import_io, $import_misc, $import_complete, $docs_found, $docs_processed) = &parseStrace($command2, $collection_path, $debug_path2);
+  print "Done\n";
+  print '   - Import?        ' . ( $import_complete ? 'Completed' : 'Failed') . "\n";
+  # Each count is pluralised on its own value (the found count used to be
+  # pluralised according to the processed count)
+  print '   - Found:         ' . $docs_found . ' document' . (($docs_found != 1) ? 's' : '') . "\n";
+  print '   - Processed:     ' . $docs_processed . ' document' . (($docs_processed != 1) ? 's' : '') . "\n";
+  print '   - Duration:      ' . sprintf('%0.6f', $import_duration) . " seconds\n";
+  print "   - System Calls Breakdown:\n";
+  print '     - I/O Duration:  ' . sprintf('%0.6f', $import_io) . " seconds\n";
+  print '     - I/O Percent:   ' . sprintf('%0.1f', $percent_of->($import_io, $import_duration)) . "%\n";
+  print '     - Other Duratn:  ' . sprintf('%0.6f', $import_misc) . " seconds\n";
+  print '     - Other Percnt:  ' . sprintf('%0.1f', $percent_of->($import_misc, $import_duration)) . "%\n";
+  print "   - See 'import.log' for Greenstone Import details\n";
+  print "   - See 'strace.out' for STrace details\n";
+  if ($debug)
+  {
+    print "   - See 'debug-import.tsv' for raw data\n";
+  }
+  print "\n";
+  # 4. Results - the cost of the import itself is the difference between the
+  # real import and the baseline
+  my $delta_duration = $import_duration - $baseline_duration;
+  my $delta_io = $import_io - $baseline_io;
+  my $delta_io_percent = $percent_of->($delta_io, $delta_duration);
+  my $delta_misc = $import_misc - $baseline_misc;
+  my $delta_misc_percent = $percent_of->($delta_misc, $delta_duration);
+  print &makeHeader('Import Results', 78) . "\n\n";
+  print ' Import Duration:  ' . sprintf('%0.6f', $delta_duration) . " seconds\n";
+  print " System Calls Breakdown:\n";
+  print ' - I/O Duration:  ' . sprintf('%0.6f', $delta_io) . " seconds\n";
+  print ' - I/O Percent:   ' . sprintf('%0.1f', $delta_io_percent) . "%\n";
+  print ' - Other Duratn:  ' . sprintf('%0.6f', $delta_misc) . " seconds\n";
+  print ' - Other Percnt:  ' . sprintf('%0.1f', $delta_misc_percent) . "%\n";
+  print "\n";
+  return ($docs_found, $docs_processed, $baseline_duration, $baseline_io, $baseline_misc, $import_duration, $import_io, $import_misc);
+}
+## importWithStrace()
+## @function makeHeader($msg, [$length])
+#
+# Create a header string by centering a message between runs of '='
+# characters, for example "=== abc ===".
+#
+# @param $msg The message to center as a string
+# @param $length The desired length of the returned string - defaults to 79
+# @return The message surrounded by single spaces and padded with '=' to
+#         exactly $length characters. When the padding cannot be split evenly
+#         the extra '=' is placed on the left. A message too long to fit is
+#         returned surrounded by the spaces but without any padding.
+#
+sub makeHeader
+{
+  my ($msg, $length) = @_;
+  if (!defined $length)
+  {
+    $length = 79; # 80 with newline
+  }
+  # The two spaces separating the message from the filler count towards the
+  # total length (ignoring them made every header too long)
+  my $filler_length = $length - length($msg) - 2;
+  if ($filler_length < 0)
+  {
+    $filler_length = 0;
+  }
+  my $right_length = int($filler_length / 2);
+  my $left_length = $filler_length - $right_length;
+  return ('=' x $left_length) . ' ' . $msg . ' ' . ('=' x $right_length);
+}
+## makeHeader() ##
 ## @function
+#
+# Wrap a message in spaces (an odd length message gets a second trailing space
+# so the wrapped text is always of even length) and then surround it with
+# matching '=' characters until it is at least $length characters long.
+#
+# @param $msg The message to place in the header
+# @param $length The minimum length of the returned string
+# @return The padded header string
+sub makeHeader
+{
+  my ($msg, $length) = @_;
+  my $is_odd = (length($msg) % 2 == 1);
+  my $header = ' ' . $msg . ($is_odd ? '  ' : ' ');
+  # grow by one '=' on each side per pass
+  until (length($header) >= $length)
+  {
+    $header = '=' . $header . '=';
+  }
+  return $header;
+}
+## makeHeader() ##
+## @function
+#
+sub clearArchives
+{
+  # Delete any archives directory left inside the collection (located using
+  # the file level $collection_path) so that it doesn't get factored in the
+  # IO costs
+  my $stale_archives = join('/', $collection_path, 'archives');
+  if (-d $stale_archives)
+  {
+    print " * Deleting existing archives directory... ";
+    remove_tree($stale_archives);
+    print "Done\n";
+  }
+}
+## clearArchives()
+#  Remove named Greenstone directory so it doesn't get factored in I/O costs
+#
+# @param $collection_path The directory containing the directories to remove
+# @param ... The names of the directories to remove - any that do not exist
+#        are silently ignored
+sub clearExistingDirectory
+{
+  my ($collection_path, @dirnames) = @_;
+  foreach my $dirname (@dirnames)
+  {
+    my $target = join('/', $collection_path, $dirname);
+    # nothing to report for directories that aren't there
+    next unless (-d $target);
+    print ' * Deleting existing ' . $dirname . ' directory... ';
+    remove_tree($target);
+    print "Done\n";
+  }
+}
+## clearExistingDirectory()
 ## @function
 …
     $flush_delay--;
+  }
   print "Done\n\n";
+  print "Done\n";
+}
 ## flushDiskCache()
 …
     my $gslog_path = $logs_path . '/import.log';
     open(GSLOGOUT, '>:utf8', $gslog_path) or die("Error! Failed to open file for writing: " . $gslog_path);
     $logging_enabled = 1;
+    #$logging_enabled = 1;
+  }
 …
         print STRACEOUT $line;
+      }
+      if ($line =~ /^[^\d\[]+(.*)/)
+      {
+        $line = $1;
+      }
       # we may have a line that unfortunately includes newlines in its arguments list
       # - watch out for unfinished syscalls that will be resumed later
 …
         # limit the resumed calls duration to, at most, the time difference
         # between this syscall and the next. This is highly inaccurate, of
         # course, as it includes time spent in userspace but is significantly
         # better than a elasped duration several times longer than the syscall
         # actually took.
+        # course, as it excludes time spent before the suspension but then
+        # includes time spent in userspace, but it is significantly better than
+        # an elapsed duration several times longer than the syscall actually took.
         if ($next_line =~ /^.*?(\d+\.\d+)/)
+        {
 …
         if ($syscall eq 'write' && $args_prefix =~ /^[12],/)
+        {
           print "\n\nPending write pid:$pid line:|" . $line . "|\n";
+          #print "\n\nPending write pid:$pid line:|" . $line . "|\n";
           if (!defined $interrupted_stacks->{$pid})
+          {
 …
+      }
       # - exit_group never has a duration
       elsif ($line =~ /^(\d+\.\d+)\s+exit_group\((\d+)\)\s+=\s+\?$/)
+      elsif ($line =~ /^(\d+\.\d+)\s+(_exit|exit_group)\((\d+)\)\s+=\s+\?$/)
+      {
         my $timestamp = $1;
+        my $exit_value = $2;
+        my $exit_function = $2;
+        my $exit_value = $3;
         $end_timestamp = $timestamp;
         if ($debug)
 …
           print TSVOUT sprintf("%0.6f", ($timestamp - $start_timestamp)) . "\t";
           print TSVOUT $pid . "\t";
           print TSVOUT "exit_group\t";
+          print TSVOUT $exit_function . "\t";
           print TSVOUT "0.000000\t";
           print TSVOUT sprintf("%0.6f", $misc_duration) . "\t";
 …
+}
+## @function execSQL($db_path, $sql)
+#
+# Run a statement for its effect on the database. This is a thin wrapper
+# around getValueSQL() for callers with no interest in any output.
+#
+# @param $db_path Path of the database to run the statement against
+# @param $sql The statement to run
+#
+sub execSQL
+{
+  my ($database, $statement) = @_;
+  # hand straight over to getValueSQL - callers ignore whatever comes back
+  return getValueSQL($database, $statement);
+}
+# /** execSQL() **/
+## @function getRecordSQL($db_path, $sql)
+#
+# Run a query and return the columns of the first record it produces.
+#
+# @param $db_path Path of the database to query
+# @param $sql The query to run - ' LIMIT 1' is appended when the query has no
+#        LIMIT clause of its own
+# @return The '|' separated column values of the first record as a list (an
+#         empty list when the query produced no output)
+#
+sub getRecordSQL
+{
+  my ($db_path, $sql) = @_;
+  # Look for any LIMIT clause rather than the text 'LIMIT 1', which also
+  # matched 'LIMIT 10' and so let several records through
+  if ($sql !~ /\bLIMIT\s+\d+/i)
+  {
+    $sql .= ' LIMIT 1';
+  }
+  my $value = getValueSQL($db_path, $sql);
+  # Only ever split the first record, whatever limit the query carried
+  my ($first_record) = split(/\r?\n/, $value);
+  if (!defined $first_record)
+  {
+    $first_record = '';
+  }
+  return split(/\|/, $first_record);
+}
+# /** getRecordSQL() **/
+## @function getRecordsSQL($db_path, $sql)
+#
+# Run a query and return every record it produces.
+#
+# @param $db_path Path of the database to query
+# @param $sql The query to run
+# @return A list holding one array reference per line of output, each
+#         containing that line's '|' separated column values
+#
+sub getRecordsSQL
+{
+  my ($db_path, $sql) = @_;
+  my $output = getValueSQL($db_path, $sql);
+  # one record per line, one column per '|' separated field
+  my @rows = map { [ split(/\|/, $_) ] } split(/\r?\n/, $output);
+  return @rows;
+}
+## getRecordsSQL()
+## @function getValueSQL($db_path, $sql)
+#
+# Run a single SQL statement using the sqlite3 command line tool and return
+# whatever it prints.
+#
+# The statement is passed to sqlite3 as an argument of its own rather than
+# being interpolated into a shell command line, so quotes, '$' and backticks
+# in the SQL reach the database untouched.
+#
+# @param $db_path Path of the database to run the statement against
+# @param $sql The statement to run
+# @return The combined output of the command with leading and trailing
+#         whitespace removed
+# Dies if sqlite3 exits with a non-zero status or its output contains 'Error:'
+#
+sub getValueSQL
+{
+  my ($db_path, $sql) = @_;
+  require IPC::Open3;
+  my ($child_in, $child_out);
+  # A false third argument makes the error stream of the child share the
+  # handle of its output, as '2>&1' did
+  my $pid = IPC::Open3::open3($child_in, $child_out, undef, 'sqlite3', $db_path, $sql);
+  close($child_in);
+  my $result = do { local $/; <$child_out> };
+  if (!defined $result)
+  {
+    $result = '';
+  }
+  close($child_out);
+  waitpid($pid, 0);
+  my $exit_status = $? >> 8;
+  # Not every failure is reported with the text 'Error:' so check the exit
+  # status as well
+  if (0 != $exit_status || $result =~ /Error:/)
+  {
+    die("Fatal Error!\nSQL:" . $sql . "\nMsg:" . $result);
+  }
+  # trim
+  $result =~ s/^\s+|\s+$//g;
+  return $result;
+}
+# /** getValueSQL() **/
+## @function printReport($db_path)
+#
+# Print, as dokuwiki tables, the results of every completed test in the
+# database. There is one table per collection, each finishing with the
+# average and standard deviation of every column and a count of the outliers.
+# Collections without any completed tests are skipped with a warning on
+# STDERR.
+#
+# @param $db_path Path of the database holding the 'tests' table
+#
+sub printReport
+{
+  my ($db_path) = @_;
+  # Percentages are reported as zero rather than dying with "Illegal division
+  # by zero" should a test have recorded no elapsed time
+  my $percent_of = sub
+  {
+    my ($part, $whole) = @_;
+    return ($whole == 0) ? 0 : ($part / $whole) * 100;
+  };
+  # the columns that are averaged, in the order they appear in the table
+  my @metrics = ('b_elapsed', 'b_io_percent', 'b_other_percent', 'd_elapsed', 'd_io_percent', 'd_other_percent');
+  # get listing of unique collection names.  While we are at it, grab the
+  # number of documents processed from pretty much any entry for this
+  # collection (they should all be the same, otherwise they would've been
+  # repeated back in testing)
+  my $collection_sql = 'SELECT collection, AVG(docs_processed) FROM tests GROUP BY collection';
+  my @collection_records = &getRecordsSQL($db_path, $collection_sql);
+  foreach my $collection_record (@collection_records)
+  {
+    my ($collection, $docs_processed) = @{$collection_record};
+    my @test_records = &getRecordsSQL($db_path, 'SELECT * FROM tests WHERE collection=\'' . $collection . '\' AND test_started > 0 ORDER BY test_started');
+    my $number_of_tests = scalar(@test_records);
+    # without at least one completed test there is nothing to average
+    if (0 == $number_of_tests)
+    {
+      print STDERR 'Warning! No completed tests found for collection: ' . $collection . "\n";
+      next;
+    }
+    # determine the size in bytes of the import directory (du is run directly,
+    # not through a shell, so the collection name needs no quoting)
+    my $collection_import_directory = $ENV{'GSDLHOME'} . '/collect/' . $collection . '/import';
+    my $size_in_bytes = 0;
+    if (open(my $du_handle, '-|', 'du', '-bsL', $collection_import_directory))
+    {
+      my $du_result = <$du_handle>;
+      close($du_handle);
+      if (defined $du_result && $du_result =~ /^(\d+)\s+/)
+      {
+        $size_in_bytes = $1;
+      }
+    }
+    # pretty print header block for dokuwiki
+    &printReportHeader($collection, $docs_processed, $size_in_bytes);
+    # First pass: derive the figures reported for each test, remembering them
+    # both by row (for printing) and by column (for the statistics)
+    my @rows;
+    my %values_of = map { $_ => [] } @metrics;
+    foreach my $test_record (@test_records)
+    {
+      my ($the_collection, undef, $test_started, undef, $docs_processed, $b_elapsed, $b_io, $b_other, $i_elapsed, $i_io, $i_other) = @{$test_record};
+      # the cost of the import itself is the import less the baseline
+      my $d_elapsed = $i_elapsed - $b_elapsed;
+      my %row = ('collection' => $the_collection,
+                 'docs_processed' => $docs_processed,
+                 'test_started' => $test_started,
+                 'b_elapsed' => $b_elapsed,
+                 'b_io_percent' => $percent_of->($b_io, $b_elapsed),
+                 'b_other_percent' => $percent_of->($b_other, $b_elapsed),
+                 'd_elapsed' => $d_elapsed,
+                 'd_io_percent' => $percent_of->($i_io - $b_io, $d_elapsed),
+                 'd_other_percent' => $percent_of->($i_other - $b_other, $d_elapsed));
+      push(@rows, \%row);
+      foreach my $metric (@metrics)
+      {
+        push(@{$values_of{$metric}}, $row{$metric});
+      }
+    }
+    # Averages and Standard Deviations
+    my %avg_of;
+    my %stddev_of;
+    foreach my $metric (@metrics)
+    {
+      my $total = 0;
+      foreach my $value (@{$values_of{$metric}})
+      {
+        $total += $value;
+      }
+      $avg_of{$metric} = $total / $number_of_tests;
+      $stddev_of{$metric} = &calculateStandardDeviation($number_of_tests, $avg_of{$metric}, @{$values_of{$metric}});
+    }
+    # Second pass: print a table row per test, flagging outliers based on the
+    # percentage of import time spent on I/O
+    my $counter = 0;
+    my $outlier_count = 0;
+    foreach my $row (@rows)
+    {
+      $counter++;
+      print '|  ' . $counter . ' |  ' . $row->{'collection'} . '  |  ' . $row->{'docs_processed'} . ' |  ' . $size_in_bytes . ' |  ' . $row->{'test_started'} . ' |  ' . sprintf('%0.6f', $row->{'b_elapsed'}) . ' |  ' . sprintf('%0.1f', $row->{'b_io_percent'}) . '% |  ' . sprintf('%0.1f', $row->{'b_other_percent'}) . '% |  ' . sprintf('%0.6f', $row->{'d_elapsed'}) . ' |  ' . sprintf('%0.1f', $row->{'d_io_percent'}) . '% |  ' . sprintf('%0.1f', $row->{'d_other_percent'}) . '% |  ';
+      if (&isOutlier($row->{'d_io_percent'}, $avg_of{'d_io_percent'}, $stddev_of{'d_io_percent'}))
+      {
+        print 'Yes';
+        $outlier_count++;
+      }
+      else
+      {
+        print 'No';
+      }
+      print "  |\n";
+    }
+    print '^  Average |||||  ' . sprintf('%0.6f', $avg_of{'b_elapsed'}) . ' |  ' . sprintf('%0.1f', $avg_of{'b_io_percent'}) . '% |  ' . sprintf('%0.1f', $avg_of{'b_other_percent'}) . '% |  ' .  sprintf('%0.6f', $avg_of{'d_elapsed'}) . ' |  ' . sprintf('%0.1f', $avg_of{'d_io_percent'}) . '% |  ' . sprintf('%0.1f', $avg_of{'d_other_percent'}) . "% |\n";
+    print '^  Standard Deviation |||||  ' . sprintf('%0.6f', $stddev_of{'b_elapsed'}) . ' |  ' . sprintf('%0.1f', $stddev_of{'b_io_percent'}) . '% |  ' . sprintf('%0.1f', $stddev_of{'b_other_percent'}) . '% |  ' . sprintf('%0.6f', $stddev_of{'d_elapsed'}) . ' |  ' . sprintf('%0.1f', $stddev_of{'d_io_percent'}) . '% |  ' . sprintf('%0.1f', $stddev_of{'d_other_percent'}) . '% |  ' . $outlier_count . " |\n";
+    print "\n";
+  }
+}
+## printReport()
+## @function printReportHeader($collection)
+#
+# Print the heading for a collection followed by the three header rows of its
+# results table. Any further arguments are ignored.
+#
+# @param $collection The collection name to show in the heading
+#
+sub printReportHeader
+{
+  my ($collection) = @_;
+  my @table_header = ('^ Count  ^ Collection  ^ NumDocs  ^ Size  ^ Timestamp  ^  Base                    ^^^  Import                 ^^^ Outlier?  ^',
+                      '^ :::    ^ :::         ^ :::      ^ :::   ^ :::        ^ Elapsed  ^  Syscalls      ^^ Elapsed  ^  Syscalls     ^^ :::       ^',
+                      '^ :::    ^ :::         ^ :::      ^ :::   ^ :::        ^ :::      ^ I/O%  ^ Other%  ^ :::      ^ I/O%  ^ Other  ^ :::       ^');
+  # the heading is separated from the table by a blank line
+  print '==== "' . $collection . "\" Collection ====\n\n";
+  foreach my $header_row (@table_header)
+  {
+    print $header_row . "\n";
+  }
+}
+## printReportHeader()
+## @function calculateStandardDeviation($population, $mean, $value1 ... $valueN)
+#
+#  When the values are the entire population (there are exactly $population
+#  of them) the simpler equation for an entire population is used:
+#    stddev = squareroot ( sum ( square( value - mean ) ) / count )
+#  otherwise the values are treated as a sample and the sum is divided by
+#  (count - 1) instead.
+#
+# @param $population The size of the entire population
+# @param $mean The mean of the values
+# @param $value1 ... $valueN The values themselves
+# @return The standard deviation, or 0 when there are too few values for one
+#         to be calculated (none at all, or a sample of just one)
+#
+sub calculateStandardDeviation
+{
+  my ($population, $mean, @values) = @_;
+  my $count = scalar(@values);
+  my $divisor = ($population == $count) ? $count : ($count - 1);
+  # guard against "Illegal division by zero"
+  if ($divisor <= 0)
+  {
+    return 0;
+  }
+  # sum of squares
+  my $total = 0;
+  foreach my $value (@values)
+  {
+    my $difference = $value - $mean;
+    $total += $difference * $difference;
+  }
+  return sqrt($total / $divisor);
+}
+## calculateStandardDeviation()
+## @function isOutlier($value, $mean, $stddev)
+#
+# Determine whether a value lies more than two standard deviations from the
+# mean.
+#
+# @param $value The value to test
+# @param $mean The mean of the values it belongs to
+# @param $stddev The standard deviation of those values
+# @return True if the value is an outlier, false otherwise
+#
+sub isOutlier
+{
+  my ($value, $mean, $stddev) = @_;
+  # Strictly further than two deviations away - an inclusive test flags every
+  # value (each being equal to the mean) when the deviation is zero, as it is
+  # for a single test run
+  return (abs($value - $mean) > (2 * $stddev));
+}
+## isOutlier()
 ;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 28767 for gs2-extensions/parallel-building

Legend:

gs2-extensions/parallel-building/trunk/src/bin/script/import_with_io_metric.pl

Download in other formats: